torch-audit 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torch_audit-0.2.0/PKG-INFO +251 -0
- torch_audit-0.2.0/README.md +215 -0
- torch_audit-0.2.0/pyproject.toml +65 -0
- torch_audit-0.2.0/src/torch_audit/__init__.py +16 -0
- torch_audit-0.2.0/src/torch_audit/callbacks.py +51 -0
- torch_audit-0.2.0/src/torch_audit/core/__init__.py +2 -0
- torch_audit-0.2.0/src/torch_audit/core/auditor.py +268 -0
- torch_audit-0.2.0/src/torch_audit/core/config.py +25 -0
- torch_audit-0.2.0/src/torch_audit/core/issue.py +18 -0
- torch_audit-0.2.0/src/torch_audit/core/reporter.py +73 -0
- torch_audit-0.2.0/src/torch_audit/core/validator.py +47 -0
- torch_audit-0.2.0/src/torch_audit/modules/__init__.py +3 -0
- torch_audit-0.2.0/src/torch_audit/modules/cv/__init__.py +0 -0
- torch_audit-0.2.0/src/torch_audit/modules/cv/images.py +100 -0
- torch_audit-0.2.0/src/torch_audit/modules/cv/layers.py +97 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/__init__.py +0 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/activations.py +116 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/gradients.py +79 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/graph.py +126 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/hardware.py +195 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/hygiene.py +104 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/optimizer_config.py +100 -0
- torch_audit-0.2.0/src/torch_audit/modules/general/stability.py +72 -0
- torch_audit-0.2.0/src/torch_audit/modules/nlp/__init__.py +0 -0
- torch_audit-0.2.0/src/torch_audit/modules/nlp/structure.py +89 -0
- torch_audit-0.2.0/src/torch_audit/modules/nlp/tokenization.py +144 -0

+++ torch_audit-0.2.0/PKG-INFO
@@ -0,0 +1,251 @@
Metadata-Version: 2.3
Name: torch-audit
Version: 0.2.0
Summary: The Linter for PyTorch: Detects silent training bugs.
License: MIT
Keywords: pytorch,audit,debugging,linter,deep-learning
Author: Roman Malkiv
Author-email: malkiv.roman@gmail.com
Requires-Python: >=3.8,<4.0
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Provides-Extra: all
Provides-Extra: hf
Provides-Extra: lightning
Requires-Dist: accelerate (>=0.20.0) ; extra == "hf" or extra == "all"
Requires-Dist: datasets (>=2.10.0) ; extra == "hf" or extra == "all"
Requires-Dist: lightning (>=2.0.0) ; extra == "lightning" or extra == "all"
Requires-Dist: numpy (>=1.20.0) ; extra == "all"
Requires-Dist: rich (>=12.0.0)
Requires-Dist: torch (>=1.10.0)
Requires-Dist: transformers (>=4.30.0) ; extra == "hf" or extra == "all"
Project-URL: Homepage, https://github.com/RMalkiv/torch-audit
Project-URL: Repository, https://github.com/RMalkiv/torch-audit
Description-Content-Type: text/markdown

+++ torch_audit-0.2.0/README.md
@@ -0,0 +1,215 @@
# ๐ฅ torch-audit
### The Linter for PyTorch Models

[![PyPI](https://img.shields.io/pypi/v/torch-audit)](https://pypi.org/project/torch-audit/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Python](https://img.shields.io/pypi/pyversions/torch-audit)](https://www.python.org/downloads/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

**torch-audit** is a "check engine light" for your Deep Learning training loop. It detects silent bugs that don't crash your code but ruin your training or waste compute.

- ๐ฅ๏ธ **Hardware Efficiency:** Detects slow memory layouts (NHWC vs NCHW), mixed-precision failures, and tensor core misalignment.
- ๐งช **Data Integrity:** Catches broken attention masks, CV layout bugs, and silent NaN/Inf propagation.
- ๐ **Training Stability:** Identifies exploding gradients, bad optimizer config (Adam vs AdamW), and "dead" neurons.
- ๐ง **Graph Logic:** Identifies DDP-unsafe "Zombie" layers and redundant computations (e.g., Bias before BatchNorm).
- ๐ง **Domain Awareness:** Deep inspection for **NLP** (Padding waste, Tokenizer quality) and **CV** (Dead filters, Redundant biases).

---

## ๐ฆ Installation

Install the standard version (lightweight):
```bash
pip install torch-audit
```

### Optional Integrations
```bash
# For PyTorch Lightning support
pip install "torch-audit[lightning]"

# For Hugging Face Transformers support
pip install "torch-audit[hf]"

# For everything
pip install "torch-audit[all]"
```

## ๐ Quick Start
You have two ways to use `torch-audit`: the **Decorator** (easiest) or the **Context Manager** (most control).

### The Decorator Method (Recommended)
```python
import torch
from torch_audit import Auditor, AuditConfig

# 1. Setup Auditor (audits every 1000 steps)
config = AuditConfig(interval=1000)
auditor = Auditor(model, optimizer, config=config)

# 2. Static Audit (run once before training)
# Checks architecture, unused layers, and weight initialization
auditor.audit_static()

# 3. Training Loop
# The decorator handles hooks, data auditing, and error reporting automatically.
@auditor.audit_step
def train_step(batch, targets):
    optimizer.zero_grad()
    pred = model(batch)
    loss = criterion(pred, targets)
    loss.backward()
    optimizer.step()

for batch, targets in dataloader:
    train_step(batch, targets)
```
### The Context Manager Method
```python
# Training Loop
for batch, target in dataloader:
    # Manual data check (optional but recommended)
    auditor.audit_data(batch)

    # Dynamic checks (Gradients, Activations, Stability)
    with auditor.audit_dynamic():
        pred = model(batch)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
```
### The Output
When a bug is found, `torch-audit` prints a structured report. It supports **Rich Console** tables (default) or **JSON/System Logs** for production.

```text
๐ Audit Running (Step 5000)...
๐ก Batch size is tiny (4). BatchNorm is unstable. (in Input Batch)

โ ๏ธ Audit Report (Step 5000)
โโโโโโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโณโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ Type              โ Layer         โ Message                                      โ
โกโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฉ
โ ๐ด DDP Safety     โ ghost_layer   โ Layer defined but NEVER called (Zombie).     โ
โ ๐ด Data Integrity โ Input Batch   โ Attention Mask mismatch on 50 tokens.        โ
โ ๐ก Tensor Core    โ fc1           โ Dims (127->64) not divisible by 8.           โ
โ ๐ก Stability      โ Global        โ Optimizer epsilon (1e-08) too low for AMP.   โ
โ ๐ต CV Opt         โ conv1         โ Bias=True followed by BatchNorm (Redundant). โ
โโโโโโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
```
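
The table above comes from the default Rich console reporter. If you prefer to be explicit, `RichConsoleReporter` is exported at the top level and can be wired the same way as the `LogReporter` shown under Production Logging below (a minimal sketch, assuming a no-argument constructor):

```python
from torch_audit import Auditor, RichConsoleReporter

# Explicitly select the Rich table output (the default behavior).
auditor = Auditor(model, reporters=[RichConsoleReporter()])
```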

## ๐ Runnable Demos
Don't just take our word for it! Break things yourself! We have prepared sabotaged scripts that trigger auditor warnings.

Check out the `examples/` folder:
- `python examples/demo_general.py` (General hardware/optimizer issues)
- `python examples/demo_nlp.py` (NLP & Tokenizer bugs)
- `python examples/demo_cv.py` (Computer Vision bugs)
- `python examples/demo_lightning.py` (PyTorch Lightning integration)
- `python examples/demo_hf.py` (Hugging Face integration)
- `python examples/demo_accelerate.py` (Accelerate integration)

## ๐งฉ Integrations
We support the ecosystem you already use.

### โก PyTorch Lightning
Zero code changes to your loop. Just add the callback.
```python
from lightning.pytorch import Trainer
from torch_audit import Auditor, AuditConfig
from torch_audit.callbacks import LightningAuditCallback

auditor = Auditor(model, config=AuditConfig(interval=100))
trainer = Trainer(callbacks=[LightningAuditCallback(auditor)])
```

### ๐ค Hugging Face Trainer
Plug-and-play with the Trainer API.
```python
from transformers import Trainer
from torch_audit import Auditor, AuditConfig
from torch_audit.callbacks import HFAuditCallback

config = AuditConfig(monitor_nlp=True, interval=500)
auditor = Auditor(model, config=config)

trainer = Trainer(..., callbacks=[HFAuditCallback(auditor)])
```

## ๐ ๏ธ Capabilities & Modules
### ๐ฅ๏ธ Hardware & System (Always Active)

* **Device Placement:** Detects "Split Brain" (CPU/GPU mix) and forgotten `.cuda()` calls.
* **Tensor Cores:** Warns if matrix multiplications aren't aligned to 8 (FP16) or 16 (INT8); see the sketch below.
* **Memory Layout:** Detects `NCHW` vs `NHWC` memory format issues.
* **Precision:** Suggests AMP/BFloat16 if the model is 100% FP32.

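For instance, a `Linear` layer with 127 input features (the `fc1` row in the sample report above) misses Tensor Core kernels under FP16; padding the dimension up to a multiple of 8 restores them. A minimal sketch of the fix, with illustrative layer sizes:

```python
import torch
import torch.nn as nn

# ๐ก Flagged: 127 is not divisible by 8, so FP16 matmuls
# fall back to slower non-Tensor-Core kernels.
slow = nn.Linear(127, 64)

# Fixed: pad the feature dimension up to a multiple of 8.
fast = nn.Linear(128, 64)

x = torch.randn(32, 128)
out = fast(x)  # [32, 64]; eligible for Tensor Core kernels under AMP
```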
### ๐งช Optimization & Stability

* **Config:** Warns if using `Adam` with `weight_decay` (suggests `AdamW`).
* **Regularization:** Detects weight decay applied to Biases or Norm layers (the standard fix is sketched below).
* **Dynamics:** Checks for low `epsilon` in Mixed Precision (underflow risk).

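The usual remedy for the regularization warning is plain PyTorch rather than anything torch-audit specific: split the parameters into two groups so biases and normalization weights skip weight decay. A sketch, assuming the `model` from the Quick Start:

```python
import torch.nn as nn
from torch.optim import AdamW

def build_param_groups(model: nn.Module, weight_decay: float = 0.01):
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        # Biases and 1-D params (LayerNorm/BatchNorm weights) get no decay.
        if param.ndim == 1 or name.endswith(".bias"):
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

optimizer = AdamW(build_param_groups(model), lr=3e-4)
```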
### ๐ NLP Mode
Detects tokenizer issues, padding waste, and untied embeddings.
```python
config = {
    'monitor_nlp': True,
    'pad_token_id': tokenizer.pad_token_id,
    'vocab_size': tokenizer.vocab_size,
}
auditor = Auditor(model, config=config)
```

* **Data Integrity:** Checks if `attention_mask` actually masks the padding tokens in `input_ids` (sketched below).
* **Efficiency:** Calculates wasted compute due to excessive padding (>50%).
* **Architecture:** Checks if Embedding weights are tied to the Output Head.

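To make the first two checks concrete, here is roughly what they look for, written as plain tensor ops; the package's internal thresholds and logic may differ:

```python
import torch

input_ids = torch.tensor([[101, 7592, 102, 0, 0, 0, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 0, 0, 0, 0, 0]])
pad_token_id = 0

# Data Integrity: every pad token should be masked out.
is_pad = input_ids.eq(pad_token_id)
mismatch = (is_pad & attention_mask.bool()).sum().item()  # 0 means consistent

# Efficiency: fraction of the batch that is pure padding.
padding_waste = is_pad.float().mean().item()  # 0.625 -> >50% wasted compute
```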
### ๐ผ๏ธ Computer Vision Mode
Detects normalization bugs (0-255 inputs) and dead convolution filters.
```python
auditor = Auditor(model, config={'monitor_cv': True})
```
* **Layout:** Detects accidental `[Batch, Height, Width, Channel]` input (crashes PyTorch).
* **Redundant Bias:** Detects `Conv2d(bias=True)` followed immediately by `BatchNorm` (see the sketch below).
* **Dead Filters:** Identifies convolution filters that have been pruned or collapsed to zero.

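For the redundant-bias case: `BatchNorm` subtracts the per-channel mean right after the convolution, so any preceding bias is cancelled and only wastes parameters and gradient noise. A sketch of the flagged pattern and its fix (illustrative modules, not taken from the package):

```python
import torch.nn as nn

# ๐ต Flagged: the bias is cancelled by BatchNorm's mean subtraction.
redundant = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, bias=True),
    nn.BatchNorm2d(16),
)

# Fixed: drop the bias; BatchNorm's own beta plays that role.
clean = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, bias=False),
    nn.BatchNorm2d(16),
)
```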
## โ๏ธ Configuration

You can configure the auditor via a dictionary or the `AuditConfig` object; an example follows the table.

| Parameter | Default | Description |
| :--- | :--- | :--- |
| `interval` | `1` | Run audit every N steps. Set to `1000` or more for production. |
| `limit` | `None` | Stop auditing after N reports. |
| `float_threshold` | `10.0` | Max value allowed in inputs before warning. |
| `monitor_dead_neurons` | `True` | Check for dead activations. |
| `graph_atomic_modules` | `[]` | List of custom layers (e.g. FlashAttn) to treat as leaves. |
| `monitor_graph` | `True` | Check for unused (zombie) layers. |
| `monitor_nlp` | `False` | Enable NLP-specific hooks (requires `pad_token_id`). |
| `monitor_cv` | `False` | Enable CV-specific hooks. |

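For example, a production-oriented setup might look like the sketch below; it assumes `AuditConfig` accepts the same keys as the dictionary form, including `pad_token_id`:

```python
from torch_audit import Auditor, AuditConfig

config = AuditConfig(
    interval=1000,     # audit every 1000 steps
    limit=5,           # stop after 5 reports
    monitor_nlp=True,  # NLP hooks...
    pad_token_id=0,    # ...which require the pad token id (assumed kwarg)
)
auditor = Auditor(model, optimizer, config=config)
```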

## ๐ญ Production Logging
For headless training where you can't see the console, switch to the `LogReporter`.
```python
from torch_audit.core.reporter import LogReporter

# Writes to standard Python logging (INFO/WARN/ERROR)
auditor = Auditor(model, reporters=[LogReporter()])
```
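
Since `LogReporter` emits through the standard `logging` module, make sure a handler is configured or the records may be dropped; this is stock Python setup, nothing torch-audit specific:

```python
import logging

# Attach a root handler so LogReporter's records are actually visible.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
```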
## ๐ ๏ธ Manual Triggering

Sometimes you want to audit on demand, for example when the loss spikes.
```python
loss = criterion(output, target)

if loss.item() > 10.0:
    print("Loss spike! Debugging next step...")
    auditor.schedule_next_step()  # Forces audit on next forward pass
```
## ๐ค Contributing & Feedback
Found a silent bug that `torch-audit` missed? Have a suggestion for a new Validator?
**[Open an Issue](https://github.com/RMalkiv/torch-audit/issues)!** We love feedback and contributions.

## License

Distributed under the MIT License.

+++ torch_audit-0.2.0/pyproject.toml
@@ -0,0 +1,65 @@
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "torch-audit"
version = "0.2.0"
description = "The Linter for PyTorch: Detects silent training bugs."
authors = ["Roman Malkiv <malkiv.roman@gmail.com>"]
readme = "README.md"
license = "MIT"
repository = "https://github.com/RMalkiv/torch-audit"
homepage = "https://github.com/RMalkiv/torch-audit"
packages = [{include = "torch_audit", from = "src"}]

keywords = ["pytorch", "audit", "debugging", "linter", "deep-learning"]

classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

[tool.poetry.dependencies]
python = "^3.8"
torch = ">=1.10.0"
rich = ">=12.0.0"

# --- Optional Integrations ---
lightning = {version = ">=2.0.0", optional = true}
accelerate = {version = ">=0.20.0", optional = true}
transformers = {version = ">=4.30.0", optional = true}
datasets = {version = ">=2.10.0", optional = true}
numpy = {version = ">=1.20.0", optional = true}

[tool.poetry.extras]
lightning = ["lightning"]
hf = ["transformers", "accelerate", "datasets"]
all = ["lightning", "transformers", "accelerate", "datasets", "numpy"]

[tool.poetry.group.dev.dependencies]
pytest = "^7.0"
black = "^23.0"
isort = "^5.0"
twine = "^4.0"
docutils = "<0.21"

[tool.black]
line-length = 88
target-version = ['py38']

[tool.isort]
profile = "black"

[tool.pytest.ini_options]
minversion = "6.0"
addopts = "-ra -q"
testpaths = ["tests"]

+++ torch_audit-0.2.0/src/torch_audit/__init__.py
@@ -0,0 +1,16 @@
from .core.auditor import Auditor
from .core.config import AuditConfig
from .core.reporter import LogReporter, RichConsoleReporter

from .callbacks import LightningAuditCallback, HFAuditCallback

__version__ = "0.2.0"

__all__ = [
    "Auditor",
    "AuditConfig",
    "LogReporter",
    "RichConsoleReporter",
    "LightningAuditCallback",
    "HFAuditCallback",
]

+++ torch_audit-0.2.0/src/torch_audit/callbacks.py
@@ -0,0 +1,51 @@
from .core.auditor import Auditor

try:
    from lightning.pytorch.callbacks import Callback as PLCallback

    class LightningAuditCallback(PLCallback):
        """
        Automatically audits LightningModule training steps.
        Add to `Trainer(callbacks=[LightningAuditCallback(auditor)])`.
        """

        def __init__(self, auditor: Auditor):
            self.auditor = auditor

        def on_fit_start(self, trainer, pl_module):
            self.auditor.audit_static()

        def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
            self.auditor.audit_data(batch)

            self.auditor.start_dynamic_audit()

        def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
            self.auditor.stop_dynamic_audit()

except ImportError:
    LightningAuditCallback = None

try:
    from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

    class HFAuditCallback(TrainerCallback):
        """
        Automatically audits HF Trainer steps.
        """

        def __init__(self, auditor: Auditor):
            self.auditor = auditor

        def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
            inputs = kwargs.get('inputs')
            if inputs is not None:
                self.auditor.audit_data(inputs)

            self.auditor.start_dynamic_audit()

        def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
            self.auditor.stop_dynamic_audit()

except ImportError:
    HFAuditCallback = None