torch-audit 0.2.0 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. torch_audit-0.2.0/PKG-INFO +251 -0
  2. torch_audit-0.2.0/README.md +215 -0
  3. torch_audit-0.2.0/pyproject.toml +65 -0
  4. torch_audit-0.2.0/src/torch_audit/__init__.py +16 -0
  5. torch_audit-0.2.0/src/torch_audit/callbacks.py +51 -0
  6. torch_audit-0.2.0/src/torch_audit/core/__init__.py +2 -0
  7. torch_audit-0.2.0/src/torch_audit/core/auditor.py +268 -0
  8. torch_audit-0.2.0/src/torch_audit/core/config.py +25 -0
  9. torch_audit-0.2.0/src/torch_audit/core/issue.py +18 -0
  10. torch_audit-0.2.0/src/torch_audit/core/reporter.py +73 -0
  11. torch_audit-0.2.0/src/torch_audit/core/validator.py +47 -0
  12. torch_audit-0.2.0/src/torch_audit/modules/__init__.py +3 -0
  13. torch_audit-0.2.0/src/torch_audit/modules/cv/__init__.py +0 -0
  14. torch_audit-0.2.0/src/torch_audit/modules/cv/images.py +100 -0
  15. torch_audit-0.2.0/src/torch_audit/modules/cv/layers.py +97 -0
  16. torch_audit-0.2.0/src/torch_audit/modules/general/__init__.py +0 -0
  17. torch_audit-0.2.0/src/torch_audit/modules/general/activations.py +116 -0
  18. torch_audit-0.2.0/src/torch_audit/modules/general/gradients.py +79 -0
  19. torch_audit-0.2.0/src/torch_audit/modules/general/graph.py +126 -0
  20. torch_audit-0.2.0/src/torch_audit/modules/general/hardware.py +195 -0
  21. torch_audit-0.2.0/src/torch_audit/modules/general/hygiene.py +104 -0
  22. torch_audit-0.2.0/src/torch_audit/modules/general/optimizer_config.py +100 -0
  23. torch_audit-0.2.0/src/torch_audit/modules/general/stability.py +72 -0
  24. torch_audit-0.2.0/src/torch_audit/modules/nlp/__init__.py +0 -0
  25. torch_audit-0.2.0/src/torch_audit/modules/nlp/structure.py +89 -0
  26. torch_audit-0.2.0/src/torch_audit/modules/nlp/tokenization.py +144 -0
@@ -0,0 +1,251 @@
+ Metadata-Version: 2.3
+ Name: torch-audit
+ Version: 0.2.0
+ Summary: The Linter for PyTorch: Detects silent training bugs.
+ License: MIT
+ Keywords: pytorch,audit,debugging,linter,deep-learning
+ Author: Roman Malkiv
+ Author-email: malkiv.roman@gmail.com
+ Requires-Python: >=3.8,<4.0
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Provides-Extra: all
+ Provides-Extra: hf
+ Provides-Extra: lightning
+ Requires-Dist: accelerate (>=0.20.0) ; extra == "hf" or extra == "all"
+ Requires-Dist: datasets (>=2.10.0) ; extra == "hf" or extra == "all"
+ Requires-Dist: lightning (>=2.0.0) ; extra == "lightning" or extra == "all"
+ Requires-Dist: numpy (>=1.20.0) ; extra == "all"
+ Requires-Dist: rich (>=12.0.0)
+ Requires-Dist: torch (>=1.10.0)
+ Requires-Dist: transformers (>=4.30.0) ; extra == "hf" or extra == "all"
+ Project-URL: Homepage, https://github.com/RMalkiv/torch-audit
+ Project-URL: Repository, https://github.com/RMalkiv/torch-audit
+ Description-Content-Type: text/markdown
+
+ # 🔥 torch-audit
+ ### The Linter for PyTorch Models
+
+ [![PyPI](https://img.shields.io/pypi/v/torch-audit)](https://pypi.org/project/torch-audit/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
+ **torch-audit** is a "check engine light" for your Deep Learning training loop. It detects silent bugs that don't crash your code but ruin your training or waste compute.
+
+ - 🖥️ **Hardware Efficiency:** Detects slow memory layouts (NHWC vs NCHW), mixed-precision failures, and tensor core misalignment.
+ - 🧪 **Data Integrity:** Catches broken attention masks, CV layout bugs, and silent NaN/Inf propagation.
+ - 📉 **Training Stability:** Identifies exploding gradients, bad optimizer config (Adam vs AdamW), and "dead" neurons.
+ - 🧟 **Graph Logic:** Identifies DDP-unsafe "Zombie" layers and redundant computations (e.g., Bias before BatchNorm).
+ - 🧠 **Domain Awareness:** Deep inspection for **NLP** (Padding waste, Tokenizer quality) and **CV** (Dead filters, Redundant biases).
+
+ ---
+
+ ## 📦 Installation
+
+ Install the standard version (lightweight):
+ ```bash
+ pip install torch-audit
+ ```
+
+ ### Optional Integrations:
+ ```bash
+ # For PyTorch Lightning support
+ pip install "torch-audit[lightning]"
+
+ # For Hugging Face Transformers support
+ pip install "torch-audit[hf]"
+
+ # For everything
+ pip install "torch-audit[all]"
+ ```
+
+ ## 🚀 Quick Start
+ You have two ways to use `torch-audit`: the **Decorator** (easiest) or the **Context Manager** (most control).
+
+ ### The Decorator Method (Recommended)
+ ```python
+ import torch
+ from torch_audit import Auditor, AuditConfig
+
+ # 1. Setup Auditor (Audits every 1000 steps)
+ config = AuditConfig(interval=1000)
+ auditor = Auditor(model, optimizer, config=config)
+
+ # 2. Static Audit (Run once before training)
+ # Checks architecture, unused layers, and weight initialization
+ auditor.audit_static()
+
+ # 3. Training Loop
+ # The decorator handles hooks, data auditing, and error reporting automatically.
+ @auditor.audit_step
+ def train_step(batch, targets):
+     optimizer.zero_grad()
+     pred = model(batch)
+     loss = criterion(pred, targets)
+     loss.backward()
+     optimizer.step()
+
+ for batch, targets in dataloader:
+     train_step(batch, targets)
+ ```
+ ### The Context Manager Method
+ ```python
+ # 3. Training Loop
+ for batch, targets in dataloader:
+     # Manual data check (optional but recommended)
+     auditor.audit_data(batch)
+
+     # Dynamic checks (Gradients, Activations, Stability)
+     with auditor.audit_dynamic():
+         pred = model(batch)
+         loss = criterion(pred, targets)
+         loss.backward()
+         optimizer.step()
+ ```
+ ### The Output
+ When a bug is found, `torch-audit` prints a structured report. It supports **Rich Console** tables (default) or **JSON/System Logs** for production.
+
+ ```text
+ 🚀 Audit Running (Step 5000)...
+ 🟡 Batch size is tiny (4). BatchNorm is unstable. (in Input Batch)
+
+ ⚠️ Audit Report (Step 5000)
+ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+ ┃ Type              ┃ Layer       ┃ Message                                       ┃
+ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+ │ 🔴 DDP Safety     │ ghost_layer │ Layer defined but NEVER called (Zombie).      │
+ │ 🔴 Data Integrity │ Input Batch │ Attention Mask mismatch on 50 tokens.         │
+ │ 🟡 Tensor Core    │ fc1         │ Dims (127->64) not divisible by 8.            │
+ │ 🟡 Stability      │ Global      │ Optimizer epsilon (1e-08) too low for AMP.    │
+ │ 🔵 CV Opt         │ conv1       │ Bias=True followed by BatchNorm (Redundant).  │
+ └───────────────────┴─────────────┴───────────────────────────────────────────────┘
+ ```
+ ## 📂 Runnable Demos
+ Don't just take our word for it: break things yourself! We have prepared deliberately sabotaged scripts that trigger auditor warnings.
+
+ Check out the `examples/` folder:
+ - `python examples/demo_general.py` (General hardware/optimizer issues)
+ - `python examples/demo_nlp.py` (NLP & Tokenizer bugs)
+ - `python examples/demo_cv.py` (Computer Vision bugs)
+ - `python examples/demo_lightning.py` (PyTorch Lightning integration)
+ - `python examples/demo_hf.py` (Hugging Face integration)
+ - `python examples/demo_accelerate.py` (Accelerate integration)
+
+ ## 🧩 Integrations
+ We support the ecosystem you already use.
+
+ ### ⚡ PyTorch Lightning
+ Zero code changes to your loop. Just add the callback.
+ ```python
+ from lightning.pytorch import Trainer
+ from torch_audit import Auditor, AuditConfig
+ from torch_audit.callbacks import LightningAuditCallback
+
+ auditor = Auditor(model, config=AuditConfig(interval=100))
+ trainer = Trainer(callbacks=[LightningAuditCallback(auditor)])
+ ```
+
+ ### 🤗 Hugging Face Trainer
+ Plug-and-play with the Trainer API.
+ ```python
+ from transformers import Trainer
+ from torch_audit import Auditor, AuditConfig
+ from torch_audit.callbacks import HFAuditCallback
+
+ config = AuditConfig(monitor_nlp=True, interval=500)
+ auditor = Auditor(model, config=config)
+
+ trainer = Trainer(..., callbacks=[HFAuditCallback(auditor)])
+ ```
+
+ ## 🛠️ Capabilities & Modules
+ ### 🖥️ Hardware & System (Always Active)
+
+ * **Device Placement:** Detects "Split Brain" (CPU/GPU mix) and forgotten `.cuda()` calls.
+ * **Tensor Cores:** Warns if matrix multiplication dimensions aren't multiples of 8 (FP16) or 16 (INT8); see the alignment sketch below.
+ * **Memory Layout:** Detects `NCHW` vs `NHWC` memory format issues.
+ * **Precision:** Suggests AMP/BFloat16 if the model is 100% FP32.
+
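+ For context: Tensor Core kernels only engage when matmul dimensions are multiples of 8 (FP16) or 16 (INT8), so the usual fix is to round layer widths up. A minimal sketch in plain PyTorch, not torch-audit API (`pad_to_multiple` is a hypothetical helper):
+ ```python
+ import torch.nn as nn
+
+ def pad_to_multiple(dim: int, multiple: int = 8) -> int:
+     """Round a dimension up to the next multiple (8 for FP16 Tensor Cores)."""
+     return ((dim + multiple - 1) // multiple) * multiple
+
+ # A 127 -> 64 Linear misses Tensor Cores on the input dim; 128 -> 64 does not.
+ fc1 = nn.Linear(pad_to_multiple(127), 64)  # pad_to_multiple(127) == 128
+ ```
+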
+ ### 🧪 Optimization & Stability
+
+ * **Config:** Warns if using `Adam` with `weight_decay` (suggests `AdamW`); see the sketch below.
+ * **Regularization:** Detects weight decay applied to Biases or Norm layers.
+ * **Dynamics:** Checks for low `epsilon` in Mixed Precision (underflow risk).
+
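+ To make these three checks concrete, here is a sketch in plain PyTorch (not torch-audit API) of the configuration they push you toward: `AdamW` instead of `Adam` + `weight_decay`, no decay on biases or norm parameters, and a larger `eps` for mixed precision (the hyperparameter values are illustrative):
+ ```python
+ import torch
+ from torch import nn
+
+ model = nn.Sequential(nn.Linear(128, 64), nn.LayerNorm(64), nn.Linear(64, 10))
+
+ # Decay only matrix weights; biases and norm scales/shifts are 1-D tensors.
+ decay = [p for p in model.parameters() if p.ndim >= 2]
+ no_decay = [p for p in model.parameters() if p.ndim < 2]
+
+ optimizer = torch.optim.AdamW(
+     [{"params": decay, "weight_decay": 0.01},
+      {"params": no_decay, "weight_decay": 0.0}],
+     lr=3e-4,
+     eps=1e-6,  # higher than the 1e-8 default, which can underflow under FP16 AMP
+ )
+ ```
+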
+ ### 📖 NLP Mode
+ Detects tokenizer issues, padding waste, and untied embeddings.
+ ```python
+ config = {
+     'monitor_nlp': True,
+     'pad_token_id': tokenizer.pad_token_id,
+     'vocab_size': tokenizer.vocab_size
+ }
+ auditor = Auditor(model, config=config)
+ ```
+
+ * **Data Integrity:** Checks if `attention_mask` actually masks the padding tokens in `input_ids` (illustrated below).
+ * **Efficiency:** Calculates wasted compute due to excessive padding (>50%).
+ * **Architecture:** Checks if Embedding weights are tied to the Output Head.
+
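+ As an illustration of the Data Integrity check, here is a hand-built batch (assuming `pad_token_id == 0`; the token IDs are made up) where the `attention_mask` disagrees with the padding in `input_ids`:
+ ```python
+ import torch
+
+ PAD_ID = 0  # assumption: tokenizer.pad_token_id == 0
+
+ # Second sequence ends in two PAD tokens.
+ input_ids = torch.tensor([[101, 2023, 2003,    102],
+                           [101, 2003, PAD_ID, PAD_ID]])
+
+ # BUG: an all-ones mask still attends to the padding positions.
+ attention_mask = torch.ones_like(input_ids)
+
+ # The mask the batch should carry; the difference is what gets reported.
+ expected = (input_ids != PAD_ID).long()
+ print((attention_mask != expected).sum().item(), "mismatched positions")  # 2
+ ```
+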
+ ### 🖼️ Computer Vision Mode
+ Detects normalization bugs (0-255 inputs) and dead convolution filters.
+ ```python
+ auditor = Auditor(model, config={'monitor_cv': True})
+ ```
+ * **Layout:** Detects accidental `[Batch, Height, Width, Channel]` input, which `Conv2d` (expecting `[Batch, Channel, Height, Width]`) rejects or silently misinterprets; see the sketch below.
+ * **Redundant Bias:** Detects `Conv2d(bias=True)` followed immediately by `BatchNorm`.
+ * **Dead Filters:** Identifies convolution filters that have been pruned or collapsed to zero.
+
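+ A sketch of the two input bugs this mode targets, built with plain tensors (the batch and image sizes are illustrative):
+ ```python
+ import torch
+
+ # BUG 1: raw 0-255 pixels; normalized models expect roughly [0, 1] inputs,
+ # so values like 255.0 blow past the float_threshold check.
+ raw = torch.randint(0, 256, (8, 3, 224, 224)).float()
+ normalized = raw / 255.0  # the usual fix: scale into [0, 1]
+
+ # BUG 2: channels-last [B, H, W, C] batch; Conv2d expects [B, C, H, W].
+ nhwc = torch.rand(8, 224, 224, 3)
+ nchw = nhwc.permute(0, 3, 1, 2).contiguous()  # -> [8, 3, 224, 224]
+ ```
+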
+ ## ⚙️ Configuration
+
+ You can configure the auditor via a dictionary or the `AuditConfig` object. A full example follows the table.
+
+ | Parameter | Default | Description |
+ | :--- | :--- | :--- |
+ | `interval` | `1` | Run audit every N steps. Set to `1000` or more for production. |
+ | `limit` | `None` | Stop auditing after N reports. |
+ | `float_threshold` | `10.0` | Max value allowed in inputs before warning. |
+ | `monitor_dead_neurons` | `True` | Check for dead (never-firing) activations. |
+ | `graph_atomic_modules` | `[]` | List of custom layers (e.g. FlashAttn) to treat as leaves. |
+ | `monitor_graph` | `True` | Check for unused (zombie) layers. |
+ | `monitor_nlp` | `False` | Enable NLP-specific hooks (requires `pad_token_id`). |
+ | `monitor_cv` | `False` | Enable CV-specific hooks. |
+
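+ Putting the table together, a sketch that constructs `AuditConfig` with the documented parameters (keyword names taken from the table above; the values are illustrative):
+ ```python
+ from torch_audit import Auditor, AuditConfig
+
+ config = AuditConfig(
+     interval=1000,              # audit every 1000th step (production-friendly)
+     limit=10,                   # stop after 10 audit reports
+     float_threshold=10.0,       # warn on input values above this
+     monitor_dead_neurons=True,  # activation-death checks
+     monitor_graph=True,         # zombie-layer detection
+     monitor_nlp=False,
+     monitor_cv=False,
+ )
+ auditor = Auditor(model, optimizer, config=config)
+ ```
+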
+ ## 🏭 Production Logging
+ For headless training where you can't see the console, switch to the `LogReporter`.
+ ```python
+ from torch_audit.core.reporter import LogReporter
+
+ # Writes to standard Python logging (INFO/WARN/ERROR)
+ auditor = Auditor(model, reporters=[LogReporter()])
+ ```
+ ## 🛠️ Manual Triggering
+
+ Sometimes you want to trigger an audit on demand, for example when the loss spikes.
+ ```python
+ loss = criterion(output, target)
+
+ if loss.item() > 10.0:
+     print("Loss spike! Debugging next step...")
+     auditor.schedule_next_step()  # Forces audit on next forward pass
+ ```
+ ## 🤝 Contributing & Feedback
+ Found a silent bug that `torch-audit` missed? Have a suggestion for a new Validator?
+ **[Open an Issue](https://github.com/RMalkiv/torch-audit/issues)!** We love feedback and contributions.
+
+ ## License
+
+ Distributed under the MIT License.
+
@@ -0,0 +1,215 @@
+ # 🔥 torch-audit
+ ### The Linter for PyTorch Models
+
+ [![PyPI](https://img.shields.io/pypi/v/torch-audit)](https://pypi.org/project/torch-audit/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
+ [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
+ **torch-audit** is a "check engine light" for your Deep Learning training loop. It detects silent bugs that don't crash your code but ruin your training or waste compute.
+
+ - 🖥️ **Hardware Efficiency:** Detects slow memory layouts (NHWC vs NCHW), mixed-precision failures, and tensor core misalignment.
+ - 🧪 **Data Integrity:** Catches broken attention masks, CV layout bugs, and silent NaN/Inf propagation.
+ - 📉 **Training Stability:** Identifies exploding gradients, bad optimizer config (Adam vs AdamW), and "dead" neurons.
+ - 🧟 **Graph Logic:** Identifies DDP-unsafe "Zombie" layers and redundant computations (e.g., Bias before BatchNorm).
+ - 🧠 **Domain Awareness:** Deep inspection for **NLP** (Padding waste, Tokenizer quality) and **CV** (Dead filters, Redundant biases).
+
+ ---
+
+ ## 📦 Installation
+
+ Install the standard version (lightweight):
+ ```bash
+ pip install torch-audit
+ ```
+
+ ### Optional Integrations:
+ ```bash
+ # For PyTorch Lightning support
+ pip install "torch-audit[lightning]"
+
+ # For Hugging Face Transformers support
+ pip install "torch-audit[hf]"
+
+ # For everything
+ pip install "torch-audit[all]"
+ ```
+
+ ## 🚀 Quick Start
+ You have two ways to use `torch-audit`: the **Decorator** (easiest) or the **Context Manager** (most control).
+
+ ### The Decorator Method (Recommended)
+ ```python
+ import torch
+ from torch_audit import Auditor, AuditConfig
+
+ # 1. Setup Auditor (Audits every 1000 steps)
+ config = AuditConfig(interval=1000)
+ auditor = Auditor(model, optimizer, config=config)
+
+ # 2. Static Audit (Run once before training)
+ # Checks architecture, unused layers, and weight initialization
+ auditor.audit_static()
+
+ # 3. Training Loop
+ # The decorator handles hooks, data auditing, and error reporting automatically.
+ @auditor.audit_step
+ def train_step(batch, targets):
+     optimizer.zero_grad()
+     pred = model(batch)
+     loss = criterion(pred, targets)
+     loss.backward()
+     optimizer.step()
+
+ for batch, targets in dataloader:
+     train_step(batch, targets)
+ ```
+ ### The Context Manager Method
+ ```python
+ # 3. Training Loop
+ for batch, targets in dataloader:
+     # Manual data check (optional but recommended)
+     auditor.audit_data(batch)
+
+     # Dynamic checks (Gradients, Activations, Stability)
+     with auditor.audit_dynamic():
+         pred = model(batch)
+         loss = criterion(pred, targets)
+         loss.backward()
+         optimizer.step()
+ ```
+ ### The Output
+ When a bug is found, `torch-audit` prints a structured report. It supports **Rich Console** tables (default) or **JSON/System Logs** for production.
+
+ ```text
+ 🚀 Audit Running (Step 5000)...
+ 🟡 Batch size is tiny (4). BatchNorm is unstable. (in Input Batch)
+
+ ⚠️ Audit Report (Step 5000)
+ ┏━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
+ ┃ Type              ┃ Layer       ┃ Message                                       ┃
+ ┡━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
+ │ 🔴 DDP Safety     │ ghost_layer │ Layer defined but NEVER called (Zombie).      │
+ │ 🔴 Data Integrity │ Input Batch │ Attention Mask mismatch on 50 tokens.         │
+ │ 🟡 Tensor Core    │ fc1         │ Dims (127->64) not divisible by 8.            │
+ │ 🟡 Stability      │ Global      │ Optimizer epsilon (1e-08) too low for AMP.    │
+ │ 🔵 CV Opt         │ conv1       │ Bias=True followed by BatchNorm (Redundant).  │
+ └───────────────────┴─────────────┴───────────────────────────────────────────────┘
+ ```
+ ## 📂 Runnable Demos
+ Don't just take our word for it: break things yourself! We have prepared deliberately sabotaged scripts that trigger auditor warnings.
+
+ Check out the `examples/` folder:
+ - `python examples/demo_general.py` (General hardware/optimizer issues)
+ - `python examples/demo_nlp.py` (NLP & Tokenizer bugs)
+ - `python examples/demo_cv.py` (Computer Vision bugs)
+ - `python examples/demo_lightning.py` (PyTorch Lightning integration)
+ - `python examples/demo_hf.py` (Hugging Face integration)
+ - `python examples/demo_accelerate.py` (Accelerate integration)
+
+ ## 🧩 Integrations
+ We support the ecosystem you already use.
+
+ ### ⚡ PyTorch Lightning
+ Zero code changes to your loop. Just add the callback.
+ ```python
+ from lightning.pytorch import Trainer
+ from torch_audit import Auditor, AuditConfig
+ from torch_audit.callbacks import LightningAuditCallback
+
+ auditor = Auditor(model, config=AuditConfig(interval=100))
+ trainer = Trainer(callbacks=[LightningAuditCallback(auditor)])
+ ```
+
+ ### 🤗 Hugging Face Trainer
+ Plug-and-play with the Trainer API.
+ ```python
+ from transformers import Trainer
+ from torch_audit import Auditor, AuditConfig
+ from torch_audit.callbacks import HFAuditCallback
+
+ config = AuditConfig(monitor_nlp=True, interval=500)
+ auditor = Auditor(model, config=config)
+
+ trainer = Trainer(..., callbacks=[HFAuditCallback(auditor)])
+ ```
+
+ ## 🛠️ Capabilities & Modules
+ ### 🖥️ Hardware & System (Always Active)
+
+ * **Device Placement:** Detects "Split Brain" (CPU/GPU mix) and forgotten `.cuda()` calls.
+ * **Tensor Cores:** Warns if matrix multiplication dimensions aren't multiples of 8 (FP16) or 16 (INT8); see the alignment sketch below.
+ * **Memory Layout:** Detects `NCHW` vs `NHWC` memory format issues.
+ * **Precision:** Suggests AMP/BFloat16 if the model is 100% FP32.
+
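+ For context: Tensor Core kernels only engage when matmul dimensions are multiples of 8 (FP16) or 16 (INT8), so the usual fix is to round layer widths up. A minimal sketch in plain PyTorch, not torch-audit API (`pad_to_multiple` is a hypothetical helper):
+ ```python
+ import torch.nn as nn
+
+ def pad_to_multiple(dim: int, multiple: int = 8) -> int:
+     """Round a dimension up to the next multiple (8 for FP16 Tensor Cores)."""
+     return ((dim + multiple - 1) // multiple) * multiple
+
+ # A 127 -> 64 Linear misses Tensor Cores on the input dim; 128 -> 64 does not.
+ fc1 = nn.Linear(pad_to_multiple(127), 64)  # pad_to_multiple(127) == 128
+ ```
+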
+ ### 🧪 Optimization & Stability
+
+ * **Config:** Warns if using `Adam` with `weight_decay` (suggests `AdamW`); see the sketch below.
+ * **Regularization:** Detects weight decay applied to Biases or Norm layers.
+ * **Dynamics:** Checks for low `epsilon` in Mixed Precision (underflow risk).
+
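+ To make these three checks concrete, here is a sketch in plain PyTorch (not torch-audit API) of the configuration they push you toward: `AdamW` instead of `Adam` + `weight_decay`, no decay on biases or norm parameters, and a larger `eps` for mixed precision (the hyperparameter values are illustrative):
+ ```python
+ import torch
+ from torch import nn
+
+ model = nn.Sequential(nn.Linear(128, 64), nn.LayerNorm(64), nn.Linear(64, 10))
+
+ # Decay only matrix weights; biases and norm scales/shifts are 1-D tensors.
+ decay = [p for p in model.parameters() if p.ndim >= 2]
+ no_decay = [p for p in model.parameters() if p.ndim < 2]
+
+ optimizer = torch.optim.AdamW(
+     [{"params": decay, "weight_decay": 0.01},
+      {"params": no_decay, "weight_decay": 0.0}],
+     lr=3e-4,
+     eps=1e-6,  # higher than the 1e-8 default, which can underflow under FP16 AMP
+ )
+ ```
+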
+ ### 📖 NLP Mode
+ Detects tokenizer issues, padding waste, and untied embeddings.
+ ```python
+ config = {
+     'monitor_nlp': True,
+     'pad_token_id': tokenizer.pad_token_id,
+     'vocab_size': tokenizer.vocab_size
+ }
+ auditor = Auditor(model, config=config)
+ ```
+
+ * **Data Integrity:** Checks if `attention_mask` actually masks the padding tokens in `input_ids` (illustrated below).
+ * **Efficiency:** Calculates wasted compute due to excessive padding (>50%).
+ * **Architecture:** Checks if Embedding weights are tied to the Output Head.
+
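+ As an illustration of the Data Integrity check, here is a hand-built batch (assuming `pad_token_id == 0`; the token IDs are made up) where the `attention_mask` disagrees with the padding in `input_ids`:
+ ```python
+ import torch
+
+ PAD_ID = 0  # assumption: tokenizer.pad_token_id == 0
+
+ # Second sequence ends in two PAD tokens.
+ input_ids = torch.tensor([[101, 2023, 2003,    102],
+                           [101, 2003, PAD_ID, PAD_ID]])
+
+ # BUG: an all-ones mask still attends to the padding positions.
+ attention_mask = torch.ones_like(input_ids)
+
+ # The mask the batch should carry; the difference is what gets reported.
+ expected = (input_ids != PAD_ID).long()
+ print((attention_mask != expected).sum().item(), "mismatched positions")  # 2
+ ```
+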
+ ### 🖼️ Computer Vision Mode
+ Detects normalization bugs (0-255 inputs) and dead convolution filters.
+ ```python
+ auditor = Auditor(model, config={'monitor_cv': True})
+ ```
+ * **Layout:** Detects accidental `[Batch, Height, Width, Channel]` input, which `Conv2d` (expecting `[Batch, Channel, Height, Width]`) rejects or silently misinterprets; see the sketch below.
+ * **Redundant Bias:** Detects `Conv2d(bias=True)` followed immediately by `BatchNorm`.
+ * **Dead Filters:** Identifies convolution filters that have been pruned or collapsed to zero.
+
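+ A sketch of the two input bugs this mode targets, built with plain tensors (the batch and image sizes are illustrative):
+ ```python
+ import torch
+
+ # BUG 1: raw 0-255 pixels; normalized models expect roughly [0, 1] inputs,
+ # so values like 255.0 blow past the float_threshold check.
+ raw = torch.randint(0, 256, (8, 3, 224, 224)).float()
+ normalized = raw / 255.0  # the usual fix: scale into [0, 1]
+
+ # BUG 2: channels-last [B, H, W, C] batch; Conv2d expects [B, C, H, W].
+ nhwc = torch.rand(8, 224, 224, 3)
+ nchw = nhwc.permute(0, 3, 1, 2).contiguous()  # -> [8, 3, 224, 224]
+ ```
+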
+ ## ⚙️ Configuration
+
+ You can configure the auditor via a dictionary or the `AuditConfig` object. A full example follows the table.
+
+ | Parameter | Default | Description |
+ | :--- | :--- | :--- |
+ | `interval` | `1` | Run audit every N steps. Set to `1000` or more for production. |
+ | `limit` | `None` | Stop auditing after N reports. |
+ | `float_threshold` | `10.0` | Max value allowed in inputs before warning. |
+ | `monitor_dead_neurons` | `True` | Check for dead (never-firing) activations. |
+ | `graph_atomic_modules` | `[]` | List of custom layers (e.g. FlashAttn) to treat as leaves. |
+ | `monitor_graph` | `True` | Check for unused (zombie) layers. |
+ | `monitor_nlp` | `False` | Enable NLP-specific hooks (requires `pad_token_id`). |
+ | `monitor_cv` | `False` | Enable CV-specific hooks. |
+
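+ Putting the table together, a sketch that constructs `AuditConfig` with the documented parameters (keyword names taken from the table above; the values are illustrative):
+ ```python
+ from torch_audit import Auditor, AuditConfig
+
+ config = AuditConfig(
+     interval=1000,              # audit every 1000th step (production-friendly)
+     limit=10,                   # stop after 10 audit reports
+     float_threshold=10.0,       # warn on input values above this
+     monitor_dead_neurons=True,  # activation-death checks
+     monitor_graph=True,         # zombie-layer detection
+     monitor_nlp=False,
+     monitor_cv=False,
+ )
+ auditor = Auditor(model, optimizer, config=config)
+ ```
+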
+ ## 🏭 Production Logging
+ For headless training where you can't see the console, switch to the `LogReporter`.
+ ```python
+ from torch_audit.core.reporter import LogReporter
+
+ # Writes to standard Python logging (INFO/WARN/ERROR)
+ auditor = Auditor(model, reporters=[LogReporter()])
+ ```
+ ## 🛠️ Manual Triggering
+
+ Sometimes you want to trigger an audit on demand, for example when the loss spikes.
+ ```python
+ loss = criterion(output, target)
+
+ if loss.item() > 10.0:
+     print("Loss spike! Debugging next step...")
+     auditor.schedule_next_step()  # Forces audit on next forward pass
+ ```
+ ## 🤝 Contributing & Feedback
+ Found a silent bug that `torch-audit` missed? Have a suggestion for a new Validator?
+ **[Open an Issue](https://github.com/RMalkiv/torch-audit/issues)!** We love feedback and contributions.
+
+ ## License
+
+ Distributed under the MIT License.
@@ -0,0 +1,65 @@
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.poetry]
+ name = "torch-audit"
+ version = "0.2.0"
+ description = "The Linter for PyTorch: Detects silent training bugs."
+ authors = ["Roman Malkiv <malkiv.roman@gmail.com>"]
+ readme = "README.md"
+ license = "MIT"
+ repository = "https://github.com/RMalkiv/torch-audit"
+ homepage = "https://github.com/RMalkiv/torch-audit"
+ packages = [{include = "torch_audit", from = "src"}]
+
+ keywords = ["pytorch", "audit", "debugging", "linter", "deep-learning"]
+
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3",
+     "Programming Language :: Python :: 3.8",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ ]
+
+ [tool.poetry.dependencies]
+ python = "^3.8"
+ torch = ">=1.10.0"
+ rich = ">=12.0.0"
+
+ # --- Optional Integrations ---
+ lightning = {version = ">=2.0.0", optional = true}
+ accelerate = {version = ">=0.20.0", optional = true}
+ transformers = {version = ">=4.30.0", optional = true}
+ datasets = {version = ">=2.10.0", optional = true}
+ numpy = {version = ">=1.20.0", optional = true}
+
+ [tool.poetry.extras]
+ lightning = ["lightning"]
+ hf = ["transformers", "accelerate", "datasets"]
+ all = ["lightning", "transformers", "accelerate", "datasets", "numpy"]
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^7.0"
+ black = "^23.0"
+ isort = "^5.0"
+ twine = "^4.0"
+ docutils = "<0.21"
+
+ [tool.black]
+ line-length = 88
+ target-version = ['py38']
+
+ [tool.isort]
+ profile = "black"
+
+ [tool.pytest.ini_options]
+ minversion = "6.0"
+ addopts = "-ra -q"
+ testpaths = ["tests"]
@@ -0,0 +1,16 @@
+ from .core.auditor import Auditor
+ from .core.config import AuditConfig
+ from .core.reporter import LogReporter, RichConsoleReporter
+
+ from .callbacks import LightningAuditCallback, HFAuditCallback
+
+ __version__ = "0.2.0"
+
+ __all__ = [
+     "Auditor",
+     "AuditConfig",
+     "LogReporter",
+     "RichConsoleReporter",
+     "LightningAuditCallback",
+     "HFAuditCallback"
+ ]
@@ -0,0 +1,51 @@
+ from .core.auditor import Auditor
+
+ try:
+     from lightning.pytorch.callbacks import Callback as PLCallback
+
+     class LightningAuditCallback(PLCallback):
+         """
+         Automatically audits LightningModule training steps.
+         Add to `Trainer(callbacks=[LightningAuditCallback(auditor)])`.
+         """
+
+         def __init__(self, auditor: Auditor):
+             self.auditor = auditor
+
+         def on_fit_start(self, trainer, pl_module):
+             self.auditor.audit_static()
+
+         def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
+             self.auditor.audit_data(batch)
+
+             self.auditor.start_dynamic_audit()
+
+         def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
+             self.auditor.stop_dynamic_audit()
+
+ except ImportError:
+     LightningAuditCallback = None
+
+ try:
+     from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
+
+     class HFAuditCallback(TrainerCallback):
+         """
+         Automatically audits HF Trainer steps.
+         """
+
+         def __init__(self, auditor: Auditor):
+             self.auditor = auditor
+
+         def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+             inputs = kwargs.get('inputs')
+             if inputs is not None:
+                 self.auditor.audit_data(inputs)
+
+             self.auditor.start_dynamic_audit()
+
+         def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+             self.auditor.stop_dynamic_audit()
+
+ except ImportError:
+     HFAuditCallback = None
@@ -0,0 +1,2 @@
+ from .auditor import Auditor
+ from .config import AuditConfig