uni-layer 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- uni_layer-0.2.0/LICENSE +21 -0
- uni_layer-0.2.0/PKG-INFO +459 -0
- uni_layer-0.2.0/README.md +403 -0
- uni_layer-0.2.0/pyproject.toml +112 -0
- uni_layer-0.2.0/setup.cfg +4 -0
- uni_layer-0.2.0/setup.py +9 -0
- uni_layer-0.2.0/tests/test_all_metrics.py +439 -0
- uni_layer-0.2.0/tests/test_analyzer.py +173 -0
- uni_layer-0.2.0/tests/test_benchmark.py +407 -0
- uni_layer-0.2.0/tests/test_cache.py +96 -0
- uni_layer-0.2.0/tests/test_fast_math.py +151 -0
- uni_layer-0.2.0/tests/test_hf_adapter.py +342 -0
- uni_layer-0.2.0/tests/test_integrations.py +131 -0
- uni_layer-0.2.0/tests/test_metrics.py +190 -0
- uni_layer-0.2.0/uni_layer/__init__.py +36 -0
- uni_layer-0.2.0/uni_layer/benchmark/__init__.py +5 -0
- uni_layer-0.2.0/uni_layer/benchmark/runner.py +343 -0
- uni_layer-0.2.0/uni_layer/cli.py +157 -0
- uni_layer-0.2.0/uni_layer/compression/__init__.py +14 -0
- uni_layer-0.2.0/uni_layer/compression/pruner.py +334 -0
- uni_layer-0.2.0/uni_layer/core/__init__.py +6 -0
- uni_layer-0.2.0/uni_layer/core/analyzer.py +470 -0
- uni_layer-0.2.0/uni_layer/core/base_metric.py +185 -0
- uni_layer-0.2.0/uni_layer/core/base_metric_cn.py +435 -0
- uni_layer-0.2.0/uni_layer/core/cache.py +204 -0
- uni_layer-0.2.0/uni_layer/core/schema.py +177 -0
- uni_layer-0.2.0/uni_layer/experimental/__init__.py +11 -0
- uni_layer-0.2.0/uni_layer/experimental/distiller.py +477 -0
- uni_layer-0.2.0/uni_layer/experimental/peft.py +486 -0
- uni_layer-0.2.0/uni_layer/integrations/__init__.py +23 -0
- uni_layer-0.2.0/uni_layer/integrations/distillation.py +189 -0
- uni_layer-0.2.0/uni_layer/integrations/huggingface_peft.py +251 -0
- uni_layer-0.2.0/uni_layer/integrations/torch_pruning.py +212 -0
- uni_layer-0.2.0/uni_layer/metrics/__init__.py +51 -0
- uni_layer-0.2.0/uni_layer/metrics/architecture_specific/__init__.py +5 -0
- uni_layer-0.2.0/uni_layer/metrics/architecture_specific/attention_flow.py +194 -0
- uni_layer-0.2.0/uni_layer/metrics/bayesian/__init__.py +5 -0
- uni_layer-0.2.0/uni_layer/metrics/bayesian/laplace_posterior.py +241 -0
- uni_layer-0.2.0/uni_layer/metrics/information_theory/__init__.py +6 -0
- uni_layer-0.2.0/uni_layer/metrics/information_theory/entropy.py +111 -0
- uni_layer-0.2.0/uni_layer/metrics/information_theory/mutual_information.py +129 -0
- uni_layer-0.2.0/uni_layer/metrics/optimization/__init__.py +7 -0
- uni_layer-0.2.0/uni_layer/metrics/optimization/fisher_information.py +126 -0
- uni_layer-0.2.0/uni_layer/metrics/optimization/gradient_norm.py +134 -0
- uni_layer-0.2.0/uni_layer/metrics/optimization/hessian_trace.py +158 -0
- uni_layer-0.2.0/uni_layer/metrics/representation/__init__.py +6 -0
- uni_layer-0.2.0/uni_layer/metrics/representation/block_influence.py +119 -0
- uni_layer-0.2.0/uni_layer/metrics/representation/jacobian_rank.py +156 -0
- uni_layer-0.2.0/uni_layer/metrics/robustness/__init__.py +5 -0
- uni_layer-0.2.0/uni_layer/metrics/robustness/droplayer.py +221 -0
- uni_layer-0.2.0/uni_layer/metrics/spectral/__init__.py +7 -0
- uni_layer-0.2.0/uni_layer/metrics/spectral/cka.py +198 -0
- uni_layer-0.2.0/uni_layer/metrics/spectral/effective_rank.py +163 -0
- uni_layer-0.2.0/uni_layer/metrics/spectral/ntk.py +131 -0
- uni_layer-0.2.0/uni_layer/utils/__init__.py +20 -0
- uni_layer-0.2.0/uni_layer/utils/fast_math.py +242 -0
- uni_layer-0.2.0/uni_layer/utils/hook_utils.py +79 -0
- uni_layer-0.2.0/uni_layer/utils/layer_utils.py +254 -0
- uni_layer-0.2.0/uni_layer/utils/model_adapter.py +133 -0
- uni_layer-0.2.0/uni_layer/utils/report.py +348 -0
- uni_layer-0.2.0/uni_layer/visualization/__init__.py +27 -0
- uni_layer-0.2.0/uni_layer/visualization/interactive.py +285 -0
- uni_layer-0.2.0/uni_layer/visualization/plot_utils.py +238 -0
- uni_layer-0.2.0/uni_layer.egg-info/PKG-INFO +459 -0
- uni_layer-0.2.0/uni_layer.egg-info/SOURCES.txt +67 -0
- uni_layer-0.2.0/uni_layer.egg-info/dependency_links.txt +1 -0
- uni_layer-0.2.0/uni_layer.egg-info/not-zip-safe +1 -0
- uni_layer-0.2.0/uni_layer.egg-info/requires.txt +38 -0
- uni_layer-0.2.0/uni_layer.egg-info/top_level.txt +1 -0
uni_layer-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Uni-Layer Team
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
uni_layer-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: uni-layer
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A Universal Framework for Layer Contribution Analysis
|
|
5
|
+
Author-email: Uni-Layer Team <contact@uni-layer.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/GeoffreyWang1117/Uni-Layer
|
|
8
|
+
Project-URL: Repository, https://github.com/GeoffreyWang1117/Uni-Layer
|
|
9
|
+
Project-URL: Issues, https://github.com/GeoffreyWang1117/Uni-Layer/issues
|
|
10
|
+
Keywords: deep learning,layer analysis,interpretability,model compression
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Requires-Python: >=3.8
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: torch>=1.12.0
|
|
23
|
+
Requires-Dist: numpy>=1.21.0
|
|
24
|
+
Requires-Dist: scipy>=1.7.0
|
|
25
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
26
|
+
Requires-Dist: matplotlib>=3.5.0
|
|
27
|
+
Requires-Dist: seaborn>=0.11.0
|
|
28
|
+
Requires-Dist: pandas>=1.3.0
|
|
29
|
+
Requires-Dist: tqdm>=4.62.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pytest-cov>=3.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: flake8>=4.0.0; extra == "dev"
|
|
35
|
+
Requires-Dist: mypy>=0.950; extra == "dev"
|
|
36
|
+
Requires-Dist: isort>=5.10.0; extra == "dev"
|
|
37
|
+
Provides-Extra: docs
|
|
38
|
+
Requires-Dist: sphinx>=4.5.0; extra == "docs"
|
|
39
|
+
Requires-Dist: sphinx-rtd-theme>=1.0.0; extra == "docs"
|
|
40
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.18.0; extra == "docs"
|
|
41
|
+
Provides-Extra: viz
|
|
42
|
+
Requires-Dist: networkx>=2.6.0; extra == "viz"
|
|
43
|
+
Requires-Dist: plotly>=5.0.0; extra == "viz"
|
|
44
|
+
Provides-Extra: integrations
|
|
45
|
+
Requires-Dist: torch-pruning>=1.2.0; extra == "integrations"
|
|
46
|
+
Requires-Dist: peft>=0.6.0; extra == "integrations"
|
|
47
|
+
Requires-Dist: transformers>=4.20.0; extra == "integrations"
|
|
48
|
+
Provides-Extra: all
|
|
49
|
+
Requires-Dist: transformers>=4.20.0; extra == "all"
|
|
50
|
+
Requires-Dist: timm>=0.6.0; extra == "all"
|
|
51
|
+
Requires-Dist: networkx>=2.6.0; extra == "all"
|
|
52
|
+
Requires-Dist: plotly>=5.0.0; extra == "all"
|
|
53
|
+
Requires-Dist: torch-pruning>=1.2.0; extra == "all"
|
|
54
|
+
Requires-Dist: peft>=0.6.0; extra == "all"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
# Uni-Layer
|
|
58
|
+
|
|
59
|
+
**Understand your layers before you optimize them.**
|
|
60
|
+
|
|
61
|
+
[](https://pypi.org/project/uni-layer/)
|
|
62
|
+
[](https://www.python.org/downloads/)
|
|
63
|
+
[](https://opensource.org/licenses/MIT)
|
|
64
|
+
[]()
|
|
65
|
+
|
|
66
|
+
Uni-Layer is a PyTorch toolkit that scores every layer in your neural network across **13 metrics in 7 theoretical categories**. It tells you which layers matter most — so you can prune smarter, fine-tune better, and distill more effectively.
|
|
67
|
+
|
|
68
|
+
**[English](#quick-start)** | **[中文](#中文说明)**
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Why Uni-Layer?
|
|
73
|
+
|
|
74
|
+
Most compression and fine-tuning tools treat all layers equally or rely on simple magnitude heuristics. Uni-Layer replaces guesswork with principled, multi-metric layer analysis.
|
|
75
|
+
|
|
76
|
+
There is no other library that does this. Captum does input attribution. Torch-Pruning does structural pruning. TransformerLens does mechanistic interpretability. **Uni-Layer is the only tool that unifies 13 layer importance metrics under one API and bridges them to downstream tools.**
|
|
77
|
+
|
|
78
|
+
| You want to... | Uni-Layer provides | Works with |
|
|
79
|
+
|---|---|---|
|
|
80
|
+
| **Prune** a model | Per-layer importance scores & pruning ratios | [Torch-Pruning](https://github.com/VainF/Torch-Pruning) |
|
|
81
|
+
| **LoRA fine-tune** | Which layers to target, adaptive rank allocation | [HuggingFace PEFT](https://github.com/huggingface/peft) |
|
|
82
|
+
| **Distill** knowledge | Layer pairing & per-layer distillation weights | Any distillation framework |
|
|
83
|
+
| **Understand** a model | Multi-metric layer contribution profile | Standalone |
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Quick Start
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install uni-layer
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from uni_layer import LayerAnalyzer
|
|
95
|
+
from uni_layer.metrics import GradientNorm, CKA, BlockInfluence
|
|
96
|
+
|
|
97
|
+
analyzer = LayerAnalyzer(model, task_type='classification')
|
|
98
|
+
contributions = analyzer.compute_metrics(
|
|
99
|
+
metrics=[GradientNorm(), CKA(), BlockInfluence()],
|
|
100
|
+
data_loader=train_loader,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Rank layers by importance
|
|
104
|
+
for name, score in analyzer.rank_layers(contributions, 'gradient_norm'):
|
|
105
|
+
print(f" {name}: {score:.4f}")
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Output Format
|
|
111
|
+
|
|
112
|
+
Every call to `compute_metrics()` returns a structured dict. Here is a real example from a 4-layer MLP:
|
|
113
|
+
|
|
114
|
+
```json
|
|
115
|
+
{
|
|
116
|
+
"0": {
|
|
117
|
+
"layer_idx": 0,
|
|
118
|
+
"layer_type": "linear",
|
|
119
|
+
"gradient_norm": 0.0193,
|
|
120
|
+
"gradient_norm_std": 0.0016,
|
|
121
|
+
"cka_score": 0.4161,
|
|
122
|
+
"effective_rank": 10.54,
|
|
123
|
+
"block_influence": 1.0,
|
|
124
|
+
"fisher_information": 0.0001
|
|
125
|
+
},
|
|
126
|
+
"2": {
|
|
127
|
+
"layer_idx": 1,
|
|
128
|
+
"layer_type": "linear",
|
|
129
|
+
"gradient_norm": 0.0494,
|
|
130
|
+
"cka_score": 0.5449,
|
|
131
|
+
"effective_rank": 20.18,
|
|
132
|
+
"block_influence": 1.0,
|
|
133
|
+
"fisher_information": 0.0002
|
|
134
|
+
},
|
|
135
|
+
"4": {
|
|
136
|
+
"layer_idx": 2,
|
|
137
|
+
"layer_type": "linear",
|
|
138
|
+
"gradient_norm": 0.0624,
|
|
139
|
+
"cka_score": 0.6233,
|
|
140
|
+
"effective_rank": 9.58,
|
|
141
|
+
"block_influence": 1.0,
|
|
142
|
+
"fisher_information": 0.0003
|
|
143
|
+
},
|
|
144
|
+
"6": {
|
|
145
|
+
"layer_idx": 3,
|
|
146
|
+
"layer_type": "linear",
|
|
147
|
+
"gradient_norm": 0.1094,
|
|
148
|
+
"cka_score": 1.0,
|
|
149
|
+
"effective_rank": 2.36,
|
|
150
|
+
"block_influence": 1.0,
|
|
151
|
+
"fisher_information": 0.0009
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
`rank_layers()` returns sorted `(name, score)` tuples:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
[("6", 0.1094), ("4", 0.0624), ("2", 0.0494), ("0", 0.0193)]
|
|
160
|
+
# Layer 6 (output head) contributes most; Layer 0 (input) contributes least.
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
And here is a 4-block Transformer analyzed with `GradientNorm`, `BlockInfluence`, and `EffectiveRank`:
|
|
164
|
+
|
|
165
|
+
```
|
|
166
|
+
Layer Type GradNorm BlockInfluence EffectiveRank
|
|
167
|
+
--------------------------------------------------------------------------------
|
|
168
|
+
blocks.0 transformer_block 0.1425 0.0278 94.47
|
|
169
|
+
blocks.1 transformer_block 0.1404 0.0275 94.21
|
|
170
|
+
blocks.2 transformer_block 0.1319 0.0265 93.92
|
|
171
|
+
blocks.3 transformer_block 0.1276 0.0269 93.66
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
> Early blocks have slightly higher gradient norms — they are adapting more. BlockInfluence is low everywhere (all ~0.027) because residual connections dominate, meaning each block's transformation is small relative to the skip path. EffectiveRank is uniformly high (~94), indicating rich, non-degenerate representations.
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## 13 Metrics in 7 Categories
|
|
179
|
+
|
|
180
|
+
| Category | Metrics | What it measures |
|
|
181
|
+
|---|---|---|
|
|
182
|
+
| **Optimization** | `GradientNorm`, `HessianTrace`, `FisherInformation` | How much the layer affects the loss landscape |
|
|
183
|
+
| **Spectral** | `CKA`, `EffectiveRank`, `NTKTrace` | Representation similarity, diversity, kernel influence |
|
|
184
|
+
| **Information Theory** | `ActivationEntropy`, `MutualInformation` | Information content and task relevance |
|
|
185
|
+
| **Representation** | `JacobianRank`, `BlockInfluence` | Expressiveness and layer redundancy |
|
|
186
|
+
| **Robustness** | `DropLayerRobustness` | Performance impact of removing the layer |
|
|
187
|
+
| **Bayesian** | `LaplacePosterior` | Parameter uncertainty (Laplace approximation) |
|
|
188
|
+
| **Architecture** | `AttentionFlow` | Attention entropy, head diversity (Transformers) |
|
|
189
|
+
|
|
190
|
+
Each metric returns a dict with a **primary key** (used for ranking) and optional secondary keys:
|
|
191
|
+
|
|
192
|
+
| Metric | Primary Key | Additional Keys |
|
|
193
|
+
|---|---|---|
|
|
194
|
+
| GradientNorm | `gradient_norm` | `gradient_norm_std`, `_max`, `_min` |
|
|
195
|
+
| HessianTrace | `hessian_trace` | `hessian_trace_std` |
|
|
196
|
+
| FisherInformation | `fisher_information` | `fisher_mean` |
|
|
197
|
+
| CKA | `cka_score` | |
|
|
198
|
+
| EffectiveRank | `effective_rank` | `stable_rank`, `rank_ratio` |
|
|
199
|
+
| NTKTrace | `ntk_trace` | `ntk_trace_per_param` |
|
|
200
|
+
| ActivationEntropy | `activation_entropy` | `activation_mean`, `_std`, `_sparsity` |
|
|
201
|
+
| MutualInformation | `mutual_information` | `mi_max`, `mi_std` |
|
|
202
|
+
| JacobianRank | `jacobian_rank` | `jacobian_rank_ratio`, `_condition`, `_max_sv` |
|
|
203
|
+
| BlockInfluence | `block_influence` | `block_similarity` |
|
|
204
|
+
| DropLayerRobustness | `droplayer_loss_increase` | `droplayer_loss_ratio` |
|
|
205
|
+
| LaplacePosterior | `laplace_posterior` | `laplace_posterior_std` |
|
|
206
|
+
| AttentionFlow | `attention_entropy` | `attention_max_weight`, `head_diversity`, `attention_distance` |
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## Integration Bridges
|
|
211
|
+
|
|
212
|
+
### Torch-Pruning
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
from uni_layer.integrations import TorchPruningBridge
|
|
216
|
+
|
|
217
|
+
bridge = TorchPruningBridge(model, contributions)
|
|
218
|
+
|
|
219
|
+
# Important layers get low pruning ratios, unimportant layers get high ratios
|
|
220
|
+
pruning_ratios = bridge.as_layer_pruning_ratios(
|
|
221
|
+
metric_name='gradient_norm', target_sparsity=0.5
|
|
222
|
+
)
|
|
223
|
+
protected = bridge.get_protected_layers(top_k=3)
|
|
224
|
+
|
|
225
|
+
# Use with torch-pruning
|
|
226
|
+
import torch_pruning as tp
|
|
227
|
+
pruner = tp.pruner.MetaPruner(
|
|
228
|
+
model, example_inputs,
|
|
229
|
+
importance=tp.importance.MagnitudeImportance(),
|
|
230
|
+
pruning_ratio_dict=pruning_ratios,
|
|
231
|
+
)
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### HuggingFace PEFT
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from uni_layer.integrations import HuggingFacePEFTBridge
|
|
238
|
+
from peft import LoraConfig, get_peft_model
|
|
239
|
+
|
|
240
|
+
bridge = HuggingFacePEFTBridge(model, contributions)
|
|
241
|
+
|
|
242
|
+
# Auto-select LoRA targets and adaptive rank
|
|
243
|
+
config_params = bridge.recommend_lora_config_params(metric_name='gradient_norm')
|
|
244
|
+
peft_model = get_peft_model(model, LoraConfig(**config_params))
|
|
245
|
+
|
|
246
|
+
# Or fine-grained control: different rank per layer
|
|
247
|
+
ranks = bridge.recommend_adaptive_ranks(base_rank=8, max_rank=64)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Knowledge Distillation
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
from uni_layer.integrations import DistillationBridge
|
|
254
|
+
|
|
255
|
+
bridge = DistillationBridge(teacher, student, contributions)
|
|
256
|
+
|
|
257
|
+
pairs = bridge.recommend_layer_pairs(top_k=4) # teacher-student layer mapping
|
|
258
|
+
weights = bridge.recommend_layer_weights() # per-layer distillation weights
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## HuggingFace Model Support
|
|
264
|
+
|
|
265
|
+
Uni-Layer natively handles HuggingFace models that return dataclass/dict outputs, with automatic `attention_mask` injection:
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
from transformers import AutoModel
|
|
269
|
+
from uni_layer import LayerAnalyzer
|
|
270
|
+
from uni_layer.metrics import GradientNorm, BlockInfluence
|
|
271
|
+
|
|
272
|
+
model = AutoModel.from_pretrained("bert-base-uncased")
|
|
273
|
+
analyzer = LayerAnalyzer(model, task_type='classification')
|
|
274
|
+
|
|
275
|
+
# Just works -- dict outputs, attention_mask, labels all handled automatically
|
|
276
|
+
contributions = analyzer.compute_metrics(
|
|
277
|
+
metrics=[GradientNorm(), BlockInfluence()],
|
|
278
|
+
data_loader=tokenized_loader,
|
|
279
|
+
)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Examples
|
|
285
|
+
|
|
286
|
+
| Example | Model | File |
|
|
287
|
+
|---|---|---|
|
|
288
|
+
| ResNet layer analysis | ResNet-18 (CNN) | [`examples/resnet_layer_analysis.py`](examples/resnet_layer_analysis.py) |
|
|
289
|
+
| ViT attention analysis | Vision Transformer | [`examples/vit_layer_analysis.py`](examples/vit_layer_analysis.py) |
|
|
290
|
+
| BERT layer analysis + LoRA | BERT-style Transformer | [`examples/bert_layer_analysis.py`](examples/bert_layer_analysis.py) |
|
|
291
|
+
| Torch-Pruning integration | Any model | [`examples/integrate_torch_pruning.py`](examples/integrate_torch_pruning.py) |
|
|
292
|
+
| HuggingFace PEFT integration | Any model | [`examples/integrate_huggingface_peft.py`](examples/integrate_huggingface_peft.py) |
|
|
293
|
+
| Knowledge distillation | Teacher-Student | [`examples/integrate_distillation.py`](examples/integrate_distillation.py) |
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## Installation
|
|
298
|
+
|
|
299
|
+
```bash
|
|
300
|
+
pip install uni-layer # core
|
|
301
|
+
pip install "uni-layer[integrations]" # + torch-pruning, peft, transformers
|
|
302
|
+
pip install "uni-layer[dev]" # + pytest, black, flake8, mypy
|
|
303
|
+
pip install "uni-layer[all]" # everything
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
From source:
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
git clone https://github.com/GeoffreyWang1117/Uni-Layer.git
|
|
310
|
+
cd Uni-Layer && pip install -e ".[dev]"
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
---
|
|
314
|
+
|
|
315
|
+
## Roadmap
|
|
316
|
+
|
|
317
|
+
### v0.3.0 (Next)
|
|
318
|
+
- [ ] Diffusion model support (UNet timestep-aware analysis)
|
|
319
|
+
- [ ] Mamba / SSM architecture support
|
|
320
|
+
- [ ] MoE router layer analysis
|
|
321
|
+
- [ ] Residual-aware DropLayer metric (understand skip connections)
|
|
322
|
+
- [ ] Layer-to-layer CKA similarity matrix
|
|
323
|
+
|
|
324
|
+
### v0.4.0
|
|
325
|
+
- [ ] GNN support (PyG MessagePassing layers)
|
|
326
|
+
- [ ] Multi-modal model branch analysis (vision encoder + language decoder)
|
|
327
|
+
- [ ] Wanda-style importance (weight x activation norm)
|
|
328
|
+
- [ ] IG-based sensitivity scoring (IGU-LoRA style)
|
|
329
|
+
- [ ] Export to ONNX / TensorRT optimization hints
|
|
330
|
+
|
|
331
|
+
### v1.0.0
|
|
332
|
+
- [ ] Stable API with full backward compatibility
|
|
333
|
+
- [ ] Interactive web dashboard for layer analysis
|
|
334
|
+
- [ ] Distributed analysis for large models (FSDP/DeepSpeed)
|
|
335
|
+
- [ ] Pre-computed analysis for popular models (BERT, LLaMA, ViT, etc.)
|
|
336
|
+
- [ ] Academic paper and comprehensive benchmark suite
|
|
337
|
+
|
|
338
|
+
---
|
|
339
|
+
|
|
340
|
+
## Citation
|
|
341
|
+
|
|
342
|
+
```bibtex
|
|
343
|
+
@software{unilayer2025,
|
|
344
|
+
title={Uni-Layer: A Universal Framework for Layer Contribution Analysis},
|
|
345
|
+
author={Geoffrey Wang},
|
|
346
|
+
year={2025},
|
|
347
|
+
url={https://github.com/GeoffreyWang1117/Uni-Layer}
|
|
348
|
+
}
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
## License
|
|
352
|
+
|
|
353
|
+
MIT License. See [LICENSE](LICENSE).
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
357
|
+
<a id="中文说明"></a>
|
|
358
|
+
|
|
359
|
+
# 中文说明
|
|
360
|
+
|
|
361
|
+
## Uni-Layer:神经网络层贡献度分析框架
|
|
362
|
+
|
|
363
|
+
**先理解你的层,再优化它们。**
|
|
364
|
+
|
|
365
|
+
Uni-Layer 是一个 PyTorch 工具库,通过 **7 大理论类别的 13 种指标** 为神经网络的每一层打分,告诉你哪些层最重要——从而实现更精准的剪枝、更高效的微调和更有效的蒸馏。
|
|
366
|
+
|
|
367
|
+
### 核心优势
|
|
368
|
+
|
|
369
|
+
- **唯一的层重要性通用评分库**:Captum 做输入归因,Torch-Pruning 做剪枝,TransformerLens 做机制解释——只有 Uni-Layer 把 13 种层重要性指标统一到一个 API 中
|
|
370
|
+
- **与下游工具解耦**:通过 Bridge 模式无缝连接 Torch-Pruning / PEFT / 蒸馏框架
|
|
371
|
+
- **兼容 HuggingFace**:自动处理 dict/dataclass 输出、attention_mask、labels 透传
|
|
372
|
+
|
|
373
|
+
### 快速开始
|
|
374
|
+
|
|
375
|
+
```bash
|
|
376
|
+
pip install uni-layer
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
```python
|
|
380
|
+
from uni_layer import LayerAnalyzer
|
|
381
|
+
from uni_layer.metrics import GradientNorm, CKA, BlockInfluence
|
|
382
|
+
|
|
383
|
+
analyzer = LayerAnalyzer(model, task_type='classification')
|
|
384
|
+
contributions = analyzer.compute_metrics(
|
|
385
|
+
metrics=[GradientNorm(), CKA(), BlockInfluence()],
|
|
386
|
+
data_loader=train_loader,
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
# 按重要性排序
|
|
390
|
+
for name, score in analyzer.rank_layers(contributions, 'gradient_norm'):
|
|
391
|
+
print(f" {name}: {score:.4f}")
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### 输出格式
|
|
395
|
+
|
|
396
|
+
`compute_metrics()` 返回结构化字典:
|
|
397
|
+
|
|
398
|
+
```python
|
|
399
|
+
{
|
|
400
|
+
"layer_name": {
|
|
401
|
+
"layer_idx": 0, # 层索引
|
|
402
|
+
"layer_type": "linear", # 层类型
|
|
403
|
+
"gradient_norm": 0.0193, # 各指标值
|
|
404
|
+
"cka_score": 0.4161,
|
|
405
|
+
"block_influence": 1.0,
|
|
406
|
+
...
|
|
407
|
+
},
|
|
408
|
+
...
|
|
409
|
+
}
|
|
410
|
+
```
|
|
411
|
+
|
|
412
|
+
`rank_layers()` 返回排序后的元组列表:
|
|
413
|
+
|
|
414
|
+
```python
|
|
415
|
+
[("layer_6", 0.1094), ("layer_4", 0.0624), ...] # 降序
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
### 13 种指标
|
|
419
|
+
|
|
420
|
+
| 类别 | 指标 | 衡量内容 |
|
|
421
|
+
|---|---|---|
|
|
422
|
+
| 优化几何 | GradientNorm, HessianTrace, FisherInformation | 层对损失曲面的影响 |
|
|
423
|
+
| 谱方法 | CKA, EffectiveRank, NTKTrace | 表征相似性、多样性、核影响力 |
|
|
424
|
+
| 信息论 | ActivationEntropy, MutualInformation | 信息含量与任务相关性 |
|
|
425
|
+
| 表征结构 | JacobianRank, BlockInfluence | 表达能力与层冗余度 |
|
|
426
|
+
| 鲁棒性 | DropLayerRobustness | 移除该层后的性能损失 |
|
|
427
|
+
| 贝叶斯 | LaplacePosterior | 参数不确定性 |
|
|
428
|
+
| 架构特定 | AttentionFlow | 注意力熵、头多样性 (Transformer) |
|
|
429
|
+
|
|
430
|
+
### 集成桥
|
|
431
|
+
|
|
432
|
+
```python
|
|
433
|
+
# Torch-Pruning:重要层少剪,不重要层多剪
|
|
434
|
+
from uni_layer.integrations import TorchPruningBridge
|
|
435
|
+
bridge = TorchPruningBridge(model, contributions)
|
|
436
|
+
ratios = bridge.as_layer_pruning_ratios(target_sparsity=0.5)
|
|
437
|
+
|
|
438
|
+
# PEFT:自动选择 LoRA 目标层和自适应秩
|
|
439
|
+
from uni_layer.integrations import HuggingFacePEFTBridge
|
|
440
|
+
bridge = HuggingFacePEFTBridge(model, contributions)
|
|
441
|
+
config = bridge.recommend_lora_config_params()
|
|
442
|
+
|
|
443
|
+
# 蒸馏:教师-学生层配对和权重分配
|
|
444
|
+
from uni_layer.integrations import DistillationBridge
|
|
445
|
+
bridge = DistillationBridge(teacher, student, contributions)
|
|
446
|
+
pairs = bridge.recommend_layer_pairs(top_k=4)
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
### 路线图
|
|
450
|
+
|
|
451
|
+
**v0.3.0**:扩散模型支持 / Mamba-SSM / MoE 路由层分析 / 残差感知 DropLayer / 层间 CKA 矩阵
|
|
452
|
+
|
|
453
|
+
**v0.4.0**:GNN 支持 / 多模态分支分析 / Wanda 重要性 / IG 灵敏度 / ONNX 导出
|
|
454
|
+
|
|
455
|
+
**v1.0.0**:稳定 API / Web 可视化面板 / 分布式分析 / 预计算热门模型 / 学术论文
|
|
456
|
+
|
|
457
|
+
### 许可证
|
|
458
|
+
|
|
459
|
+
MIT License。详见 [LICENSE](LICENSE)。
|