quantbenchx 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantbenchx/__init__.py +132 -0
- quantbenchx/_types.py +220 -0
- quantbenchx/bandwidth.py +290 -0
- quantbenchx/cli.py +153 -0
- quantbenchx/compare.py +101 -0
- quantbenchx/imatrix.py +201 -0
- quantbenchx/layerwise.py +167 -0
- quantbenchx/matrix.py +289 -0
- quantbenchx/perplexity.py +168 -0
- quantbenchx/predict.py +125 -0
- quantbenchx/profile.py +301 -0
- quantbenchx/py.typed +0 -0
- quantbenchx/recommend.py +240 -0
- quantbenchx/report.py +171 -0
- quantbenchx-0.3.0.dist-info/METADATA +213 -0
- quantbenchx-0.3.0.dist-info/RECORD +17 -0
- quantbenchx-0.3.0.dist-info/WHEEL +4 -0
quantbenchx/cli.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""CLI for quantbenchx."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _build_cli():  # type: ignore[no-untyped-def]
    """Build and return the ``click`` command group for the quantbenchx CLI.

    ``click`` is imported inside this function, so a missing optional CLI
    dependency surfaces as a clean ``SystemExit`` message instead of an
    ``ImportError`` traceback.

    Returns:
        The root ``click`` group exposing the ``profile``, ``compare``,
        ``layers`` and ``recommend`` sub-commands.

    Raises:
        SystemExit: If ``click`` cannot be imported.
    """
    try:
        import click
    except ImportError:
        raise SystemExit("CLI dependencies required: pip install quantbenchx[cli]") from None

    def _load_profile(model_path: str):  # type: ignore[no-untyped-def]
        """Profile *model_path*, picking the parser from its file suffix.

        Shared by every sub-command so the format dispatch lives in one place.
        The suffix is compared case-insensitively (``model.GGUF`` works too).
        An unsupported suffix prints an error to stderr and exits with code 1.
        """
        from quantbenchx.profile import profile_gguf, profile_safetensors

        path = Path(model_path)
        suffix = path.suffix.lower()
        if suffix == ".gguf":
            return profile_gguf(path)
        if suffix == ".safetensors":
            return profile_safetensors(path)
        click.echo(f"Unknown format: {path.suffix}", err=True)
        raise SystemExit(1)

    @click.group()
    @click.version_option(package_name="quantbenchx")
    def cli() -> None:
        """quantbenchx — quantization quality analyzer."""

    @cli.command()
    @click.argument("model_path", type=click.Path(exists=True))
    @click.option("--json-out", "-o", type=click.Path(), default=None)
    @click.option("--markdown", "-m", is_flag=True)
    def profile(model_path: str, json_out: Optional[str], markdown: bool) -> None:
        """Profile a quantized model file (GGUF or safetensors)."""
        from quantbenchx.predict import estimate_quality
        from quantbenchx.report import (
            format_markdown,
            format_report_rich,
            format_report_text,
            save_json,
        )

        prof = _load_profile(model_path)
        quality = estimate_quality(prof)

        if markdown:
            click.echo(format_markdown(prof, quality))
        else:
            # Deliberate best-effort: any failure in the rich renderer falls
            # back to the plain-text report rather than aborting the command.
            # NOTE(review): presumably this guards against `rich` being
            # unavailable — consider narrowing the exception type once
            # format_report_rich's failure modes are confirmed.
            try:
                click.echo(format_report_rich(prof, quality))
            except Exception:
                click.echo(format_report_text(prof, quality))

        if json_out:
            save_json(prof, json_out, quality)
            # Status message goes to stderr so stdout stays pipeable.
            click.echo(f"Report saved to {json_out}", err=True)

    @cli.command()
    @click.argument("model_a", type=click.Path(exists=True))
    @click.argument("model_b", type=click.Path(exists=True))
    def compare(model_a: str, model_b: str) -> None:
        """Compare two quantized model files."""
        from quantbenchx.compare import compare_profiles

        prof_a = _load_profile(model_a)
        prof_b = _load_profile(model_b)
        result = compare_profiles(prof_a, prof_b)
        click.echo(json.dumps(result, indent=2, default=str))

    @cli.command()
    @click.argument("model_path", type=click.Path(exists=True))
    def layers(model_path: str) -> None:
        """Show layerwise analysis for a quantized model."""
        from quantbenchx.layerwise import analyze_layers

        prof = _load_profile(model_path)
        analysis = analyze_layers(prof)

        # Only the optional imports are guarded; rendering errors propagate.
        try:
            from rich.console import Console
            from rich.table import Table
        except ImportError:
            # Plain-text fallback when rich is not installed.
            for row in analysis:
                click.echo(
                    f"{row['name']:30s} {row['n_params']:>12,} "
                    f"{row['avg_bits_per_weight']:>6.2f} {row['dominant_dtype']:8s} "
                    f"{row['sensitivity']:.3f}"
                )
            return

        console = Console()
        table = Table(title="Layerwise Analysis")
        table.add_column("Layer", style="bold")
        table.add_column("Params", justify="right")
        table.add_column("BPW", justify="right")
        table.add_column("Dtype")
        table.add_column("Sensitivity", justify="right")

        for row in analysis:
            sens = row["sensitivity"]
            # Colour bands: < 0.5 green, < 0.7 yellow, otherwise red.
            sens_style = "green" if sens < 0.5 else "yellow" if sens < 0.7 else "red"
            table.add_row(
                row["name"],
                f"{row['n_params']:,}",
                f"{row['avg_bits_per_weight']:.2f}",
                row["dominant_dtype"],
                f"[{sens_style}]{sens:.3f}[/{sens_style}]",
            )
        console.print(table)

    @cli.command()
    @click.argument("model_path", type=click.Path(exists=True))
    @click.option("--target-bpw", type=float, default=4.5)
    def recommend(model_path: str, target_bpw: float) -> None:
        """Recommend mixed-precision quantization strategy."""
        from quantbenchx.layerwise import recommend_mixed_quant

        prof = _load_profile(model_path)
        result = recommend_mixed_quant(prof, target_bpw=target_bpw)
        click.echo(json.dumps(result, indent=2, default=str))

    return cli
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Module-level command group, built eagerly when this module is imported;
# _build_cli() exits the process with a hint if click is not installed.
cli = _build_cli()
|
quantbenchx/compare.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Compare quantized model profiles across formats and methods."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
|
|
7
|
+
from quantbenchx._types import ModelProfile
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compare_profiles(profile_a: ModelProfile, profile_b: ModelProfile) -> Dict[str, Any]:
    """Build a structured diff between two model profiles.

    Args:
        profile_a: Baseline profile (the "a" side of every key).
        profile_b: Profile compared against the baseline (the "b" side).

    Returns:
        A dict with size, bits-per-weight, format and method figures for both
        sides, the per-tensor dtype changes among tensors present in both
        profiles (at most 50 listed; ``n_dtype_changes`` holds the full
        count), and up to 20 tensor names unique to each side.
        ``size_ratio`` is ``b / a`` and is ``0.0`` when ``a`` has no bytes.
    """
    bytes_a = profile_a.total_size_bytes
    bytes_b = profile_b.total_size_bytes
    ratio = bytes_b / bytes_a if bytes_a > 0 else 0.0

    avg_a = profile_a.quant.avg_bits_per_weight
    avg_b = profile_b.quant.avg_bits_per_weight

    # Index tensors by name so both sides can be matched up.
    by_name_a = {tensor.name: tensor for tensor in profile_a.tensors}
    by_name_b = {tensor.name: tensor for tensor in profile_b.tensors}
    names_a = set(by_name_a)
    names_b = set(by_name_b)

    # Tensors present on both sides whose dtype differs, in name order.
    changed: List[Dict[str, Any]] = [
        {
            "tensor": name,
            "dtype_a": by_name_a[name].dtype.value,
            "dtype_b": by_name_b[name].dtype.value,
            "bpw_a": by_name_a[name].bits_per_weight,
            "bpw_b": by_name_b[name].bits_per_weight,
        }
        for name in sorted(names_a & names_b)
        if by_name_a[name].dtype != by_name_b[name].dtype
    ]

    missing_in_b = sorted(names_a - names_b)
    missing_in_a = sorted(names_b - names_a)

    summary: Dict[str, Any] = {}
    summary["model_a"] = profile_a.name
    summary["model_b"] = profile_b.name
    summary["format_a"] = profile_a.format.value
    summary["format_b"] = profile_b.format.value
    summary["size_a_gb"] = round(profile_a.size_gb, 3)
    summary["size_b_gb"] = round(profile_b.size_gb, 3)
    summary["size_delta_bytes"] = bytes_b - bytes_a
    summary["size_ratio"] = round(ratio, 3)
    summary["bpw_a"] = round(avg_a, 2)
    summary["bpw_b"] = round(avg_b, 2)
    summary["bpw_delta"] = round(avg_b - avg_a, 2)
    summary["method_a"] = profile_a.quant.method.value
    summary["method_b"] = profile_b.quant.method.value
    summary["n_dtype_changes"] = len(changed)
    summary["dtype_changes"] = changed[:50]  # keep the output bounded
    summary["tensors_only_in_a"] = missing_in_b[:20]
    summary["tensors_only_in_b"] = missing_in_a[:20]
    summary["params_a"] = profile_a.total_params
    summary["params_b"] = profile_b.total_params
    return summary
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def compare_formats(profiles: List[ModelProfile]) -> Dict[str, Any]:
    """Compare multiple quantization formats of the same base model.

    Args:
        profiles: Profiles to summarise; may be empty.

    Returns:
        A dict with three keys, present for empty input as well:

        * ``models``  – one summary row per profile, in input order.
        * ``ranking`` – the same row objects ordered by average
          bits-per-weight (highest first); ties are broken in favour of the
          higher compression ratio.
        * ``n_models`` – number of profiles given.

        Each row carries a 1-based ``rank``. The rows in ``models`` and
        ``ranking`` are the same dict objects, so ``rank`` is visible through
        both lists.
    """
    if not profiles:
        # Same shape as the non-empty result so callers can always read n_models.
        return {"models": [], "ranking": [], "n_models": 0}

    rows: List[Dict[str, Any]] = [
        {
            "name": p.name,
            "format": p.format.value,
            "method": p.quant.method.value,
            "size_gb": round(p.size_gb, 3),
            "avg_bpw": round(p.quant.avg_bits_per_weight, 2),
            "compression_ratio": round(p.compression_ratio, 2),
            "n_params": p.total_params,
            "n_quantized": p.quant.n_quantized_layers,
            "n_full_precision": p.quant.n_full_precision_layers,
        }
        for p in profiles
    ]

    # Higher bpw ranks first; at equal bpw the higher compression ratio wins.
    # Both components sort descending, so neither is negated under reverse=True
    # (negating the ratio here would rank the *lower* compression first).
    ranked = sorted(
        rows,
        key=lambda r: (r["avg_bpw"], r["compression_ratio"]),
        reverse=True,
    )
    for position, row in enumerate(ranked, start=1):
        row["rank"] = position

    return {
        "models": rows,
        "ranking": ranked,
        "n_models": len(profiles),
    }
|
quantbenchx/imatrix.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""GGUF importance matrix (imatrix) parsing and analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
import struct
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Optional, Union
|
|
10
|
+
|
|
11
|
+
from quantbenchx._types import QuantbenchError
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class ImatrixEntry:
    """Importance data recorded for one tensor of an imatrix file."""

    # Tensor name.
    name: str
    # Number of float values declared for this tensor.
    num_values: int
    # Call counter stored alongside the values.
    num_calls: int
    # The importance values themselves.
    values: list[float]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ImatrixData:
    """All entries of an importance matrix plus an overall call count."""

    # Entries in the order they were supplied.
    entries: list[ImatrixEntry]
    # Overall call count for the matrix.
    total_calls: int

    def by_name(self, name: str) -> Optional[ImatrixEntry]:
        """Return the first entry whose name equals *name*, or ``None``."""
        return next((item for item in self.entries if item.name == name), None)

    def top_k(self, k: int) -> list[ImatrixEntry]:
        """Return the *k* entries with the highest mean importance, best first.

        An entry without values counts as mean ``0.0``; entries with equal
        means keep their relative order from ``entries``.
        """
        ranked = sorted(
            self.entries,
            key=lambda item: sum(item.values) / len(item.values) if item.values else 0.0,
            reverse=True,
        )
        return ranked[:k]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class ImatrixAnalysis:
    """Summary statistics derived from an importance matrix."""

    # Number of tensor entries that were analysed.
    total_tensors: int
    # Tensor name -> mean of its importance values.
    mean_importance_per_layer: Dict[str, float]
    # Tensor name -> variance of its importance values.
    variance_per_layer: Dict[str, float]
    # Names flagged as outliers.
    outlier_layers: list[str]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def parse_imatrix(path: Union[str, Path]) -> ImatrixData:
    """Parse a binary imatrix file.

    The file is a sequence of little-endian records, one per tensor:

        name_length (int32) + name (UTF-8 bytes) + num_values (int32)
        + num_calls (int32) + values (float32 * num_values)

    Args:
        path: Location of the imatrix file.

    Returns:
        The parsed entries in file order. ``total_calls`` is the largest
        ``num_calls`` seen across all records (``0`` for an empty file).

    Raises:
        QuantbenchError: If the file does not exist, a record is truncated,
            a length field is invalid, or a tensor name is not valid UTF-8.
    """
    path = Path(path)
    if not path.exists():
        raise QuantbenchError(f"imatrix file not found: {path}")

    data = path.read_bytes()
    size = len(data)
    int32 = struct.Struct("<i")  # every header field is a little-endian int32

    entries: list[ImatrixEntry] = []
    total_calls = 0
    offset = 0

    while offset < size:
        # Fewer than four trailing bytes cannot start a record; they are
        # tolerated and ignored rather than treated as an error.
        if offset + 4 > size:
            break
        record_start = offset
        (name_length,) = int32.unpack_from(data, offset)
        offset += 4

        name_end = offset + name_length
        if name_length <= 0 or name_end > size:
            raise QuantbenchError(
                f"Invalid name length {name_length} at offset {record_start}"
            )

        # A corrupt name is reported like every other malformed-file
        # condition instead of leaking a raw UnicodeDecodeError.
        try:
            name = data[offset:name_end].decode("utf-8")
        except UnicodeDecodeError as err:
            raise QuantbenchError(
                f"Invalid UTF-8 tensor name at offset {offset}"
            ) from err
        offset = name_end

        if offset + 4 > size:
            raise QuantbenchError(f"Unexpected EOF reading num_values for '{name}'")
        (num_values,) = int32.unpack_from(data, offset)
        offset += 4
        if num_values < 0:
            raise QuantbenchError(f"Invalid num_values {num_values} for '{name}'")

        if offset + 4 > size:
            raise QuantbenchError(f"Unexpected EOF reading num_calls for '{name}'")
        (num_calls,) = int32.unpack_from(data, offset)
        offset += 4

        # Bounds-check the payload before unpacking it in one call.
        values_size = num_values * 4
        if offset + values_size > size:
            raise QuantbenchError(
                f"Unexpected EOF reading {num_values} values for '{name}'"
            )
        values = list(struct.unpack_from(f"<{num_values}f", data, offset))
        offset += values_size

        total_calls = max(total_calls, num_calls)
        entries.append(
            ImatrixEntry(
                name=name,
                num_values=num_values,
                num_calls=num_calls,
                values=values,
            )
        )

    return ImatrixData(entries=entries, total_calls=total_calls)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def analyze_imatrix(data: ImatrixData) -> ImatrixAnalysis:
    """Compute per-tensor statistics and flag unusually important tensors.

    For every entry the mean and the population variance (divided by the
    number of values) are recorded; an entry without values gets ``0.0`` for
    both. A tensor is an outlier when its mean exceeds the mean of all
    per-tensor means by more than two standard deviations of those means.

    Args:
        data: Parsed importance matrix.

    Returns:
        The statistics, keyed by tensor name, plus the outlier names.
    """
    means: Dict[str, float] = {}
    variances: Dict[str, float] = {}

    for entry in data.entries:
        vals = entry.values
        if vals:
            count = len(vals)
            avg = sum(vals) / count
            spread = sum((v - avg) ** 2 for v in vals) / count
        else:
            avg = 0.0
            spread = 0.0
        means[entry.name] = avg
        variances[entry.name] = spread

    flagged: list[str] = []
    per_tensor = list(means.values())
    if per_tensor:
        how_many = len(per_tensor)
        centre = sum(per_tensor) / how_many
        deviation = math.sqrt(sum((m - centre) ** 2 for m in per_tensor) / how_many)
        cutoff = centre + 2 * deviation
        # Strictly greater than the cutoff: identical means flag nothing.
        flagged = [tensor for tensor, avg in means.items() if avg > cutoff]

    return ImatrixAnalysis(
        total_tensors=len(data.entries),
        mean_importance_per_layer=means,
        variance_per_layer=variances,
        outlier_layers=flagged,
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def format_imatrix_report(analysis: ImatrixAnalysis) -> str:
    """Render *analysis* as a plain-text report.

    The report lists every tensor with its mean and variance, ordered by mean
    importance (highest first), tags outliers inline, and ends with a summary
    of the outlier names.

    Args:
        analysis: Statistics to render.

    Returns:
        The report as one newline-joined string (no trailing newline).
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60
    flagged = set(analysis.outlier_layers)

    report: list[str] = [
        heavy_rule,
        "Importance Matrix Analysis Report",
        heavy_rule,
        f"Total tensors: {analysis.total_tensors}",
        "",
        "Layer Importance (descending):",
        light_rule,
        f"{'Layer':<40} {'Mean':>10} {'Variance':>10}",
        light_rule,
    ]

    by_importance = sorted(
        analysis.mean_importance_per_layer.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    for layer, mean in by_importance:
        spread = analysis.variance_per_layer.get(layer, 0.0)
        marker = " [OUTLIER]" if layer in flagged else ""
        report.append(f"{layer:<40} {mean:>10.6f} {spread:>10.6f}{marker}")

    report.append("")
    if not analysis.outlier_layers:
        report.append("No outlier layers detected.")
    else:
        report.append(f"Outlier layers ({len(analysis.outlier_layers)}):")
        report.extend(f" - {layer}" for layer in analysis.outlier_layers)

    report.append(heavy_rule)
    return "\n".join(report)
|
quantbenchx/layerwise.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Layerwise analysis — sensitivity estimation and mixed-quant recommendations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List
|
|
6
|
+
|
|
7
|
+
from quantbenchx._types import DType, ModelProfile
|
|
8
|
+
|
|
9
|
+
# Sensitivity heuristics based on layer position and type
|
|
10
|
+
_SENSITIVITY_KEYWORDS = {
|
|
11
|
+
"embed": 0.9,
|
|
12
|
+
"lm_head": 0.9,
|
|
13
|
+
"output": 0.8,
|
|
14
|
+
"norm": 0.85,
|
|
15
|
+
"ln": 0.85,
|
|
16
|
+
"layernorm": 0.85,
|
|
17
|
+
"rmsnorm": 0.85,
|
|
18
|
+
"attn_q": 0.7,
|
|
19
|
+
"attn_k": 0.7,
|
|
20
|
+
"attn_v": 0.65,
|
|
21
|
+
"attn_output": 0.6,
|
|
22
|
+
"q_proj": 0.7,
|
|
23
|
+
"k_proj": 0.7,
|
|
24
|
+
"v_proj": 0.65,
|
|
25
|
+
"o_proj": 0.6,
|
|
26
|
+
"gate": 0.5,
|
|
27
|
+
"up": 0.5,
|
|
28
|
+
"down": 0.45,
|
|
29
|
+
"ffn": 0.45,
|
|
30
|
+
"mlp": 0.45,
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
# Position-based sensitivity multiplier (first and last layers are more sensitive)
|
|
34
|
+
_POSITION_CURVE = {
|
|
35
|
+
"first_10pct": 1.3,
|
|
36
|
+
"last_10pct": 1.2,
|
|
37
|
+
"middle": 1.0,
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def analyze_layers(profile: ModelProfile) -> List[Dict[str, Any]]:
    """Summarise the quantization characteristics of every layer.

    Args:
        profile: Model profile whose ``layers`` are inspected in order.

    Returns:
        One dict per layer, in profile order, with the keys ``name``,
        ``n_params``, ``n_tensors``, ``size_bytes``, ``avg_bits_per_weight``
        (rounded to 2 places), ``dominant_dtype`` (the dtype's ``value``) and
        ``sensitivity`` (rounded to 3 places).
    """
    layers = profile.layers
    total = len(layers)
    return [
        {
            "name": layer.name,
            "n_params": layer.n_params,
            "n_tensors": len(layer.tensors),
            "size_bytes": layer.size_bytes,
            "avg_bits_per_weight": round(layer.avg_bits_per_weight, 2),
            "dominant_dtype": layer.dominant_dtype.value,
            # The heuristic depends on the layer's position within the stack.
            "sensitivity": round(
                _estimate_layer_sensitivity(layer.name, index, total), 3
            ),
        }
        for index, layer in enumerate(layers)
    ]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def layer_sensitivity(profile: ModelProfile) -> Dict[str, float]:
    """Map each layer name to its estimated sensitivity, rounded to 3 places.

    Layers are visited in profile order; if two layers share a name, the
    later one's score is the one kept.
    """
    layers = profile.layers
    total = len(layers)
    scores: Dict[str, float] = {}
    for index, layer in enumerate(layers):
        scores[layer.name] = round(
            _estimate_layer_sensitivity(layer.name, index, total), 3
        )
    return scores
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def recommend_mixed_quant(
    profile: ModelProfile,
    target_bpw: float = 4.5,
    high_quant: DType = DType.Q5_K_M,
    low_quant: DType = DType.Q4_K_M,
) -> Dict[str, Any]:
    """Recommend a mixed-precision quantization strategy.

    Layers are ordered by estimated sensitivity (most sensitive first). The
    largest prefix of that ordering that can use ``high_quant`` while the
    parameter-weighted average bits-per-weight stays at or below
    ``target_bpw`` is promoted; every other layer gets ``low_quant``.

    Args:
        profile: Model profile whose layers are assigned a dtype.
        target_bpw: Upper bound for the average bits-per-weight.
        high_quant: Dtype given to the most sensitive layers.
        low_quant: Dtype given to the remaining layers.

    Returns:
        A dict with the per-layer ``strategy`` (most sensitive first), the
        resulting ``estimated_avg_bpw``, the inputs that produced it, and the
        number of layers on each side. When even an all-``low_quant``
        assignment exceeds the target, no layer is promoted, since that is
        the closest achievable average. For a profile without layers or
        parameters the strategy is empty and the average is ``0.0``.
    """
    result: Dict[str, Any] = {
        "strategy": [],
        "estimated_avg_bpw": 0.0,
        "target_bpw": target_bpw,
        "high_quant": high_quant.value,
        "low_quant": low_quant.value,
        "n_high_precision_layers": 0,
        "n_low_precision_layers": 0,
    }

    n_layers = len(profile.layers)
    if n_layers == 0:
        return result

    # (sensitivity, layer_name, n_params), most sensitive first.
    scored: List[tuple] = [
        (_estimate_layer_sensitivity(layer.name, i, n_layers), layer.name, layer.n_params)
        for i, layer in enumerate(profile.layers)
    ]
    scored.sort(key=lambda item: item[0], reverse=True)

    total_params = sum(item[2] for item in scored)
    if total_params == 0:
        return result

    high_bpw = high_quant.bits_per_weight
    low_bpw = low_quant.bits_per_weight

    def _avg_bpw(high_params: int) -> float:
        """Average bpw when *high_params* parameters use ``high_quant``."""
        low_params = total_params - high_params
        return (high_params * high_bpw + low_params * low_bpw) / total_params

    # Find the largest cutoff k whose average still fits the target. A running
    # total keeps this linear. Stopping at the first k that fits would always
    # pick k = 0 whenever low_quant alone meets the target, and so never
    # promote any layer.
    best_k = 0
    best_high_params = 0
    running = 0
    for k, (_, _, n_params) in enumerate(scored, start=1):
        running += n_params
        if _avg_bpw(running) <= target_bpw:
            best_k = k
            best_high_params = running

    # Assign by position in the ranking rather than by name, so layers that
    # happen to share a name cannot be promoted beyond the cutoff.
    result["strategy"] = [
        {
            "layer": name,
            "dtype": (high_quant if idx < best_k else low_quant).value,
            "sensitivity": round(sens, 3),
            "n_params": n_params,
        }
        for idx, (sens, name, n_params) in enumerate(scored)
    ]
    result["estimated_avg_bpw"] = round(_avg_bpw(best_high_params), 2)
    result["n_high_precision_layers"] = best_k
    result["n_low_precision_layers"] = len(scored) - best_k
    return result
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _estimate_layer_sensitivity(name: str, position: int, total_layers: int) -> float:
    """Estimate how sensitive a layer is to quantization.

    The score combines a name-based base value with a position multiplier and
    is capped at ``1.0`` (1 = most sensitive, should keep high precision).

    Args:
        name: Layer name; matched case-insensitively against the fragments in
            ``_SENSITIVITY_KEYWORDS``.
        position: Zero-based index of the layer within the model.
        total_layers: Number of layers in the model.

    Returns:
        A score between ``0.5`` and ``1.0``. The base value never drops below
        ``0.5``, so keyword scores under ``0.5`` behave like ``0.5``.
    """
    name_lower = name.lower()

    # Use the most specific (longest) matching fragment so that e.g.
    # "attn_output" is not shadowed by the shorter "output"; equal-length
    # matches fall back to the higher score.
    matches = [
        (len(keyword), score)
        for keyword, score in _SENSITIVITY_KEYWORDS.items()
        if keyword in name_lower
    ]
    base = 0.5
    if matches:
        base = max(base, max(matches)[1])

    # Position multiplier: the first and last tenth of the stack are weighted
    # up. A model with at most one layer has no meaningful position.
    if total_layers <= 1:
        mult = 1.0
    else:
        frac = position / (total_layers - 1)
        if frac < 0.1:
            mult = _POSITION_CURVE["first_10pct"]
        elif frac > 0.9:
            mult = _POSITION_CURVE["last_10pct"]
        else:
            mult = _POSITION_CURVE["middle"]

    return min(base * mult, 1.0)
|