quantbenchx 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantbenchx/__init__.py +132 -0
- quantbenchx/_types.py +220 -0
- quantbenchx/bandwidth.py +290 -0
- quantbenchx/cli.py +153 -0
- quantbenchx/compare.py +101 -0
- quantbenchx/imatrix.py +201 -0
- quantbenchx/layerwise.py +167 -0
- quantbenchx/matrix.py +289 -0
- quantbenchx/perplexity.py +168 -0
- quantbenchx/predict.py +125 -0
- quantbenchx/profile.py +301 -0
- quantbenchx/py.typed +0 -0
- quantbenchx/recommend.py +240 -0
- quantbenchx/report.py +171 -0
- quantbenchx-0.3.0.dist-info/METADATA +213 -0
- quantbenchx-0.3.0.dist-info/RECORD +17 -0
- quantbenchx-0.3.0.dist-info/WHEEL +4 -0
quantbenchx/matrix.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""Quantization format comparison matrix."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class QuantFormatSpec:
    """Describe one quantization format: name, bit-width and block layout."""

    # Identifier of the format.
    name: str
    # Bits stored per weight, before any per-block metadata.
    bits: int
    # Number of weights sharing one scale; ``None`` for non-block formats.
    block_size: Optional[int] = None
    # ``False`` means each block also carries a zero-point.
    symmetric: bool = True
    # Free-form, human-readable description.
    description: str = ""

    @property
    def bytes_per_weight(self) -> float:
        """Return the storage cost of a single weight in bytes.

        Non-block formats (``block_size`` is ``None`` or not positive) cost
        exactly ``bits / 8``.  Block formats additionally amortise a 2-byte
        FP16 scale over the block, and asymmetric block formats amortise a
        second 2-byte zero-point.
        """
        payload = self.bits / 8.0
        is_blocked = self.block_size is not None and self.block_size > 0
        if not is_blocked:
            return payload

        # One FP16 scale (2 bytes) shared by every weight in the block.
        metadata = 2.0 / self.block_size
        if not self.symmetric:
            # Asymmetric formats store a zero-point alongside the scale.
            metadata += 2.0 / self.block_size
        return payload + metadata
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class FormatComparison:
    """Outcome of comparing quantization format A against format B."""

    # Name of the first format (the "A" side of every ratio/delta).
    format_a: str
    # Name of the second format (the "B" side).
    format_b: str
    # Model size of A divided by model size of B.
    size_ratio: float
    # Accuracy estimate of A minus accuracy estimate of B.
    accuracy_delta: float
    # Speed estimate of A divided by speed estimate of B.
    speed_ratio: float
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Built-in registry of common quantization formats, keyed by format name.
# Positional arguments are: name, bits, block_size, symmetric, description.
KNOWN_FORMATS: Dict[str, QuantFormatSpec] = {
    spec.name: spec
    for spec in (
        QuantFormatSpec(
            "FP16", 16, None, True,
            "IEEE 754 half-precision floating point",
        ),
        QuantFormatSpec(
            "BF16", 16, None, True,
            "Brain floating point 16-bit",
        ),
        QuantFormatSpec(
            "INT8", 8, None, True,
            "8-bit integer quantization",
        ),
        QuantFormatSpec(
            "INT4", 4, 128, True,
            "4-bit integer quantization with 128-element blocks",
        ),
        QuantFormatSpec(
            "GPTQ", 4, 128, False,
            "GPTQ 4-bit with group size 128",
        ),
        QuantFormatSpec(
            "AWQ", 4, 128, False,
            "Activation-aware Weight Quantization 4-bit",
        ),
        QuantFormatSpec(
            "GGUF_Q4_0", 4, 32, True,
            "GGUF Q4_0: 4-bit quantization, 32-element blocks",
        ),
        QuantFormatSpec(
            "GGUF_Q5_1", 5, 32, False,
            "GGUF Q5_1: 5-bit quantization with zero-point, 32-element blocks",
        ),
        QuantFormatSpec(
            "NF4", 4, 64, True,
            "NormalFloat 4-bit (QLoRA)",
        ),
    )
}
|
|
81
|
+
|
|
82
|
+
# Heuristic accuracy retention scores per bit-width (relative to FP16 = 1.0).
# Higher is better. Accounts for typical perplexity retention on LLMs.
_ACCURACY_BY_BITS: Dict[int, float] = {
    16: 1.0,
    8: 0.995,
    5: 0.98,
    4: 0.96,
    3: 0.92,
    2: 0.82,
}

# Heuristic inference speed multiplier relative to FP16 = 1.0.
# Lower bit counts run faster due to reduced memory bandwidth.
_SPEED_BY_BITS: Dict[int, float] = {
    16: 1.0,
    8: 1.6,
    5: 2.0,
    4: 2.3,
    3: 2.5,
    2: 2.7,
}


def _interpolate_by_bits(table: Dict[int, float], bits: float) -> float:
    """Look up *bits* in *table*, interpolating linearly between entries.

    Args:
        table: Mapping of bit-width to heuristic value; must be non-empty.
        bits: Bit-width to evaluate.

    Returns:
        The exact table value when *bits* is a key, a linear interpolation
        between the two neighbouring keys when it falls between them, and the
        nearest end value when it lies outside the table's range.
    """
    if bits in table:
        return table[bits]

    # Clamp out-of-range widths (e.g. a 32-bit or 1-bit format) to the nearest
    # known point; searching for a neighbour on the missing side would call
    # max()/min() on an empty sequence and raise ValueError.
    smallest, largest = min(table), max(table)
    if bits <= smallest:
        return table[smallest]
    if bits >= largest:
        return table[largest]

    lower = max(b for b in table if b < bits)
    upper = min(b for b in table if b > bits)
    t = (bits - lower) / (upper - lower)
    return table[lower] + t * (table[upper] - table[lower])


def _accuracy_estimate(fmt: QuantFormatSpec) -> float:
    """Heuristic accuracy retention for a format (0-1 scale, FP16=1.0).

    The base score comes from ``_ACCURACY_BY_BITS``; bit-widths outside that
    table are clamped to its nearest entry.  Block formats with asymmetric
    quantization receive a +0.005 bonus, capped at 1.0.
    """
    score = _interpolate_by_bits(_ACCURACY_BY_BITS, fmt.bits)
    # Block-quantized formats with asymmetric quantization are slightly more accurate
    if fmt.block_size is not None and not fmt.symmetric:
        score = min(1.0, score + 0.005)
    return score


def _speed_estimate(fmt: QuantFormatSpec) -> float:
    """Heuristic speed multiplier (relative to FP16 = 1.0x).

    The value comes from ``_SPEED_BY_BITS``; bit-widths outside that table are
    clamped to its nearest entry, so formats wider than 16 bits are reported
    at the FP16 speed rather than extrapolated.
    """
    return _interpolate_by_bits(_SPEED_BY_BITS, fmt.bits)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _model_size_gb(fmt: QuantFormatSpec, base_size_gb: float) -> float:
    """Scale an FP16 model size to the size it would have in *fmt*.

    Args:
        fmt: Format whose ``bytes_per_weight`` determines the scaling.
        base_size_gb: Size of the model in GB when stored as FP16.

    Returns:
        The estimated size in GB for *fmt*.
    """
    fp16_bytes_per_weight = 2.0
    scaled = base_size_gb * fmt.bytes_per_weight
    return scaled / fp16_bytes_per_weight
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class ComparisonMatrix:
    """Registry of quantization formats with pairwise comparison helpers."""

    def __init__(self, formats: Optional[List[QuantFormatSpec]] = None) -> None:
        """Register *formats*, or a copy of ``KNOWN_FORMATS`` when omitted.

        An explicitly passed empty list yields an empty registry.
        """
        if formats is None:
            self._formats: Dict[str, QuantFormatSpec] = dict(KNOWN_FORMATS)
        else:
            self._formats = {spec.name: spec for spec in formats}

    @property
    def formats(self) -> Dict[str, QuantFormatSpec]:
        """Return a shallow copy of the name-to-spec registry."""
        return {**self._formats}

    def add_format(self, fmt: QuantFormatSpec) -> None:
        """Register *fmt* under its name, replacing any existing entry."""
        self._formats[fmt.name] = fmt

    def compare_pair(
        self,
        fmt_a: str,
        fmt_b: str,
        model_size_gb: float = 7.0,
    ) -> FormatComparison:
        """Compare two registered formats on size, accuracy and speed.

        Every figure is "A relative to B": ``size_ratio`` below 1 means A is
        smaller, ``accuracy_delta`` above 0 means A is more accurate, and
        ``speed_ratio`` above 1 means A is faster.

        Args:
            fmt_a: Name of the first format.
            fmt_b: Name of the second format.
            model_size_gb: FP16 size of the model used for the size estimate.

        Raises:
            KeyError: If either name is not registered.
        """
        # Validate A before B so the error names the first unknown format.
        for requested in (fmt_a, fmt_b):
            if requested not in self._formats:
                raise KeyError(f"Unknown format: {requested!r}")

        spec_a = self._formats[fmt_a]
        spec_b = self._formats[fmt_b]

        gb_a = _model_size_gb(spec_a, model_size_gb)
        gb_b = _model_size_gb(spec_b, model_size_gb)
        if gb_b > 0:
            size_ratio = gb_a / gb_b
        else:
            # A non-positive denominator is reported as an infinite ratio.
            size_ratio = float("inf")

        accuracy_a = _accuracy_estimate(spec_a)
        accuracy_b = _accuracy_estimate(spec_b)

        rate_a = _speed_estimate(spec_a)
        rate_b = _speed_estimate(spec_b)
        if rate_b > 0:
            speed_ratio = rate_a / rate_b
        else:
            speed_ratio = float("inf")

        return FormatComparison(
            format_a=fmt_a,
            format_b=fmt_b,
            size_ratio=round(size_ratio, 4),
            accuracy_delta=round(accuracy_a - accuracy_b, 6),
            speed_ratio=round(speed_ratio, 4),
        )

    def full_matrix(self) -> List[List[FormatComparison]]:
        """Build the N×N grid of comparisons, rows and columns sorted by name."""
        ordered = sorted(self._formats)
        grid: List[List[FormatComparison]] = []
        for row_name in ordered:
            grid.append([self.compare_pair(row_name, col_name) for col_name in ordered])
        return grid

    def rank_by(self, criterion: str = "size") -> List[QuantFormatSpec]:
        """Return all formats ordered best-first for *criterion*.

        ``'size'`` sorts by ascending bytes per weight; ``'speed'`` and
        ``'accuracy'`` sort by descending heuristic estimate.

        Raises:
            ValueError: If *criterion* is not one of the three supported names.
        """
        if criterion == "size":
            sort_key, descending = (lambda spec: spec.bytes_per_weight), False
        elif criterion == "speed":
            sort_key, descending = _speed_estimate, True
        elif criterion == "accuracy":
            sort_key, descending = _accuracy_estimate, True
        else:
            raise ValueError(
                f"Unknown criterion: {criterion!r}. Choose 'size', 'speed', or 'accuracy'."
            )
        return sorted(self._formats.values(), key=sort_key, reverse=descending)

    def recommend(self, constraints: Dict[str, Any]) -> List[QuantFormatSpec]:
        """Return the formats that satisfy every given constraint.

        Supported constraint keys (all optional):
            max_bits (int): maximum bit-width (default 32)
            min_accuracy (float): minimum accuracy retention, FP16=1.0 (default 0.0)
            min_speed (float): minimum speed multiplier, FP16=1.0 (default 0.0)
            max_size_gb (float): maximum model size in GB (default unbounded)
            base_size_gb (float): FP16 model size used for the size check (default 7.0)

        The result is sorted by accuracy descending, then speed descending.
        """
        bits_cap = constraints.get("max_bits", 32)
        accuracy_floor = constraints.get("min_accuracy", 0.0)
        speed_floor = constraints.get("min_speed", 0.0)
        size_cap = constraints.get("max_size_gb", float("inf"))
        fp16_size = constraints.get("base_size_gb", 7.0)

        def passes(spec: QuantFormatSpec) -> bool:
            # Checks run cheapest-first and stop at the first violation.
            if spec.bits > bits_cap:
                return False
            if _accuracy_estimate(spec) < accuracy_floor:
                return False
            if _speed_estimate(spec) < speed_floor:
                return False
            if _model_size_gb(spec, fp16_size) > size_cap:
                return False
            return True

        matches = [spec for spec in self._formats.values() if passes(spec)]
        matches.sort(
            key=lambda spec: (_accuracy_estimate(spec), _speed_estimate(spec)),
            reverse=True,
        )
        return matches
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def format_comparison_table(matrix: List[List[FormatComparison]]) -> str:
    """Render a comparison matrix as an ASCII table.

    Each cell shows the size ratio of the row format vs the column format;
    cells comparing a format with itself show ``---``.

    Args:
        matrix: Rows of comparisons, where every comparison in a row shares
            the same ``format_a``.  Column headers are taken from the
            ``format_b`` of the first row.

    Returns:
        The table as a newline-joined string, or ``"(empty matrix)"`` when
        there are no rows or the first row is empty.
    """
    if not matrix or not matrix[0]:
        return "(empty matrix)"

    row_names = [row[0].format_a for row in matrix]
    col_names = [comp.format_b for comp in matrix[0]]

    col_width = max(len(n) for n in row_names) + 2
    # str.center never truncates, so a fixed 8-character cell lets longer
    # names (e.g. "GGUF_Q4_0") run together and push later headers out of
    # line with their cells.  Keep 8 as the minimum and grow when needed.
    cell_width = max(8, max(len(n) for n in col_names) + 2)

    header = " " * col_width + "".join(n.center(cell_width) for n in col_names)
    separator = "-" * len(header)

    lines = [header, separator]
    for name, row in zip(row_names, matrix):
        cells = [
            " --- ".center(cell_width)
            if comp.format_a == comp.format_b
            else f"{comp.size_ratio:.2f}x".center(cell_width)
            for comp in row
        ]
        lines.append(name.ljust(col_width) + "".join(cells))

    return "\n".join(lines)
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Perplexity-based quality scoring for quantized models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Dict, Sequence
|
|
8
|
+
|
|
9
|
+
from quantbenchx._types import ModelProfile
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class PerplexityScore:
    """Perplexity value together with the statistics it was derived from."""

    # The perplexity itself.
    value: float
    # Number of tokens that contributed to the score.
    num_tokens: int
    # Sum of the token log-probabilities.
    log_likelihood: float
    # Flag describing whether the score is normalized.
    normalized: bool
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class PerplexityDelta:
    """Heuristic estimate of the quality change from original to quantized model."""

    # Average bits per weight of the original model.
    original_bits: float
    # Average bits per weight of the quantized model.
    quantized_bits: float
    # Estimated perplexity increase, in percent.
    estimated_ppl_increase_pct: float
    # Retained quality on a 0-1 scale.
    quality_retention: float
    # Impact score per layer name; each instance gets its own empty dict by default.
    per_layer_impact: Dict[str, float] = field(default_factory=dict)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ── Heuristic constants ───────────────────────────────────────────────

# Sensitivity multipliers: attention layers weigh twice as much as FFN layers.
_ATTN_SENSITIVITY = 2.0
_FFN_SENSITIVITY = 1.0

# Base penalty rate, expressed as a fraction (0.04 == 4 %).  It is applied per
# bit lost in the per-layer impact and scaled to percent in _ppl_increase_pct.
_BASE_PPL_RATE = 0.04


def _layer_sensitivity(name: str) -> float:
    """Classify a layer by name and return its sensitivity multiplier.

    Matching is case-insensitive and substring-based.  Attention keywords are
    tested first, so a name containing both an attention keyword and an
    embedding/output/norm keyword is treated as attention.
    """
    lowered = name.lower()

    attention_markers = ("attn", "attention", "self_attn", "q_proj", "k_proj", "v_proj", "o_proj")
    for marker in attention_markers:
        if marker in lowered:
            return _ATTN_SENSITIVITY

    boundary_markers = ("embed", "lm_head", "output", "norm")
    for marker in boundary_markers:
        if marker in lowered:
            # Embedding, output and norm layers are rated the most sensitive.
            return _ATTN_SENSITIVITY * 1.5

    return _FFN_SENSITIVITY
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _bits_for_profile(profile: ModelProfile) -> float:
    """Return the profile's average bits per weight, or 16.0 if it is not positive."""
    reported = profile.quant.avg_bits_per_weight
    if reported > 0:
        return reported
    # A non-positive value is treated as "unknown" and mapped to 16 bits.
    return 16.0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _per_layer_impact(profile: ModelProfile, ref_bits: float) -> Dict[str, float]:
    """Score every layer of *profile* by the precision it lost against *ref_bits*.

    A layer's impact is its bit drop (never negative) times its name-based
    sensitivity times ``_BASE_PPL_RATE``, rounded to 6 decimals.  Layers whose
    bits-per-weight is not positive are assumed to sit at *ref_bits*, giving
    an impact of zero.  Results are keyed by layer name.
    """

    def impact_of(layer) -> float:
        raw_bits = layer.avg_bits_per_weight
        layer_bits = ref_bits if raw_bits <= 0 else raw_bits
        lost_bits = max(0.0, ref_bits - layer_bits)
        sensitivity = _layer_sensitivity(layer.name)
        return round(lost_bits * sensitivity * _BASE_PPL_RATE, 6)

    return {layer.name: impact_of(layer) for layer in profile.layers}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _ppl_increase_pct(original_bits: float, quantized_bits: float) -> float:
    """Estimate the perplexity increase, in percent, caused by losing precision.

    The heuristic is exponential in the number of bits lost:
    ``rate * (2 ** (bit_drop / 4) - 1) * 100``, so the penalty term doubles
    with every 4 bits dropped.  No drop (or a gain in precision) yields 0.0.
    """
    lost_bits = original_bits - quantized_bits
    if not lost_bits > 0.0:
        return 0.0
    # Growth factor: 0 when nothing is lost, 1 after a 4-bit drop, 3 after 8 bits.
    growth = 2.0 ** (lost_bits / 4.0) - 1.0
    return _BASE_PPL_RATE * growth * 100.0
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ── Public API ────────────────────────────────────────────────────────
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def estimate_perplexity_delta(
    profile_original: ModelProfile,
    profile_quantized: ModelProfile,
) -> PerplexityDelta:
    """Estimate how much quality a quantized model loses against its original.

    The overall perplexity increase is derived from the difference in average
    bits per weight; quality retention is ``1 - increase / 100`` clamped to
    [0, 1].  Per-layer impacts are computed for the quantized profile's layers
    using the original model's precision as the reference.

    Args:
        profile_original: Profile of the unquantized (reference) model.
        profile_quantized: Profile of the quantized model.

    Returns:
        A ``PerplexityDelta`` with rounded summary figures and per-layer impacts.
    """
    reference_bits = _bits_for_profile(profile_original)
    reduced_bits = _bits_for_profile(profile_quantized)

    increase_pct = _ppl_increase_pct(reference_bits, reduced_bits)
    kept_fraction = min(1.0, 1.0 - increase_pct / 100.0)
    kept_fraction = max(0.0, kept_fraction)

    layer_impacts = _per_layer_impact(profile_quantized, reference_bits)

    return PerplexityDelta(
        original_bits=round(reference_bits, 4),
        quantized_bits=round(reduced_bits, 4),
        estimated_ppl_increase_pct=round(increase_pct, 4),
        quality_retention=round(kept_fraction, 6),
        per_layer_impact=layer_impacts,
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def quality_score(profile: ModelProfile, reference_bits: float = 16.0) -> float:
    """Rate a model's quality in [0, 1] against a reference precision.

    A model at or above *reference_bits* scores exactly 1.0.  Below that the
    score decays as ``exp(-0.06 * bit_drop ** 1.5)`` and is rounded to 6
    decimals, approaching 0.0 for severe precision loss.
    """
    effective_bits = _bits_for_profile(profile)
    if effective_bits >= reference_bits:
        return 1.0

    lost_bits = reference_bits - effective_bits
    # The 1.5 exponent keeps small drops cheap while penalising large ones heavily.
    decayed = math.exp(-0.06 * (lost_bits ** 1.5))
    clamped = max(0.0, min(1.0, decayed))
    return round(clamped, 6)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def perplexity_from_logprobs(logprobs: Sequence[float]) -> PerplexityScore:
    """Compute perplexity from a sequence of token log-probabilities.

    Standard formula: ppl = exp( -1/N * Σ log p_i )

    Args:
        logprobs: Natural-log probabilities, one per token.

    Returns:
        A ``PerplexityScore``.  An empty sequence yields an infinite
        perplexity with zero tokens.  When the average negative
        log-likelihood is too large for ``exp`` to represent as a float, the
        perplexity is reported as infinity instead of raising.
    """
    n = len(logprobs)
    if n == 0:
        return PerplexityScore(value=float("inf"), num_tokens=0, log_likelihood=0.0, normalized=True)

    total_ll = sum(logprobs)
    avg_neg_ll = -total_ll / n
    try:
        ppl = math.exp(avg_neg_ll)
    except OverflowError:
        # math.exp raises once its argument exceeds ~709.78 (e.g. a token
        # with log-probability -1000); such a sequence is effectively
        # infinitely perplexing, matching what exp(inf) already returns.
        ppl = float("inf")

    return PerplexityScore(
        value=ppl,
        num_tokens=n,
        log_likelihood=total_ll,
        normalized=True,
    )
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def format_quality_report(delta: PerplexityDelta) -> str:
    """Render *delta* as a multi-line, human-readable text report.

    The summary block is always present.  When per-layer impacts exist, the
    ten largest are listed in descending order, followed by a count of the
    layers that were left out.
    """
    report = ["Perplexity Quality Report", "=" * 40]
    report.append(f"Original precision : {delta.original_bits:.2f} bits")
    report.append(f"Quantized precision: {delta.quantized_bits:.2f} bits")
    report.append(f"Est. PPL increase : {delta.estimated_ppl_increase_pct:.2f}%")
    report.append(f"Quality retention : {delta.quality_retention:.4f}")

    impacts = delta.per_layer_impact
    if not impacts:
        return "\n".join(report)

    ranked = sorted(impacts.items(), key=lambda entry: entry[1], reverse=True)
    shown = ranked[:10]
    hidden = len(ranked) - 10

    report += ["", "Per-layer impact:"]
    report.extend(f" {name:40s} {impact:.6f}" for name, impact in shown)
    if hidden > 0:
        report.append(f" ... and {hidden} more layers")

    return "\n".join(report)
|
quantbenchx/predict.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Quality prediction — estimate perplexity delta and quality score from quantization params."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
from quantbenchx._types import ModelProfile, QualityEstimate
|
|
9
|
+
|
|
10
|
+
# Perplexity delta at selected bits-per-weight values.  Per the original
# author's note these are empirical figures based on published quantization
# benchmarks (llama.cpp, GPTQ papers).
_PERPLEXITY_CURVE = {
    16.0: 0.0,
    8.5: 0.01,
    6.5: 0.03,
    5.5: 0.05,
    4.85: 0.08,
    4.5: 0.12,
    4.0: 0.18,
    3.9: 0.22,
    3.5: 0.35,
    3.35: 0.45,
    3.0: 0.65,
    2.5: 1.0,
    2.0: 1.8,
    1.5: 3.5,
}


def _interpolate_perplexity(bpw: float) -> float:
    """Return the expected perplexity delta for *bpw* bits per weight.

    Values between two curve points are interpolated linearly; values beyond
    either end of the curve take the end point's delta.  An input that matches
    no segment at all (e.g. NaN) falls back to 0.5.
    """
    knots = sorted(_PERPLEXITY_CURVE.items(), reverse=True)
    top_bits, top_delta = knots[0]
    bottom_bits, bottom_delta = knots[-1]

    if bpw >= top_bits:
        return top_delta
    if bpw <= bottom_bits:
        return bottom_delta

    # Walk adjacent pairs from the highest precision downwards.
    for (hi_bits, hi_delta), (lo_bits, lo_delta) in zip(knots, knots[1:]):
        if not (lo_bits <= bpw <= hi_bits):
            continue
        if hi_bits == lo_bits:
            weight = 0.0
        else:
            weight = (bpw - lo_bits) / (hi_bits - lo_bits)
        return hi_delta * weight + lo_delta * (1 - weight)

    return 0.5  # fallback
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def estimate_quality(profile: ModelProfile) -> QualityEstimate:
    """Estimate quantization quality from a model profile.

    Args:
        profile: Profile of the (quantized) model to assess.

    Returns:
        A ``QualityEstimate`` with the predicted perplexity delta, a quality
        score in [0, 1], a risk level (``"low"``, ``"medium"``, ``"high"`` or
        ``"critical"``), the sensitive layers found at low precision, and a
        list of textual recommendations.
    """
    bpw = profile.quant.avg_bits_per_weight
    if bpw <= 0:
        # A non-positive average means the precision is unknown; assume full
        # precision, as the perplexity helpers in this package do.
        bpw = 16.0

    ppl_delta = _interpolate_perplexity(bpw)

    # Quality score: 1.0 = perfect (no loss), 0.0 = terrible.
    # Exponential decay in the number of bits dropped below 16.
    quality = math.exp(-0.15 * max(0, 16.0 - bpw))
    quality = max(0.0, min(1.0, quality))

    # Risk level, bucketed by predicted perplexity delta.
    if ppl_delta < 0.05:
        risk = "low"
    elif ppl_delta < 0.15:
        risk = "medium"
    elif ppl_delta < 0.5:
        risk = "high"
    else:
        risk = "critical"

    # Sensitive layers: embedding/output/norm layers stored below 6 bits.
    # Layers whose precision is unknown (non-positive bpw) are skipped rather
    # than being reported as low precision.
    sensitive: List[str] = []
    for layer in profile.layers:
        name_lower = layer.name.lower()
        if not any(kw in name_lower for kw in ("embed", "lm_head", "output", "norm")):
            continue
        if 0 < layer.avg_bits_per_weight < 6.0:
            sensitive.append(layer.name)

    # Recommendations
    recs: List[str] = []
    if bpw < 3.0:
        recs.append("Very aggressive quantization — expect significant quality loss")
        recs.append("Consider Q4_K_M or higher for production use")
    elif bpw < 4.0:
        recs.append("Aggressive quantization — test on your specific workload")
    elif bpw < 5.0:
        recs.append("Good balance of size and quality for most use cases")

    if sensitive:
        # Only the first five names are listed to keep the message short.
        recs.append(f"Sensitive layers at low precision: {', '.join(sensitive[:5])}")
        recs.append("Consider mixed-quant with higher precision for embed/output layers")

    if profile.quant.n_full_precision_layers == 0 and len(profile.tensors) > 10:
        recs.append("No full-precision layers detected — norm layers may benefit from FP16")

    total_params = profile.total_params
    if total_params > 0:
        param_b = total_params / 1e9
        size_gb = profile.size_gb
        recs.append(f"Model: ~{param_b:.1f}B params, {size_gb:.1f} GB at {bpw:.1f} bpw")

    return QualityEstimate(
        model_name=profile.name,
        method=profile.quant.method.value,
        avg_bits_per_weight=round(bpw, 2),
        estimated_perplexity_delta=round(ppl_delta, 4),
        quality_score=round(quality, 4),
        risk_level=risk,
        sensitive_layers=sensitive,
        recommendations=recs,
    )
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def perplexity_delta(bpw: float) -> float:
    """Return the estimated perplexity increase at *bpw* bits per weight.

    The value is read off the module's perplexity curve and rounded to four
    decimal places.
    """
    estimate = _interpolate_perplexity(bpw)
    return round(estimate, 4)
|