quantbenchx 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantbenchx/matrix.py ADDED
@@ -0,0 +1,289 @@
1
+ """Quantization format comparison matrix."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional
7
+
8
+
9
@dataclass
class QuantFormatSpec:
    """Describe a single quantization format.

    The record carries the nominal bit-width plus the block layout needed
    to work out how many bytes one weight really costs on disk.
    """

    # Short identifier of the format (used as the registry key).
    name: str
    # Nominal number of bits stored per weight.
    bits: int
    # Number of weights that share one scale; None for non-block formats.
    block_size: Optional[int] = None
    # False when a zero-point is stored alongside the scale.
    symmetric: bool = True
    # Free-form human-readable description.
    description: str = ""

    @property
    def bytes_per_weight(self) -> float:
        """Return the effective storage cost of one weight, in bytes.

        Block formats add 2 bytes (an FP16 scale) per block, and a further
        2 bytes per block for the zero-point when the format is asymmetric.
        Formats without a positive ``block_size`` cost exactly ``bits / 8``.
        """
        payload = self.bits / 8.0
        blocked = self.block_size is not None and self.block_size > 0
        if not blocked:
            return payload

        scale_cost = 2.0 / self.block_size  # FP16 scale amortised over the block
        # The zero-point costs the same as the scale when present.
        metadata_cost = scale_cost if self.symmetric else scale_cost + 2.0 / self.block_size
        return payload + metadata_cost
30
+
31
+
32
@dataclass
class FormatComparison:
    """Outcome of comparing format A against format B."""

    # Name of the first (numerator) format.
    format_a: str
    # Name of the second (denominator) format.
    format_b: str
    # Size of A divided by size of B.
    size_ratio: float
    # Accuracy estimate of A minus accuracy estimate of B.
    accuracy_delta: float
    # Speed estimate of A divided by speed estimate of B.
    speed_ratio: float
41
+
42
+
43
# Built-in format registry, keyed by each spec's own name. Insertion order
# is the order of the tuple below.
KNOWN_FORMATS: Dict[str, QuantFormatSpec] = {
    spec.name: spec
    for spec in (
        QuantFormatSpec("FP16", 16, None, True, "IEEE 754 half-precision floating point"),
        QuantFormatSpec("BF16", 16, None, True, "Brain floating point 16-bit"),
        QuantFormatSpec("INT8", 8, None, True, "8-bit integer quantization"),
        QuantFormatSpec(
            "INT4", 4, 128, True,
            "4-bit integer quantization with 128-element blocks",
        ),
        QuantFormatSpec("GPTQ", 4, 128, False, "GPTQ 4-bit with group size 128"),
        QuantFormatSpec(
            "AWQ", 4, 128, False,
            "Activation-aware Weight Quantization 4-bit",
        ),
        QuantFormatSpec(
            "GGUF_Q4_0", 4, 32, True,
            "GGUF Q4_0: 4-bit quantization, 32-element blocks",
        ),
        QuantFormatSpec(
            "GGUF_Q5_1", 5, 32, False,
            "GGUF Q5_1: 5-bit quantization with zero-point, 32-element blocks",
        ),
        QuantFormatSpec("NF4", 4, 64, True, "NormalFloat 4-bit (QLoRA)"),
    )
}
81
+
82
# Heuristic accuracy retention scores per bit-width (relative to FP16 = 1.0).
# Higher is better. Accounts for typical perplexity retention on LLMs.
_ACCURACY_BY_BITS: Dict[int, float] = {
    16: 1.0,
    8: 0.995,
    5: 0.98,
    4: 0.96,
    3: 0.92,
    2: 0.82,
}

# Heuristic inference speed multiplier relative to FP16 = 1.0.
# Lower bit counts run faster due to reduced memory bandwidth.
_SPEED_BY_BITS: Dict[int, float] = {
    16: 1.0,
    8: 1.6,
    5: 2.0,
    4: 2.3,
    3: 2.5,
    2: 2.7,
}


def _interpolate_by_bits(table: Dict[int, float], bits: float) -> float:
    """Look up ``bits`` in ``table``, interpolating linearly between entries.

    Bit-widths below the smallest or above the largest tabulated key are
    clamped to the nearest endpoint instead of raising, so formats such as
    a 32-bit or 1-bit spec still get an estimate.
    """
    if bits in table:
        return table[bits]

    known = sorted(table)
    if bits < known[0]:
        return table[known[0]]
    if bits > known[-1]:
        return table[known[-1]]

    # ``bits`` lies strictly between two tabulated widths here.
    lower = max(b for b in known if b < bits)
    upper = min(b for b in known if b > bits)
    t = (bits - lower) / (upper - lower)
    return table[lower] + t * (table[upper] - table[lower])


def _accuracy_estimate(fmt: QuantFormatSpec) -> float:
    """Heuristic accuracy retention for a format (0-1 scale, FP16=1.0).

    The base score comes from ``_ACCURACY_BY_BITS`` (interpolated, and
    clamped outside the tabulated range). Asymmetric block formats get a
    small bonus, capped at 1.0.
    """
    score = _interpolate_by_bits(_ACCURACY_BY_BITS, fmt.bits)
    # Block-quantized formats with asymmetric quantization are slightly more accurate
    if fmt.block_size is not None and not fmt.symmetric:
        score = min(1.0, score + 0.005)
    return score


def _speed_estimate(fmt: QuantFormatSpec) -> float:
    """Heuristic speed multiplier (relative to FP16 = 1.0x).

    Interpolated from ``_SPEED_BY_BITS`` and clamped to the table endpoints
    for bit-widths outside the tabulated range.
    """
    return _interpolate_by_bits(_SPEED_BY_BITS, fmt.bits)
140
+
141
+
142
def _model_size_gb(fmt: QuantFormatSpec, base_size_gb: float) -> float:
    """Scale an FP16 model size (GB) to the size it would have in ``fmt``."""
    fp16_bytes_per_weight = 2.0
    return base_size_gb * fmt.bytes_per_weight / fp16_bytes_per_weight
145
+
146
+
147
class ComparisonMatrix:
    """Registry of quantization formats with pairwise comparison helpers."""

    def __init__(self, formats: Optional[List[QuantFormatSpec]] = None) -> None:
        """Start from ``formats`` (keyed by name) or a copy of ``KNOWN_FORMATS``."""
        if formats is None:
            self._formats: Dict[str, QuantFormatSpec] = dict(KNOWN_FORMATS)
        else:
            self._formats = {}
            for spec in formats:
                self._formats[spec.name] = spec

    @property
    def formats(self) -> Dict[str, QuantFormatSpec]:
        """Return a shallow copy of the registered formats."""
        return self._formats.copy()

    def add_format(self, fmt: QuantFormatSpec) -> None:
        """Register ``fmt`` under its name, replacing any existing entry."""
        self._formats[fmt.name] = fmt

    def compare_pair(
        self,
        fmt_a: str,
        fmt_b: str,
        model_size_gb: float = 7.0,
    ) -> FormatComparison:
        """Compare format A against format B.

        Ratios are A / B: ``size_ratio`` below 1 means A is smaller and
        ``speed_ratio`` above 1 means A is faster. ``accuracy_delta`` is
        A minus B, so a positive value means A is more accurate. A ratio
        whose denominator is not positive is reported as infinity.

        Raises:
            KeyError: if either name is not registered.
        """
        for requested in (fmt_a, fmt_b):
            if requested not in self._formats:
                raise KeyError(f"Unknown format: {requested!r}")

        def _ratio(numerator: float, denominator: float) -> float:
            if denominator > 0:
                return numerator / denominator
            return float("inf")

        spec_a = self._formats[fmt_a]
        spec_b = self._formats[fmt_b]

        size_ratio = _ratio(
            _model_size_gb(spec_a, model_size_gb),
            _model_size_gb(spec_b, model_size_gb),
        )
        accuracy_delta = _accuracy_estimate(spec_a) - _accuracy_estimate(spec_b)
        speed_ratio = _ratio(_speed_estimate(spec_a), _speed_estimate(spec_b))

        return FormatComparison(
            format_a=fmt_a,
            format_b=fmt_b,
            size_ratio=round(size_ratio, 4),
            accuracy_delta=round(accuracy_delta, 6),
            speed_ratio=round(speed_ratio, 4),
        )

    def full_matrix(self) -> List[List[FormatComparison]]:
        """Compare every registered format with every other, in name order."""
        ordered = sorted(self._formats)
        grid: List[List[FormatComparison]] = []
        for row_name in ordered:
            grid.append([self.compare_pair(row_name, col_name) for col_name in ordered])
        return grid

    def rank_by(self, criterion: str = "size") -> List[QuantFormatSpec]:
        """Return the formats ordered best-first by ``criterion``.

        ``'size'`` sorts smallest first; ``'speed'`` and ``'accuracy'``
        sort highest estimate first.

        Raises:
            ValueError: for any other criterion.
        """
        specs = self._formats.values()
        if criterion == "size":
            return sorted(specs, key=lambda spec: spec.bytes_per_weight)
        if criterion == "speed":
            return sorted(specs, key=_speed_estimate, reverse=True)
        if criterion == "accuracy":
            return sorted(specs, key=_accuracy_estimate, reverse=True)
        raise ValueError(
            f"Unknown criterion: {criterion!r}. Choose 'size', 'speed', or 'accuracy'."
        )

    def recommend(self, constraints: Dict[str, Any]) -> List[QuantFormatSpec]:
        """Return the formats that satisfy every constraint, best first.

        Supported constraint keys:
            max_bits (int): maximum bit-width (default 32)
            min_accuracy (float): minimum accuracy retention, FP16=1.0 (default 0.0)
            min_speed (float): minimum speed multiplier, FP16=1.0 (default 0.0)
            max_size_gb (float): maximum model size in GB (default unlimited)
            base_size_gb (float): FP16 model size used for sizing (default 7.0)

        Results are ordered by accuracy, then speed, both descending.
        """
        lookup = constraints.get
        bit_cap = lookup("max_bits", 32)
        accuracy_floor = lookup("min_accuracy", 0.0)
        speed_floor = lookup("min_speed", 0.0)
        size_cap = lookup("max_size_gb", float("inf"))
        fp16_size = lookup("base_size_gb", 7.0)

        def _rejected(spec: QuantFormatSpec) -> bool:
            # Checks short-circuit in the same order as the constraint list.
            return (
                spec.bits > bit_cap
                or _accuracy_estimate(spec) < accuracy_floor
                or _speed_estimate(spec) < speed_floor
                or _model_size_gb(spec, fp16_size) > size_cap
            )

        shortlist = [spec for spec in self._formats.values() if not _rejected(spec)]
        shortlist.sort(
            key=lambda spec: (_accuracy_estimate(spec), _speed_estimate(spec)),
            reverse=True,
        )
        return shortlist
261
+
262
+
263
def format_comparison_table(matrix: List[List[FormatComparison]]) -> str:
    """Render a comparison matrix as an ASCII table.

    Each cell shows the size ratio of the row format versus the column
    format; cells comparing a format with itself show a dash marker.
    Column headers are taken from the first row's ``format_b`` values and
    row labels from each row's first ``format_a``. Cells are at least 8
    characters wide and grow so that the longest header or cell text keeps
    a two-character margin, which keeps header and data columns aligned.

    Returns ``"(empty matrix)"`` when there are no rows or the first row is
    empty; empty rows after the first are skipped.
    """
    if not matrix or not matrix[0]:
        return "(empty matrix)"

    rows = [row for row in matrix if row]
    column_names = [comp.format_b for comp in matrix[0]]
    row_labels = [row[0].format_a for row in rows]

    cell_texts = [
        [
            " --- " if comp.format_a == comp.format_b else f"{comp.size_ratio:.2f}x"
            for comp in row
        ]
        for row in rows
    ]

    col_width = max(len(label) for label in row_labels) + 2
    # Never narrower than the historical 8 columns, but wide enough that
    # long names (e.g. 9-character GGUF names) do not run into each other.
    widest = max(
        max(len(name) for name in column_names),
        max(len(text) for texts in cell_texts for text in texts),
    )
    cell_width = max(8, widest + 2)

    header = " " * col_width + "".join(name.center(cell_width) for name in column_names)
    lines = [header, "-" * len(header)]
    for label, texts in zip(row_labels, cell_texts):
        lines.append(
            label.ljust(col_width) + "".join(text.center(cell_width) for text in texts)
        )

    return "\n".join(lines)
@@ -0,0 +1,168 @@
1
+ """Perplexity-based quality scoring for quantized models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from dataclasses import dataclass, field
7
+ from typing import Dict, Sequence
8
+
9
+ from quantbenchx._types import ModelProfile
10
+
11
+
12
@dataclass
class PerplexityScore:
    """Outcome of one perplexity calculation."""

    # The perplexity value itself.
    value: float
    # Number of tokens the value was computed over.
    num_tokens: int
    # Summed log-likelihood of those tokens.
    log_likelihood: float
    # Flag recorded by the producer of the score.
    normalized: bool
20
+
21
+
22
@dataclass
class PerplexityDelta:
    """Estimated perplexity change going from an original to a quantized model."""

    # Bits per weight of the original model.
    original_bits: float
    # Bits per weight of the quantized model.
    quantized_bits: float
    # Estimated perplexity increase, in percent.
    estimated_ppl_increase_pct: float
    # Share of quality kept, in the range 0-1.
    quality_retention: float
    # Impact score per layer name; each instance gets its own dict.
    per_layer_impact: Dict[str, float] = field(default_factory=dict)
31
+
32
+
33
+ # ── Heuristic constants ───────────────────────────────────────────────
34
+
35
# Attention layers are treated as twice as sensitive as FFN layers.
_ATTN_SENSITIVITY = 2.0
_FFN_SENSITIVITY = 1.0

# Base rate used by the perplexity heuristics in this module.
_BASE_PPL_RATE = 0.04


def _layer_sensitivity(name: str) -> float:
    """Map a layer name to its sensitivity multiplier.

    Matching is case-insensitive and by substring. Attention markers are
    checked first, so a name containing both an attention marker and an
    embedding/output/norm marker gets the attention multiplier.
    """
    lowered = name.lower()

    attention_markers = ("attn", "attention", "self_attn", "q_proj", "k_proj", "v_proj", "o_proj")
    for marker in attention_markers:
        if marker in lowered:
            return _ATTN_SENSITIVITY

    boundary_markers = ("embed", "lm_head", "output", "norm")
    for marker in boundary_markers:
        if marker in lowered:
            # Highest multiplier of the three tiers.
            return _ATTN_SENSITIVITY * 1.5

    return _FFN_SENSITIVITY
51
+
52
+
53
def _bits_for_profile(profile: ModelProfile) -> float:
    """Return the profile's average bits per weight, or 16.0 if it is not positive."""
    measured = profile.quant.avg_bits_per_weight
    if measured > 0:
        return measured
    return 16.0
56
+
57
+
58
def _per_layer_impact(profile: ModelProfile, ref_bits: float) -> Dict[str, float]:
    """Score every layer of ``profile`` by its precision loss versus ``ref_bits``.

    A layer whose bit-width is not positive is treated as being at
    ``ref_bits`` (no loss). The score is the bit drop times the layer's
    sensitivity times the base rate, rounded to 6 decimals. Later layers
    overwrite earlier ones that share a name.
    """

    def _score(layer) -> float:
        layer_bits = layer.avg_bits_per_weight
        if layer_bits <= 0:
            layer_bits = ref_bits
        lost_bits = max(0.0, ref_bits - layer_bits)
        return round(lost_bits * _layer_sensitivity(layer.name) * _BASE_PPL_RATE, 6)

    return {layer.name: _score(layer) for layer in profile.layers}
68
+
69
+
70
def _ppl_increase_pct(original_bits: float, quantized_bits: float) -> float:
    """Estimate the perplexity increase (in percent) caused by losing precision.

    The value is ``base_rate * (2 ** (bit_drop / 4) - 1) * 100``: roughly
    linear for small drops and growing exponentially for large ones. No
    drop (or a quantized model with more bits) yields 0.0.
    """
    lost_bits = max(0.0, original_bits - quantized_bits)
    if not lost_bits:
        return 0.0
    # Every 4 bits lost counts as one doubling.
    doublings = lost_bits / 4.0
    return _BASE_PPL_RATE * (2.0 ** doublings - 1.0) * 100.0
81
+
82
+
83
+ # ── Public API ────────────────────────────────────────────────────────
84
+
85
+
86
def estimate_perplexity_delta(
    profile_original: ModelProfile,
    profile_quantized: ModelProfile,
) -> PerplexityDelta:
    """Estimate how much perplexity degrades from the original to the quantized model.

    The overall increase is derived from the two profiles' average
    bit-widths; quality retention is ``1 - increase / 100`` clamped to
    [0, 1]. Per-layer impacts are computed for the quantized profile
    against the original's bit-width.
    """
    reference_bits = _bits_for_profile(profile_original)
    compressed_bits = _bits_for_profile(profile_quantized)

    increase_pct = _ppl_increase_pct(reference_bits, compressed_bits)
    kept = 1.0 - increase_pct / 100.0
    kept = max(0.0, min(1.0, kept))

    layer_impacts = _per_layer_impact(profile_quantized, reference_bits)

    return PerplexityDelta(
        original_bits=round(reference_bits, 4),
        quantized_bits=round(compressed_bits, 4),
        estimated_ppl_increase_pct=round(increase_pct, 4),
        quality_retention=round(kept, 6),
        per_layer_impact=layer_impacts,
    )
110
+
111
+
112
def quality_score(profile: ModelProfile, reference_bits: float = 16.0) -> float:
    """Rate a model's quality against a reference precision.

    Returns 1.0 when the profile has at least ``reference_bits`` bits per
    weight. Otherwise returns ``exp(-0.06 * bit_drop ** 1.5)`` clamped to
    [0, 1] and rounded to 6 decimals.
    """
    effective_bits = _bits_for_profile(profile)
    if effective_bits >= reference_bits:
        return 1.0

    lost_bits = reference_bits - effective_bits
    # Smooth exponential decay in the number of bits lost.
    decayed = math.exp(-0.06 * (lost_bits ** 1.5))
    clamped = max(0.0, min(1.0, decayed))
    return round(clamped, 6)
125
+
126
+
127
def perplexity_from_logprobs(logprobs: Sequence[float]) -> PerplexityScore:
    """Compute perplexity from a sequence of token log-probabilities.

    Standard formula: ppl = exp( -1/N * Σ log p_i )

    An empty sequence yields an infinite perplexity with zero tokens. When
    the mean negative log-likelihood is too large for ``exp`` to represent
    as a float, the perplexity is likewise reported as infinity instead of
    raising ``OverflowError``.
    """
    n = len(logprobs)
    if n == 0:
        return PerplexityScore(value=float("inf"), num_tokens=0, log_likelihood=0.0, normalized=True)

    total_ll = sum(logprobs)
    avg_neg_ll = -total_ll / n
    try:
        ppl = math.exp(avg_neg_ll)
    except OverflowError:
        # math.exp raises for finite arguments above ~709.78.
        ppl = float("inf")

    return PerplexityScore(
        value=ppl,
        num_tokens=n,
        log_likelihood=total_ll,
        normalized=True,
    )
146
+
147
+
148
def format_quality_report(delta: PerplexityDelta) -> str:
    """Render ``delta`` as a multi-line plain-text report.

    The summary is followed, when per-layer impacts are present, by the ten
    highest-impact layers and a count of any layers left out.
    """
    report = ["Perplexity Quality Report", "=" * 40]
    report.append(f"Original precision : {delta.original_bits:.2f} bits")
    report.append(f"Quantized precision: {delta.quantized_bits:.2f} bits")
    report.append(f"Est. PPL increase : {delta.estimated_ppl_increase_pct:.2f}%")
    report.append(f"Quality retention : {delta.quality_retention:.4f}")

    if not delta.per_layer_impact:
        return "\n".join(report)

    ranked = sorted(
        delta.per_layer_impact.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    report += ["", "Per-layer impact:"]
    report.extend(f" {layer_name:40s} {value:.6f}" for layer_name, value in ranked[:10])

    omitted = len(ranked) - 10
    if omitted > 0:
        report.append(f" ... and {omitted} more layers")

    return "\n".join(report)
quantbenchx/predict.py ADDED
@@ -0,0 +1,125 @@
1
+ """Quality prediction — estimate perplexity delta and quality score from quantization params."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import List
7
+
8
+ from quantbenchx._types import ModelProfile, QualityEstimate
9
+
10
# Perplexity-delta estimates keyed by bits-per-weight.
# Per the original note: based on published quantization benchmarks
# (llama.cpp, GPTQ papers).
_PERPLEXITY_CURVE = {
    16.0: 0.0,
    8.5: 0.01,
    6.5: 0.03,
    5.5: 0.05,
    4.85: 0.08,
    4.5: 0.12,
    4.0: 0.18,
    3.9: 0.22,
    3.5: 0.35,
    3.35: 0.45,
    3.0: 0.65,
    2.5: 1.0,
    2.0: 1.8,
    1.5: 3.5,
}


def _interpolate_perplexity(bpw: float) -> float:
    """Return the expected perplexity delta for ``bpw`` bits per weight.

    Values at or beyond either end of the curve take the end point's
    delta; values in between are linearly interpolated between the two
    neighbouring points. If no segment matches, 0.5 is returned.
    """
    curve = sorted(_PERPLEXITY_CURVE.items(), reverse=True)  # highest bpw first

    top_bpw, top_delta = curve[0]
    if bpw >= top_bpw:
        return top_delta

    bottom_bpw, bottom_delta = curve[-1]
    if bpw <= bottom_bpw:
        return bottom_delta

    for (hi_bpw, hi_delta), (lo_bpw, lo_delta) in zip(curve, curve[1:]):
        if not (lo_bpw <= bpw <= hi_bpw):
            continue
        if hi_bpw != lo_bpw:
            weight = (bpw - lo_bpw) / (hi_bpw - lo_bpw)
        else:
            weight = 0.0
        return hi_delta * weight + lo_delta * (1 - weight)

    return 0.5  # fallback
47
+
48
+
49
def estimate_quality(profile: ModelProfile) -> QualityEstimate:
    """Estimate quantization quality from a model profile.

    Returns a QualityEstimate with the predicted perplexity delta, a
    quality score in [0, 1], a risk level ("low", "medium", "high" or
    "critical") and a list of recommendations.

    A non-positive average bits-per-weight is treated as unknown and
    assumed to be full precision (16.0). Likewise, an embedding/output/norm
    layer is only reported as sensitive when its bit-width is known
    (positive) and below 6.0.
    """
    bpw = profile.quant.avg_bits_per_weight
    if bpw <= 0:
        bpw = 16.0  # assume full precision if unknown

    ppl_delta = _interpolate_perplexity(bpw)

    # Quality score: 1.0 = perfect (no loss), 0.0 = terrible.
    # Exponential decay in the number of bits below 16.
    quality = math.exp(-0.15 * max(0, 16.0 - bpw))
    quality = max(0.0, min(1.0, quality))

    # Risk level from the predicted perplexity delta.
    if ppl_delta < 0.05:
        risk = "low"
    elif ppl_delta < 0.15:
        risk = "medium"
    elif ppl_delta < 0.5:
        risk = "high"
    else:
        risk = "critical"

    # Sensitive layers: embedding/output/norm layers stored at low precision.
    # A bit-width of 0 (or less) means "unknown", not "very low precision".
    sensitive_keywords = ("embed", "lm_head", "output", "norm")
    sensitive: List[str] = []
    for layer in profile.layers:
        name_lower = layer.name.lower()
        if not any(kw in name_lower for kw in sensitive_keywords):
            continue
        if 0 < layer.avg_bits_per_weight < 6.0:
            sensitive.append(layer.name)

    # Recommendations
    recs: List[str] = []
    if bpw < 3.0:
        recs.append("Very aggressive quantization — expect significant quality loss")
        recs.append("Consider Q4_K_M or higher for production use")
    elif bpw < 4.0:
        recs.append("Aggressive quantization — test on your specific workload")
    elif bpw < 5.0:
        recs.append("Good balance of size and quality for most use cases")

    if sensitive:
        recs.append(f"Sensitive layers at low precision: {', '.join(sensitive[:5])}")
        recs.append("Consider mixed-quant with higher precision for embed/output layers")

    if profile.quant.n_full_precision_layers == 0 and len(profile.tensors) > 10:
        recs.append("No full-precision layers detected — norm layers may benefit from FP16")

    total_params = profile.total_params
    if total_params > 0:
        param_b = total_params / 1e9
        size_gb = profile.size_gb
        recs.append(f"Model: ~{param_b:.1f}B params, {size_gb:.1f} GB at {bpw:.1f} bpw")

    return QualityEstimate(
        model_name=profile.name,
        method=profile.quant.method.value,
        avg_bits_per_weight=round(bpw, 2),
        estimated_perplexity_delta=round(ppl_delta, 4),
        quality_score=round(quality, 4),
        risk_level=risk,
        sensitive_layers=sensitive,
        recommendations=recs,
    )
118
+
119
+
120
def perplexity_delta(bpw: float) -> float:
    """Return the estimated perplexity increase for ``bpw`` bits per weight.

    This is the interpolated curve value rounded to 4 decimals.
    """
    estimate = _interpolate_perplexity(bpw)
    return round(estimate, 4)