quantbenchx 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantbenchx/__init__.py +132 -0
- quantbenchx/_types.py +220 -0
- quantbenchx/bandwidth.py +290 -0
- quantbenchx/cli.py +153 -0
- quantbenchx/compare.py +101 -0
- quantbenchx/imatrix.py +201 -0
- quantbenchx/layerwise.py +167 -0
- quantbenchx/matrix.py +289 -0
- quantbenchx/perplexity.py +168 -0
- quantbenchx/predict.py +125 -0
- quantbenchx/profile.py +301 -0
- quantbenchx/py.typed +0 -0
- quantbenchx/recommend.py +240 -0
- quantbenchx/report.py +171 -0
- quantbenchx-0.3.0.dist-info/METADATA +213 -0
- quantbenchx-0.3.0.dist-info/RECORD +17 -0
- quantbenchx-0.3.0.dist-info/WHEEL +4 -0
quantbenchx/profile.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Profile quantized models from GGUF, safetensors, or dict metadata."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import struct
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, BinaryIO, Dict, List
|
|
9
|
+
|
|
10
|
+
from quantbenchx._types import (
|
|
11
|
+
DType,
|
|
12
|
+
LayerInfo,
|
|
13
|
+
ModelProfile,
|
|
14
|
+
QuantbenchError,
|
|
15
|
+
QuantFormat,
|
|
16
|
+
QuantMethod,
|
|
17
|
+
QuantProfile,
|
|
18
|
+
TensorInfo,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# ── GGUF constants ──
# The first four bytes of a GGUF file are the ASCII characters "GGUF".
# Read as a little-endian uint32 (which is how profile_gguf reads them),
# bytes 0x47 0x47 0x55 0x46 give 0x46554747.
GGUF_MAGIC = 0x46554747  # b"GGUF" interpreted as little-endian uint32
|
|
23
|
+
# Maps the per-tensor type id stored in a GGUF tensor-info record (ggml's
# ``enum ggml_type``) to a DType.  These are NOT llama.cpp file-type
# (``LLAMA_FTYPE_*``) ids: ggml has exactly one tensor type per K-quant width
# (Q3_K=11, Q4_K=12, Q5_K=13, Q6_K=14), with no _S/_M/_L variants.
# NOTE(review): the single-width K-quant types are mapped to the ``_S`` members
# of DType on the assumption that those carry the plain block bit-width —
# confirm against quantbenchx._types.
# Ids without a DType counterpart (e.g. Q8_1=9, Q8_K=15, integer types 24-27,
# F64=28) are intentionally absent; lookups fall back to DType.UNKNOWN.
GGUF_DTYPE_MAP = {
    0: DType.F32,
    1: DType.F16,
    2: DType.Q4_0,
    3: DType.Q4_1,
    6: DType.Q5_0,
    7: DType.Q5_1,
    8: DType.Q8_0,
    10: DType.Q2_K,
    11: DType.Q3_K_S,
    12: DType.Q4_K_S,
    13: DType.Q5_K_S,
    14: DType.Q6_K,
    16: DType.IQ2_XXS,
    18: DType.IQ3_XXS,
    19: DType.IQ1_S,
    23: DType.IQ4_XS,
    30: DType.BF16,
}
|
|
32
|
+
|
|
33
|
+
# Type tags for GGUF metadata values
|
|
34
|
+
_GGUF_TYPE_UINT8 = 0
|
|
35
|
+
_GGUF_TYPE_INT8 = 1
|
|
36
|
+
_GGUF_TYPE_UINT16 = 2
|
|
37
|
+
_GGUF_TYPE_INT16 = 3
|
|
38
|
+
_GGUF_TYPE_UINT32 = 4
|
|
39
|
+
_GGUF_TYPE_INT32 = 5
|
|
40
|
+
_GGUF_TYPE_FLOAT32 = 6
|
|
41
|
+
_GGUF_TYPE_BOOL = 7
|
|
42
|
+
_GGUF_TYPE_STRING = 8
|
|
43
|
+
_GGUF_TYPE_ARRAY = 9
|
|
44
|
+
_GGUF_TYPE_UINT64 = 10
|
|
45
|
+
_GGUF_TYPE_INT64 = 11
|
|
46
|
+
_GGUF_TYPE_FLOAT64 = 12
|
|
47
|
+
|
|
48
|
+
# safetensors dtype sizes, keyed by the dtype string found in the file header.
_SAFETENSORS_DTYPE_BPW = {
    "F32": 32,
    "F16": 16,
    "BF16": 16,
    "I64": 64,
    "I32": 32,
    "I16": 16,
    "I8": 8,
    "U8": 8,
    "BOOL": 1,
    "F8_E4M3": 8,
    "F8_E5M2": 8,
}
# safetensors dtype string -> DType.  Both 8-bit integer storage types are
# reported as Q8_0; dtype strings not listed here are looked up with a
# DType.UNKNOWN default by profile_safetensors.
_SAFETENSORS_DTYPE_MAP = {
    "F32": DType.F32,
    "F16": DType.F16,
    "BF16": DType.BF16,
    "I8": DType.Q8_0,
    "U8": DType.Q8_0,
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _read_gguf_string(f: BinaryIO) -> str:
|
|
60
|
+
"""Read a GGUF string (uint64 length + bytes)."""
|
|
61
|
+
length = struct.unpack("<Q", f.read(8))[0]
|
|
62
|
+
return f.read(length).decode("utf-8", errors="replace")
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# struct format for every fixed-width numeric metadata tag.
_GGUF_SCALAR_FORMATS = {
    _GGUF_TYPE_UINT8: "<B",
    _GGUF_TYPE_INT8: "<b",
    _GGUF_TYPE_UINT16: "<H",
    _GGUF_TYPE_INT16: "<h",
    _GGUF_TYPE_UINT32: "<I",
    _GGUF_TYPE_INT32: "<i",
    _GGUF_TYPE_FLOAT32: "<f",
    _GGUF_TYPE_UINT64: "<Q",
    _GGUF_TYPE_INT64: "<q",
    _GGUF_TYPE_FLOAT64: "<d",
}


def _read_gguf_value(f: BinaryIO, vtype: int) -> Any:
    """Decode one GGUF metadata value of tag *vtype* from *f*.

    Numeric tags yield an ``int`` or ``float``, the bool tag yields a
    ``bool`` (any non-zero byte is true), the string tag yields a ``str``
    and the array tag yields a ``list`` whose elements are decoded
    recursively with the element tag stored in the array header.

    Raises:
        QuantbenchError: If *vtype* is not a known tag.
    """
    scalar_fmt = _GGUF_SCALAR_FORMATS.get(vtype)
    if scalar_fmt is not None:
        raw = f.read(struct.calcsize(scalar_fmt))
        return struct.unpack(scalar_fmt, raw)[0]

    if vtype == _GGUF_TYPE_BOOL:
        return struct.unpack("<B", f.read(1))[0] != 0

    if vtype == _GGUF_TYPE_STRING:
        return _read_gguf_string(f)

    if vtype == _GGUF_TYPE_ARRAY:
        # Array header: uint32 element tag, then uint64 element count.
        elem_type = struct.unpack("<I", f.read(4))[0]
        count = struct.unpack("<Q", f.read(8))[0]
        items = []
        for _ in range(count):
            items.append(_read_gguf_value(f, elem_type))
        return items

    raise QuantbenchError(f"Unknown GGUF value type: {vtype}")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def profile_gguf(path: str | Path) -> ModelProfile:
    """Parse a GGUF file and return a model profile.

    This is a pure-Python parser — no external dependencies required.
    Reads only the header (metadata + tensor info), not the actual weights.

    The header is read in order: uint32 magic, uint32 version, uint64 tensor
    count, uint64 key/value count, the key/value metadata entries, then one
    tensor-info record per tensor (name, dimension count, dimensions, dtype
    id, data offset).

    Raises:
        QuantbenchError: If the file does not exist, does not start with the
            GGUF magic, has a version other than 2 or 3, contains an unknown
            metadata value type, or is truncated/corrupt.
    """
    path = Path(path)
    if not path.exists():
        raise QuantbenchError(f"File not found: {path}")

    metadata: Dict[str, Any] = {}
    tensors: List[TensorInfo] = []

    with open(path, "rb") as f:
        try:
            # Read magic number
            magic = struct.unpack("<I", f.read(4))[0]
            if magic != GGUF_MAGIC:
                raise QuantbenchError(f"Not a GGUF file (magic: {magic:#x})")

            # Read version
            version = struct.unpack("<I", f.read(4))[0]
            if version not in (2, 3):
                raise QuantbenchError(f"Unsupported GGUF version: {version}")

            # Read counts
            n_tensors = struct.unpack("<Q", f.read(8))[0]
            n_kv = struct.unpack("<Q", f.read(8))[0]

            # Read metadata
            for _ in range(n_kv):
                key = _read_gguf_string(f)
                vtype = struct.unpack("<I", f.read(4))[0]
                metadata[key] = _read_gguf_value(f, vtype)

            # Read tensor info
            for _ in range(n_tensors):
                name = _read_gguf_string(f)
                n_dims = struct.unpack("<I", f.read(4))[0]
                shape = [struct.unpack("<Q", f.read(8))[0] for _ in range(n_dims)]
                dtype_id = struct.unpack("<I", f.read(4))[0]
                # The tensor data offset is not needed for profiling, but it
                # is still unpacked so a record cut off here is detected.
                struct.unpack("<Q", f.read(8))

                dtype = GGUF_DTYPE_MAP.get(dtype_id, DType.UNKNOWN)
                tensors.append(TensorInfo(name=name, shape=shape, dtype=dtype))
        except (struct.error, OverflowError, MemoryError) as exc:
            # struct.error: a read hit end-of-file and returned too few bytes.
            # OverflowError / MemoryError: a corrupt length field asked
            # f.read() for an absurd number of bytes.
            raise QuantbenchError(f"Truncated or corrupt GGUF file: {path}") from exc

    # Build layers from tensor names
    layers = _group_tensors_into_layers(tensors)

    # Build quant profile
    quant = _build_quant_profile(tensors, metadata)

    total_params = sum(t.n_elements for t in tensors)
    total_size = sum(t.size_bytes for t in tensors)

    # Prefer the name recorded in the file; fall back to the file stem.
    model_name = metadata.get("general.name", path.stem)
    if not isinstance(model_name, str):
        model_name = str(model_name)

    return ModelProfile(
        name=model_name,
        format=QuantFormat.GGUF,
        total_params=total_params,
        total_size_bytes=total_size,
        tensors=tensors,
        layers=layers,
        quant=quant,
        # Only scalar metadata is kept; array values are dropped.
        metadata={k: v for k, v in metadata.items() if isinstance(v, (str, int, float, bool))},
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def profile_safetensors(path: str | Path) -> ModelProfile:
    """Parse a safetensors file header and return a model profile.

    Only reads the JSON header — does not load weights into memory.

    The file starts with a little-endian uint64 giving the header size,
    followed by that many bytes of JSON.  Every key except ``__metadata__``
    describes one tensor (``dtype``, ``shape``, ``data_offsets``).

    Raises:
        QuantbenchError: If the file does not exist, is too short to hold
            the announced header, announces a header larger than 100 MB, or
            the header is not a JSON object of per-tensor objects.
    """
    path = Path(path)
    if not path.exists():
        raise QuantbenchError(f"File not found: {path}")

    with open(path, "rb") as f:
        # First 8 bytes: header size as uint64
        size_field = f.read(8)
        if len(size_field) != 8:
            raise QuantbenchError(f"Truncated safetensors file: {path}")
        header_size = struct.unpack("<Q", size_field)[0]
        if header_size > 100_000_000:  # sanity check: 100MB max header
            raise QuantbenchError(f"Header too large: {header_size}")

        header_bytes = f.read(header_size)

    if len(header_bytes) != header_size:
        raise QuantbenchError(f"Truncated safetensors header: {path}")

    try:
        header = json.loads(header_bytes)
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        raise QuantbenchError(f"Invalid safetensors header: {path}") from exc
    if not isinstance(header, dict):
        raise QuantbenchError(f"Invalid safetensors header: {path}")

    # Extract tensor metadata (skip __metadata__ key)
    tensors: List[TensorInfo] = []
    meta = header.pop("__metadata__", {})

    for tensor_name, info in header.items():
        if not isinstance(info, dict):
            raise QuantbenchError(
                f"Invalid safetensors header entry for {tensor_name!r}: {path}"
            )
        dtype_str = info.get("dtype", "F32")
        shape = info.get("shape", [])
        offsets = info.get("data_offsets", [0, 0])

        dtype = _SAFETENSORS_DTYPE_MAP.get(dtype_str, DType.UNKNOWN)
        # data_offsets is a [begin, end] pair; its span is the stored size.
        size_bytes = offsets[1] - offsets[0] if len(offsets) == 2 else 0

        tensors.append(
            TensorInfo(name=tensor_name, shape=shape, dtype=dtype, size_bytes=size_bytes)
        )

    layers = _group_tensors_into_layers(tensors)
    quant = _build_quant_profile(tensors, meta)

    total_params = sum(t.n_elements for t in tensors)
    total_size = sum(t.size_bytes for t in tensors)

    return ModelProfile(
        name=path.stem,
        format=QuantFormat.SAFETENSORS,
        total_params=total_params,
        total_size_bytes=total_size,
        tensors=tensors,
        layers=layers,
        quant=quant,
        metadata=meta if isinstance(meta, dict) else {},
    )
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def profile_from_dict(data: Dict[str, Any]) -> ModelProfile:
    """Build a ModelProfile from a plain dictionary (for example parsed JSON).

    This is a thin convenience wrapper that delegates entirely to
    ``ModelProfile.from_dict``.
    """
    profile = ModelProfile.from_dict(data)
    return profile
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _group_tensors_into_layers(tensors: List[TensorInfo]) -> List[LayerInfo]:
    """Group tensors into logical layers based on naming conventions.

    A tensor name is split on ``"."``.  The layer name is the prefix up to
    and including the first all-digit component that is not the very first
    component, e.g. ``"blk.0.attn_q.weight"`` -> ``"blk.0"`` and
    ``"model.layers.12.self_attn.q_proj.weight"`` -> ``"model.layers.12"``.
    Tensors without such a component are collected under ``"other"``.

    Returns:
        One LayerInfo per layer name, in natural order: numeric components
        compare as integers, so ``"blk.2"`` comes before ``"blk.10"``.
    """
    layer_map: Dict[str, List[TensorInfo]] = {}

    for t in tensors:
        parts = t.name.split(".")
        layer_name = "other"
        for i, part in enumerate(parts):
            # A leading digit component (i == 0) is not treated as an index.
            if i > 0 and part.isdigit():
                layer_name = ".".join(parts[: i + 1])
                break
        layer_map.setdefault(layer_name, []).append(t)

    def _natural_key(name: str) -> List[tuple]:
        # Tag each component so ints and strings never get compared directly.
        # isdecimal() (not isdigit()) guarantees int() accepts the component.
        return [
            (0, int(part), "") if part.isdecimal() else (1, 0, part)
            for part in name.split(".")
        ]

    return [
        LayerInfo(name=name, tensors=layer_map[name])
        for name in sorted(layer_map, key=_natural_key)
    ]
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _build_quant_profile(tensors: List[TensorInfo], metadata: Dict[str, Any]) -> QuantProfile:
    """Summarise the quantization of *tensors* as a QuantProfile.

    The profile holds the element-weighted share of each dtype, the
    element-weighted average bits per weight, the number of tensors whose
    dtype is / is not F32, F16 or BF16, and a quantization method.  The
    method is taken from the first keyword found in the lower-cased string
    form of *metadata*; when no keyword matches it is derived from the
    average bits per weight.  An empty tensor list yields a default
    ``QuantProfile()``.
    """
    if not tensors:
        return QuantProfile()

    total_elements = sum(t.n_elements for t in tensors)

    # Element count per dtype label, in first-seen order.
    elements_per_dtype: Dict[str, int] = {}
    for tensor in tensors:
        label = tensor.dtype.value
        elements_per_dtype[label] = elements_per_dtype.get(label, 0) + tensor.n_elements

    if total_elements > 0:
        dtype_dist = {
            label: round(count / total_elements, 4)
            for label, count in elements_per_dtype.items()
        }
        avg_bpw = sum(t.n_elements * t.bits_per_weight for t in tensors) / total_elements
    else:
        dtype_dist = {}
        avg_bpw = 0.0

    # Every tensor is either full precision or counted as quantized.
    fp_dtypes = {DType.F32, DType.F16, DType.BF16}
    n_fp = sum(1 for t in tensors if t.dtype in fp_dtypes)
    n_quant = len(tensors) - n_fp

    # Keyword groups are checked in this order; the first hit wins.
    meta_str = str(metadata).lower()
    keyword_methods = (
        (("gptq",), QuantMethod.GPTQ),
        (("awq",), QuantMethod.AWQ),
        (("ggml", "llama.cpp"), QuantMethod.GGML),
        (("bitsandbytes",), QuantMethod.BITSANDBYTES),
    )
    method = None
    for keywords, candidate in keyword_methods:
        if any(keyword in meta_str for keyword in keywords):
            method = candidate
            break

    if method is None:
        # No keyword hit: fall back to the average bit width.
        if avg_bpw > 15:
            method = QuantMethod.FP16
        elif avg_bpw > 7:
            method = QuantMethod.INT8
        elif avg_bpw > 3:
            method = QuantMethod.INT4
        else:
            method = QuantMethod.UNKNOWN

    return QuantProfile(
        method=method,
        avg_bits_per_weight=round(avg_bpw, 2),
        dtype_distribution=dtype_dist,
        n_quantized_layers=n_quant,
        n_full_precision_layers=n_fp,
    )
|
quantbenchx/py.typed
ADDED
|
File without changes
|
quantbenchx/recommend.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""Quantization recommendation engine — suggest optimal format based on model profile."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from quantbenchx._types import DType, ModelProfile
|
|
9
|
+
from quantbenchx.layerwise import layer_sensitivity
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class Recommendation:
    """Result of :func:`recommend`: a suggested quantization for one model.

    Attributes
    ----------
    format:
        Label of the recommended overall format (e.g. ``"Q4_K_M"``).
    estimated_size_gb:
        Estimated size of the model in that format.
    estimated_quality:
        Quality retention score between 0 and 1.
    per_layer:
        Mapping of layer name to the format recommended for that layer.
    explanation:
        Human-readable rationale for the recommendation.
    """

    format: str
    estimated_size_gb: float
    estimated_quality: float
    per_layer: Dict[str, str]
    explanation: str
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Candidate formats, best quality first and most compressed last.
# Tuple layout: (label, target DType, approx bits per weight, quality baseline).
# The helpers below rely on this ordering and on these tuple positions.
_CANDIDATES: List[tuple] = [
    ("Q8_0",   DType.Q8_0,   8.5,    0.99),
    ("Q6_K",   DType.Q6_K,   6.5625, 0.97),
    ("Q5_K_M", DType.Q5_K_M, 5.5,    0.95),
    ("Q5_K_S", DType.Q5_K_S, 5.5,    0.94),
    ("Q4_K_M", DType.Q4_K_M, 4.85,   0.90),
    ("Q4_K_S", DType.Q4_K_S, 4.5,    0.87),
    ("Q3_K_M", DType.Q3_K_M, 3.9,    0.80),
    ("Q3_K_S", DType.Q3_K_S, 3.5,    0.74),
    ("Q2_K",   DType.Q2_K,   3.35,   0.60),
]

# A layer whose lower-cased name contains any of these substrings is kept
# one precision tier above the overall choice.
_HIGH_PRECISION_KEYWORDS = {
    "embed",
    "lm_head",
    "output",
    "norm",
    "layernorm",
    "rmsnorm",
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def recommend(
    profile: ModelProfile,
    target_bits: Optional[float] = None,
    max_quality_loss: float = 0.05,
) -> Recommendation:
    """Suggest a quantization format (and per-layer overrides) for *profile*.

    Parameters
    ----------
    profile:
        A :class:`ModelProfile` produced by the profiling helpers.
    target_bits:
        Desired bits-per-weight.  When given, the candidate closest to it
        is chosen and *max_quality_loss* is ignored.
    max_quality_loss:
        Largest acceptable quality loss in the range 0-1; ``0.05`` asks for
        at least 95 % quality retention.  Only used when *target_bits* is
        ``None``.

    Returns
    -------
    Recommendation
    """
    # Fall back to the per-layer parameter counts when the profile has no total.
    total_params = profile.total_params
    if total_params == 0:
        total_params = sum(layer.n_params for layer in profile.layers)

    # Overall format: explicit bit target wins over the quality bound.
    chosen = (
        _closest_candidate(target_bits)
        if target_bits is not None
        else _best_candidate_for_quality(1.0 - max_quality_loss)
    )
    label, dtype, bpw, quality_base = chosen

    # Size estimate: parameters * bits / 8 bytes, expressed in units of 1024**3 bytes.
    estimated_size_gb = (
        round(total_params * bpw / 8 / (1024**3), 3) if total_params > 0 else 0.0
    )

    # Mixed quantization: sensitive layers get the next-higher tier.
    sensitivities = layer_sensitivity(profile) if profile.layers else {}
    high_dtype = _one_tier_higher(dtype)
    per_layer: Dict[str, str] = {
        layer_name: (
            high_dtype.value
            if _is_sensitive_layer(layer_name) or sens >= 0.75
            else dtype.value
        )
        for layer_name, sens in sensitivities.items()
    }

    # Credit the quality estimate for the upgraded layers.
    estimated_quality = _adjust_quality(quality_base, per_layer, sensitivities)

    explanation = _build_explanation(
        profile,
        label,
        bpw,
        estimated_size_gb,
        estimated_quality,
        per_layer,
        sensitivities,
        target_bits,
        max_quality_loss,
    )

    return Recommendation(
        format=label,
        estimated_size_gb=estimated_size_gb,
        estimated_quality=round(estimated_quality, 4),
        per_layer=per_layer,
        explanation=explanation,
    )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def format_recommendation(rec: Recommendation) -> str:
    """Render *rec* as a multi-line, human-readable text summary.

    The summary lists the format, size and quality, then (only when
    per-layer entries exist) one line per layer, and ends with the
    explanation text.
    """
    summary = [
        f"Recommended format : {rec.format}",
        f"Estimated size : {rec.estimated_size_gb:.2f} GB",
        f"Quality retention : {rec.estimated_quality * 100:.1f}%",
        "",
    ]
    if rec.per_layer:
        summary += [
            "Per-layer strategy:",
            *(f" {layer}: {fmt}" for layer, fmt in rec.per_layer.items()),
            "",
        ]
    summary.append(rec.explanation)
    return "\n".join(summary)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ── internal helpers ─────────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _closest_candidate(target_bpw: float) -> tuple:
    """Return the candidate whose bits-per-weight is nearest *target_bpw*.

    On a tie the candidate listed first in ``_CANDIDATES`` is returned.
    """
    # min() keeps the first of several equally small keys, which gives the
    # tie-break described above.
    return min(_CANDIDATES, key=lambda cand: abs(cand[2] - target_bpw))
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _best_candidate_for_quality(min_quality: float) -> tuple:
    """Return the most compressed candidate with quality >= *min_quality*.

    If no candidate reaches the threshold, the first (highest-quality)
    candidate is returned.
    """
    # Scan from the most compressed end of the list towards the best quality.
    acceptable = (cand for cand in reversed(_CANDIDATES) if cand[3] >= min_quality)
    return next(acceptable, _CANDIDATES[0])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _one_tier_higher(dtype: DType) -> DType:
    """Return the DType one step above *dtype* in the candidate list.

    The top candidate maps to itself, and a DType that is not among the
    candidates is returned unchanged.
    """
    tiers = [cand[1] for cand in _CANDIDATES]
    if dtype not in tiers:
        return dtype
    position = tiers.index(dtype)
    return tiers[position - 1] if position > 0 else tiers[0]
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _is_sensitive_layer(name: str) -> bool:
    """Return True if *name* contains a high-precision keyword (case-insensitive)."""
    lowered = name.lower()
    for keyword in _HIGH_PRECISION_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _adjust_quality(
|
|
168
|
+
base: float,
|
|
169
|
+
per_layer: Dict[str, str],
|
|
170
|
+
sensitivities: Dict[str, float],
|
|
171
|
+
) -> float:
|
|
172
|
+
"""Bump quality estimate when sensitive layers get higher precision."""
|
|
173
|
+
if not per_layer or not sensitivities:
|
|
174
|
+
return base
|
|
175
|
+
|
|
176
|
+
total_sens = sum(sensitivities.values())
|
|
177
|
+
if total_sens == 0:
|
|
178
|
+
return base
|
|
179
|
+
|
|
180
|
+
# Fraction of total sensitivity weight that got upgraded.
|
|
181
|
+
upgraded_sens = sum(
|
|
182
|
+
sensitivities.get(ln, 0.0)
|
|
183
|
+
for ln, fmt in per_layer.items()
|
|
184
|
+
# "upgraded" if the format differs from the majority
|
|
185
|
+
if fmt != _mode_value(per_layer)
|
|
186
|
+
)
|
|
187
|
+
benefit = 0.02 * (upgraded_sens / total_sens)
|
|
188
|
+
return min(base + benefit, 1.0)
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _mode_value(d: Dict[str, str]) -> str:
|
|
192
|
+
"""Most common value in a dict."""
|
|
193
|
+
counts: Dict[str, int] = {}
|
|
194
|
+
for v in d.values():
|
|
195
|
+
counts[v] = counts.get(v, 0) + 1
|
|
196
|
+
return max(counts, key=lambda k: counts[k]) if counts else ""
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _build_explanation(
|
|
200
|
+
profile: ModelProfile,
|
|
201
|
+
label: str,
|
|
202
|
+
bpw: float,
|
|
203
|
+
size_gb: float,
|
|
204
|
+
quality: float,
|
|
205
|
+
per_layer: Dict[str, str],
|
|
206
|
+
sensitivities: Dict[str, float],
|
|
207
|
+
target_bits: Optional[float],
|
|
208
|
+
max_quality_loss: float,
|
|
209
|
+
) -> str:
|
|
210
|
+
parts: List[str] = []
|
|
211
|
+
param_b = profile.total_params / 1e9 if profile.total_params else 0
|
|
212
|
+
if param_b > 0:
|
|
213
|
+
parts.append(f"For a ~{param_b:.1f}B-parameter model, ")
|
|
214
|
+
else:
|
|
215
|
+
parts.append("Based on the model profile, ")
|
|
216
|
+
|
|
217
|
+
if target_bits is not None:
|
|
218
|
+
parts.append(
|
|
219
|
+
f"{label} (≈{bpw:.1f} bpw) is the closest match to the "
|
|
220
|
+
f"requested {target_bits:.1f} bits-per-weight target."
|
|
221
|
+
)
|
|
222
|
+
else:
|
|
223
|
+
parts.append(
|
|
224
|
+
f"{label} (≈{bpw:.1f} bpw) is recommended to stay within "
|
|
225
|
+
f"{max_quality_loss * 100:.0f}% quality loss."
|
|
226
|
+
)
|
|
227
|
+
|
|
228
|
+
n_upgraded = sum(
|
|
229
|
+
1 for fmt in per_layer.values() if fmt != _mode_value(per_layer)
|
|
230
|
+
)
|
|
231
|
+
if n_upgraded:
|
|
232
|
+
parts.append(
|
|
233
|
+
f" {n_upgraded} sensitive layer(s) are assigned higher precision "
|
|
234
|
+
f"for mixed-quantization."
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
parts.append(
|
|
238
|
+
f" Estimated size: {size_gb:.2f} GB, quality retention: {quality * 100:.1f}%."
|
|
239
|
+
)
|
|
240
|
+
return "".join(parts)
|