quantbenchx 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quantbenchx/profile.py ADDED
@@ -0,0 +1,301 @@
1
+ """Profile quantized models from GGUF, safetensors, or dict metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import struct
7
+ from pathlib import Path
8
+ from typing import Any, BinaryIO, Dict, List
9
+
10
+ from quantbenchx._types import (
11
+ DType,
12
+ LayerInfo,
13
+ ModelProfile,
14
+ QuantbenchError,
15
+ QuantFormat,
16
+ QuantMethod,
17
+ QuantProfile,
18
+ TensorInfo,
19
+ )
20
+
21
+ # ── GGUF constants ──
22
# Value expected when the first four bytes of a file are unpacked as a
# little-endian uint32; anything else is rejected as "not a GGUF file".
GGUF_MAGIC = 0x46475547  # "GGUF" in little-endian
# Maps the uint32 type id stored in each GGUF tensor-info record to a DType.
# Ids missing from this table are reported as DType.UNKNOWN by the parser.
# NOTE(review): ids 10-18 (spelled with _S/_M/_L suffixes) and 19/20/24/26/28
# look like llama.cpp *file-type* numbers rather than ggml *tensor-type* ids
# -- TODO confirm against the ggml_type enum before trusting per-tensor dtypes.
GGUF_DTYPE_MAP = {
    0: DType.F32, 1: DType.F16, 2: DType.Q4_0, 3: DType.Q4_1,
    6: DType.Q5_0, 7: DType.Q5_1, 8: DType.Q8_0,
    10: DType.Q2_K, 11: DType.Q3_K_S, 12: DType.Q3_K_M,
    13: DType.Q3_K_L, 14: DType.Q4_K_S, 15: DType.Q4_K_M,
    16: DType.Q5_K_S, 17: DType.Q5_K_M, 18: DType.Q6_K,
    19: DType.IQ2_XXS, 20: DType.IQ3_XXS, 26: DType.IQ4_XS,
    24: DType.IQ1_S, 28: DType.BF16,
}
32
+
33
+ # Type tags for GGUF metadata values
34
# Each metadata entry carries a uint32 type tag ahead of its value; the value
# reader compares that tag against the constants below to pick a decoder.
# Fixed-width integers, float and bool:
_GGUF_TYPE_UINT8 = 0
_GGUF_TYPE_INT8 = 1
_GGUF_TYPE_UINT16 = 2
_GGUF_TYPE_INT16 = 3
_GGUF_TYPE_UINT32 = 4
_GGUF_TYPE_INT32 = 5
_GGUF_TYPE_FLOAT32 = 6
_GGUF_TYPE_BOOL = 7
# Variable-length values: a length-prefixed string, and an array that stores
# its element type tag and element count before the elements themselves.
_GGUF_TYPE_STRING = 8
_GGUF_TYPE_ARRAY = 9
# 64-bit scalars:
_GGUF_TYPE_UINT64 = 10
_GGUF_TYPE_INT64 = 11
_GGUF_TYPE_FLOAT64 = 12
47
+
48
+ # safetensors dtype sizes
49
# Bit width per element for each safetensors dtype string.
# NOTE(review): nothing in the visible part of this module reads this table
# -- confirm whether it is used elsewhere or is dead.
_SAFETENSORS_DTYPE_BPW = {
    "F32": 32, "F16": 16, "BF16": 16, "I64": 64, "I32": 32,
    "I16": 16, "I8": 8, "U8": 8, "BOOL": 1, "F8_E4M3": 8, "F8_E5M2": 8,
}
# Maps a safetensors dtype string to a DType. Both 8-bit integer dtypes are
# reported as Q8_0; dtype strings missing from this table are reported as
# DType.UNKNOWN by the safetensors profiler.
_SAFETENSORS_DTYPE_MAP = {
    "F32": DType.F32, "F16": DType.F16, "BF16": DType.BF16,
    "I8": DType.Q8_0, "U8": DType.Q8_0,
}
57
+
58
+
59
+ def _read_gguf_string(f: BinaryIO) -> str:
60
+ """Read a GGUF string (uint64 length + bytes)."""
61
+ length = struct.unpack("<Q", f.read(8))[0]
62
+ return f.read(length).decode("utf-8", errors="replace")
63
+
64
+
65
def _read_gguf_value(f: BinaryIO, vtype: int) -> Any:
    """Decode one GGUF metadata value whose type tag is *vtype* from *f*.

    Strings are delegated to ``_read_gguf_string``. Arrays store a uint32
    element type tag and a uint64 element count, then the elements, which
    are decoded recursively into a list. Every other supported tag is a
    fixed-width little-endian scalar; the bool tag is read as one byte and
    returned as ``True`` when that byte is non-zero.

    Raises:
        QuantbenchError: if *vtype* is not a known type tag.
    """
    if vtype == _GGUF_TYPE_STRING:
        return _read_gguf_string(f)

    if vtype == _GGUF_TYPE_ARRAY:
        (elem_type,) = struct.unpack("<I", f.read(4))
        (count,) = struct.unpack("<Q", f.read(8))
        items = []
        for _ in range(count):
            items.append(_read_gguf_value(f, elem_type))
        return items

    # struct format for every fixed-width scalar tag.
    scalar_formats = {
        _GGUF_TYPE_UINT8: "<B",
        _GGUF_TYPE_INT8: "<b",
        _GGUF_TYPE_UINT16: "<H",
        _GGUF_TYPE_INT16: "<h",
        _GGUF_TYPE_UINT32: "<I",
        _GGUF_TYPE_INT32: "<i",
        _GGUF_TYPE_FLOAT32: "<f",
        _GGUF_TYPE_BOOL: "<B",
        _GGUF_TYPE_UINT64: "<Q",
        _GGUF_TYPE_INT64: "<q",
        _GGUF_TYPE_FLOAT64: "<d",
    }
    fmt = scalar_formats.get(vtype)
    if fmt is None:
        raise QuantbenchError(f"Unknown GGUF value type: {vtype}")

    (value,) = struct.unpack(fmt, f.read(struct.calcsize(fmt)))
    if vtype == _GGUF_TYPE_BOOL:
        return value != 0
    return value
97
+
98
+
99
def profile_gguf(path: str | Path) -> ModelProfile:
    """Parse a GGUF file and return a model profile.

    This is a pure-Python parser — no external dependencies required.
    Reads only the header (metadata + tensor info), not the actual weights.

    Args:
        path: Location of the GGUF file.

    Returns:
        A ModelProfile whose name comes from the ``general.name`` metadata
        entry (falling back to the file stem) and whose ``metadata`` keeps
        only the str/int/float/bool entries of the GGUF metadata.

    Raises:
        QuantbenchError: if the file does not exist, does not start with the
            GGUF magic, has a version other than 2 or 3, contains an unknown
            metadata value type, or ends before the header is complete.
    """
    path = Path(path)
    if not path.exists():
        raise QuantbenchError(f"File not found: {path}")

    metadata: Dict[str, Any] = {}
    tensors: List[TensorInfo] = []

    try:
        with open(path, "rb") as f:
            magic = struct.unpack("<I", f.read(4))[0]
            if magic != GGUF_MAGIC:
                raise QuantbenchError(f"Not a GGUF file (magic: {magic:#x})")

            version = struct.unpack("<I", f.read(4))[0]
            if version not in (2, 3):
                raise QuantbenchError(f"Unsupported GGUF version: {version}")

            n_tensors = struct.unpack("<Q", f.read(8))[0]
            n_kv = struct.unpack("<Q", f.read(8))[0]

            # Metadata: key string, uint32 type tag, then the tagged value.
            for _ in range(n_kv):
                key = _read_gguf_string(f)
                vtype = struct.unpack("<I", f.read(4))[0]
                metadata[key] = _read_gguf_value(f, vtype)

            # Tensor info: name, rank, dims, dtype id, data offset.
            for _ in range(n_tensors):
                name = _read_gguf_string(f)
                n_dims = struct.unpack("<I", f.read(4))[0]
                shape = [struct.unpack("<Q", f.read(8))[0] for _ in range(n_dims)]
                dtype_id = struct.unpack("<I", f.read(4))[0]
                # The data offset is not needed for profiling, but it must
                # still be consumed (and validated) to reach the next record.
                struct.unpack("<Q", f.read(8))

                dtype = GGUF_DTYPE_MAP.get(dtype_id, DType.UNKNOWN)
                tensors.append(TensorInfo(name=name, shape=shape, dtype=dtype))
    except (struct.error, OverflowError) as err:
        # A short read makes struct.unpack fail, and a garbage length field
        # can exceed what read() accepts; report both as a parse failure in
        # the package's own exception type instead of leaking stdlib errors.
        raise QuantbenchError(
            f"Truncated or corrupt GGUF header in {path}: {err}"
        ) from err

    layers = _group_tensors_into_layers(tensors)
    quant = _build_quant_profile(tensors, metadata)

    total_params = sum(t.n_elements for t in tensors)
    total_size = sum(t.size_bytes for t in tensors)

    model_name = metadata.get("general.name", path.stem)
    if not isinstance(model_name, str):
        model_name = str(model_name)

    return ModelProfile(
        name=model_name,
        format=QuantFormat.GGUF,
        total_params=total_params,
        total_size_bytes=total_size,
        tensors=tensors,
        layers=layers,
        quant=quant,
        metadata={k: v for k, v in metadata.items() if isinstance(v, (str, int, float, bool))},
    )
167
+
168
+
169
def profile_safetensors(path: str | Path) -> ModelProfile:
    """Parse a safetensors file header and return a model profile.

    Only reads the JSON header — does not load weights into memory.

    Args:
        path: Location of the safetensors file.

    Returns:
        A ModelProfile named after the file stem. Each tensor's byte size is
        taken from its ``data_offsets`` pair, and the ``__metadata__`` entry
        of the header (when it is a dict) becomes the profile metadata.

    Raises:
        QuantbenchError: if the file does not exist, is too short to hold
            the 8-byte length prefix, declares a header larger than 100 MB,
            ends before the declared header length, or the header is not a
            JSON object of per-tensor objects.
    """
    path = Path(path)
    if not path.exists():
        raise QuantbenchError(f"File not found: {path}")

    with open(path, "rb") as f:
        # First 8 bytes: header size as little-endian uint64.
        prefix = f.read(8)
        if len(prefix) != 8:
            raise QuantbenchError(f"Not a safetensors file (too short): {path}")
        header_size = struct.unpack("<Q", prefix)[0]
        if header_size > 100_000_000:  # sanity check: 100MB max header
            raise QuantbenchError(f"Header too large: {header_size}")

        header_bytes = f.read(header_size)

    if len(header_bytes) != header_size:
        raise QuantbenchError(
            f"Truncated safetensors header: expected {header_size} bytes, "
            f"got {len(header_bytes)}"
        )

    try:
        header = json.loads(header_bytes)
    except ValueError as err:
        # Covers both malformed JSON and undecodable bytes.
        raise QuantbenchError(f"Invalid safetensors header JSON: {err}") from err
    if not isinstance(header, dict):
        raise QuantbenchError("Invalid safetensors header: expected a JSON object")

    # Everything except __metadata__ describes one tensor.
    meta = header.pop("__metadata__", {})

    tensors: List[TensorInfo] = []
    for tensor_name, info in header.items():
        if not isinstance(info, dict):
            raise QuantbenchError(
                f"Invalid safetensors entry for tensor {tensor_name!r}"
            )
        dtype_str = info.get("dtype", "F32")
        shape = info.get("shape", [])
        offsets = info.get("data_offsets", [0, 0])

        dtype = _SAFETENSORS_DTYPE_MAP.get(dtype_str, DType.UNKNOWN)
        size_bytes = offsets[1] - offsets[0] if len(offsets) == 2 else 0

        tensors.append(
            TensorInfo(name=tensor_name, shape=shape, dtype=dtype, size_bytes=size_bytes)
        )

    layers = _group_tensors_into_layers(tensors)
    quant = _build_quant_profile(tensors, meta)

    total_params = sum(t.n_elements for t in tensors)
    total_size = sum(t.size_bytes for t in tensors)

    return ModelProfile(
        name=path.stem,
        format=QuantFormat.SAFETENSORS,
        total_params=total_params,
        total_size_bytes=total_size,
        tensors=tensors,
        layers=layers,
        quant=quant,
        metadata=meta if isinstance(meta, dict) else {},
    )
218
+
219
+
220
def profile_from_dict(data: Dict[str, Any]) -> ModelProfile:
    """Create a ModelProfile from a dictionary (e.g. loaded from JSON).

    Thin wrapper that forwards *data* unchanged to ``ModelProfile.from_dict``
    and returns its result; the accepted keys are whatever that classmethod
    expects.
    """
    return ModelProfile.from_dict(data)
223
+
224
+
225
+ def _group_tensors_into_layers(tensors: List[TensorInfo]) -> List[LayerInfo]:
226
+ """Group tensors into logical layers based on naming conventions."""
227
+ layer_map: Dict[str, List[TensorInfo]] = {}
228
+
229
+ for t in tensors:
230
+ # Extract layer identifier from tensor name
231
+ # Common patterns: "blk.0.attn_q.weight", "model.layers.0.self_attn.q_proj.weight"
232
+ parts = t.name.split(".")
233
+ layer_name = "other"
234
+
235
+ for i, part in enumerate(parts):
236
+ if part.isdigit() and i > 0:
237
+ layer_name = ".".join(parts[: i + 1])
238
+ break
239
+ if part in ("blk", "layers", "block", "h"):
240
+ if i + 1 < len(parts) and parts[i + 1].isdigit():
241
+ layer_name = ".".join(parts[: i + 2])
242
+ break
243
+
244
+ if layer_name not in layer_map:
245
+ layer_map[layer_name] = []
246
+ layer_map[layer_name].append(t)
247
+
248
+ return [LayerInfo(name=name, tensors=ts) for name, ts in sorted(layer_map.items())]
249
+
250
+
251
def _build_quant_profile(tensors: List[TensorInfo], metadata: Dict[str, Any]) -> QuantProfile:
    """Summarize how a set of tensors is quantized.

    Computes each dtype's share of the total element count, the
    element-weighted average bits per weight, and how many tensors are
    full precision (F32/F16/BF16) versus anything else. The quantization
    method is guessed from keywords found in the stringified metadata and,
    failing that, from the average bits per weight.

    Returns a default ``QuantProfile()`` when *tensors* is empty.
    """
    if not tensors:
        return QuantProfile()

    total_elements = sum(t.n_elements for t in tensors)

    # Elements per dtype, keyed by the dtype's string value.
    elements_by_dtype: Dict[str, int] = {}
    for tensor in tensors:
        dtype_name = tensor.dtype.value
        elements_by_dtype[dtype_name] = elements_by_dtype.get(dtype_name, 0) + tensor.n_elements

    if total_elements > 0:
        dtype_dist = {
            name: round(count / total_elements, 4)
            for name, count in elements_by_dtype.items()
        }
        avg_bpw = sum(t.n_elements * t.bits_per_weight for t in tensors) / total_elements
    else:
        dtype_dist = {}
        avg_bpw = 0.0

    # Every tensor is either full precision or counted as quantized.
    full_precision = {DType.F32, DType.F16, DType.BF16}
    n_fp = sum(1 for t in tensors if t.dtype in full_precision)
    n_quant = len(tensors) - n_fp

    # Keyword hits in the metadata take priority, in this order.
    meta_text = str(metadata).lower()
    keyword_methods = (
        (("gptq",), QuantMethod.GPTQ),
        (("awq",), QuantMethod.AWQ),
        (("ggml", "llama.cpp"), QuantMethod.GGML),
        (("bitsandbytes",), QuantMethod.BITSANDBYTES),
    )
    method = next(
        (
            candidate
            for keywords, candidate in keyword_methods
            if any(word in meta_text for word in keywords)
        ),
        None,
    )
    if method is None:
        # No keyword matched: fall back to the average bit width.
        if avg_bpw > 15:
            method = QuantMethod.FP16
        elif avg_bpw > 7:
            method = QuantMethod.INT8
        elif avg_bpw > 3:
            method = QuantMethod.INT4
        else:
            method = QuantMethod.UNKNOWN

    return QuantProfile(
        method=method,
        avg_bits_per_weight=round(avg_bpw, 2),
        dtype_distribution=dtype_dist,
        n_quantized_layers=n_quant,
        n_full_precision_layers=n_fp,
    )
quantbenchx/py.typed ADDED
File without changes
@@ -0,0 +1,240 @@
1
+ """Quantization recommendation engine — suggest optimal format based on model profile."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Dict, List, Optional
7
+
8
+ from quantbenchx._types import DType, ModelProfile
9
+ from quantbenchx.layerwise import layer_sensitivity
10
+
11
+
12
@dataclass
class Recommendation:
    """Quantization recommendation for a model.

    Produced by ``recommend`` and rendered by ``format_recommendation``.
    """

    # Label of the overall recommended format (the candidate's label).
    format: str
    # Estimated model size: parameter count * bits-per-weight / 8 bytes,
    # divided by 1024**3 and rounded to three decimals.
    estimated_size_gb: float
    estimated_quality: float  # 0-1 quality retention score
    per_layer: Dict[str, str]  # layer name -> recommended format
    # Human-readable sentence(s) explaining the recommendation.
    explanation: str
21
+
22
+
23
+ # Candidate formats ordered from highest quality to most compressed.
24
+ # Each entry: (label, target DType, approx bpw, quality baseline)
25
# Tuple layout: index 0 = label, 1 = DType, 2 = approximate bits per weight,
# 3 = baseline quality-retention score. The helpers below index these tuples
# positionally and rely on the list staying ordered best-quality-first.
_CANDIDATES: List[tuple] = [
    ("Q8_0", DType.Q8_0, 8.5, 0.99),
    ("Q6_K", DType.Q6_K, 6.5625, 0.97),
    ("Q5_K_M", DType.Q5_K_M, 5.5, 0.95),
    ("Q5_K_S", DType.Q5_K_S, 5.5, 0.94),
    ("Q4_K_M", DType.Q4_K_M, 4.85, 0.90),
    ("Q4_K_S", DType.Q4_K_S, 4.5, 0.87),
    ("Q3_K_M", DType.Q3_K_M, 3.9, 0.80),
    ("Q3_K_S", DType.Q3_K_S, 3.5, 0.74),
    ("Q2_K", DType.Q2_K, 3.35, 0.60),
]

# Layers matching these keywords are kept at a higher precision tier.
# Matching is a case-insensitive substring test against the layer name.
_HIGH_PRECISION_KEYWORDS = {"embed", "lm_head", "output", "norm", "layernorm", "rmsnorm"}
39
+
40
+
41
def recommend(
    profile: ModelProfile,
    target_bits: Optional[float] = None,
    max_quality_loss: float = 0.05,
) -> Recommendation:
    """Recommend a quantization strategy for *profile*.

    Parameters
    ----------
    profile:
        A :class:`ModelProfile` produced by the profiling helpers.
    target_bits:
        Desired bits-per-weight. When given, the candidate format closest
        to it is chosen and *max_quality_loss* is ignored.
    max_quality_loss:
        Maximum tolerable quality loss (0-1); ``0.05`` asks for at least
        95 % quality retention. Only used when *target_bits* is ``None``.

    Returns
    -------
    Recommendation
        Overall format, estimated size and quality, a per-layer format
        mapping and a human-readable explanation.
    """
    # Use the summed per-layer counts when the profile's total is zero.
    n_params = profile.total_params
    if n_params == 0:
        n_params = sum(layer.n_params for layer in profile.layers)

    # Overall format: explicit bit target wins over the quality budget.
    if target_bits is None:
        candidate = _best_candidate_for_quality(1.0 - max_quality_loss)
    else:
        candidate = _closest_candidate(target_bits)
    label, dtype, bpw, quality_base = candidate

    # Size estimate in GiB-based "GB", rounded to three decimals.
    estimated_size_gb = (
        round(n_params * bpw / 8 / (1024**3), 3) if n_params > 0 else 0.0
    )

    # Mixed quantization: layers that match a high-precision keyword or have
    # sensitivity >= 0.75 get the next-higher tier, the rest the base dtype.
    sensitivities = layer_sensitivity(profile) if profile.layers else {}
    upgraded_dtype = _one_tier_higher(dtype)
    per_layer: Dict[str, str] = {
        layer_name: (
            upgraded_dtype.value
            if _is_sensitive_layer(layer_name) or score >= 0.75
            else dtype.value
        )
        for layer_name, score in sensitivities.items()
    }

    estimated_quality = _adjust_quality(quality_base, per_layer, sensitivities)

    explanation = _build_explanation(
        profile,
        label,
        bpw,
        estimated_size_gb,
        estimated_quality,
        per_layer,
        sensitivities,
        target_bits,
        max_quality_loss,
    )

    return Recommendation(
        format=label,
        estimated_size_gb=estimated_size_gb,
        estimated_quality=round(estimated_quality, 4),
        per_layer=per_layer,
        explanation=explanation,
    )
109
+
110
+
111
def format_recommendation(rec: Recommendation) -> str:
    """Render *rec* as a newline-joined, human-readable summary.

    The summary lists the format, size and quality, then (only when
    ``rec.per_layer`` is non-empty) one line per layer, and ends with the
    recommendation's explanation.
    """
    summary = [
        f"Recommended format : {rec.format}",
        f"Estimated size : {rec.estimated_size_gb:.2f} GB",
        f"Quality retention : {rec.estimated_quality * 100:.1f}%",
        "",
    ]

    if rec.per_layer:
        summary.append("Per-layer strategy:")
        summary.extend(f" {layer}: {fmt}" for layer, fmt in rec.per_layer.items())
        summary.append("")

    summary.append(rec.explanation)
    return "\n".join(summary)
126
+
127
+
128
+ # ── internal helpers ─────────────────────────────────────────────────────
129
+
130
+
131
def _closest_candidate(target_bpw: float) -> tuple:
    """Return the candidate whose bits-per-weight is nearest *target_bpw*.

    On a tie the candidate listed first (the higher-quality one) wins,
    because ``min`` keeps the first minimal element.
    """
    return min(_CANDIDATES, key=lambda cand: abs(cand[2] - target_bpw))
140
+
141
+
142
def _best_candidate_for_quality(min_quality: float) -> tuple:
    """Return the most compressed candidate with quality >= *min_quality*.

    Candidates are scanned from most compressed to least. If none reaches
    the threshold, the highest-quality candidate is returned instead.
    """
    meets_bar = (cand for cand in reversed(_CANDIDATES) if cand[3] >= min_quality)
    return next(meets_bar, _CANDIDATES[0])
150
+
151
+
152
def _one_tier_higher(dtype: DType) -> DType:
    """Return the candidate DType one step above *dtype* in quality.

    The top tier maps to itself, and a DType that is not among the
    candidates is returned unchanged.
    """
    tiers = [cand[1] for cand in _CANDIDATES]
    if dtype not in tiers:
        return dtype
    position = tiers.index(dtype)
    return tiers[position - 1] if position > 0 else tiers[0]
160
+
161
+
162
def _is_sensitive_layer(name: str) -> bool:
    """Return True if *name* contains a high-precision keyword.

    The comparison is case-insensitive and matches substrings.
    """
    lowered = name.lower()
    for keyword in _HIGH_PRECISION_KEYWORDS:
        if keyword in lowered:
            return True
    return False
165
+
166
+
167
+ def _adjust_quality(
168
+ base: float,
169
+ per_layer: Dict[str, str],
170
+ sensitivities: Dict[str, float],
171
+ ) -> float:
172
+ """Bump quality estimate when sensitive layers get higher precision."""
173
+ if not per_layer or not sensitivities:
174
+ return base
175
+
176
+ total_sens = sum(sensitivities.values())
177
+ if total_sens == 0:
178
+ return base
179
+
180
+ # Fraction of total sensitivity weight that got upgraded.
181
+ upgraded_sens = sum(
182
+ sensitivities.get(ln, 0.0)
183
+ for ln, fmt in per_layer.items()
184
+ # "upgraded" if the format differs from the majority
185
+ if fmt != _mode_value(per_layer)
186
+ )
187
+ benefit = 0.02 * (upgraded_sens / total_sens)
188
+ return min(base + benefit, 1.0)
189
+
190
+
191
+ def _mode_value(d: Dict[str, str]) -> str:
192
+ """Most common value in a dict."""
193
+ counts: Dict[str, int] = {}
194
+ for v in d.values():
195
+ counts[v] = counts.get(v, 0) + 1
196
+ return max(counts, key=lambda k: counts[k]) if counts else ""
197
+
198
+
199
+ def _build_explanation(
200
+ profile: ModelProfile,
201
+ label: str,
202
+ bpw: float,
203
+ size_gb: float,
204
+ quality: float,
205
+ per_layer: Dict[str, str],
206
+ sensitivities: Dict[str, float],
207
+ target_bits: Optional[float],
208
+ max_quality_loss: float,
209
+ ) -> str:
210
+ parts: List[str] = []
211
+ param_b = profile.total_params / 1e9 if profile.total_params else 0
212
+ if param_b > 0:
213
+ parts.append(f"For a ~{param_b:.1f}B-parameter model, ")
214
+ else:
215
+ parts.append("Based on the model profile, ")
216
+
217
+ if target_bits is not None:
218
+ parts.append(
219
+ f"{label} (≈{bpw:.1f} bpw) is the closest match to the "
220
+ f"requested {target_bits:.1f} bits-per-weight target."
221
+ )
222
+ else:
223
+ parts.append(
224
+ f"{label} (≈{bpw:.1f} bpw) is recommended to stay within "
225
+ f"{max_quality_loss * 100:.0f}% quality loss."
226
+ )
227
+
228
+ n_upgraded = sum(
229
+ 1 for fmt in per_layer.values() if fmt != _mode_value(per_layer)
230
+ )
231
+ if n_upgraded:
232
+ parts.append(
233
+ f" {n_upgraded} sensitive layer(s) are assigned higher precision "
234
+ f"for mixed-quantization."
235
+ )
236
+
237
+ parts.append(
238
+ f" Estimated size: {size_gb:.2f} GB, quality retention: {quality * 100:.1f}%."
239
+ )
240
+ return "".join(parts)