quantbenchx 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quantbenchx/__init__.py +132 -0
- quantbenchx/_types.py +220 -0
- quantbenchx/bandwidth.py +290 -0
- quantbenchx/cli.py +153 -0
- quantbenchx/compare.py +101 -0
- quantbenchx/imatrix.py +201 -0
- quantbenchx/layerwise.py +167 -0
- quantbenchx/matrix.py +289 -0
- quantbenchx/perplexity.py +168 -0
- quantbenchx/predict.py +125 -0
- quantbenchx/profile.py +301 -0
- quantbenchx/py.typed +0 -0
- quantbenchx/recommend.py +240 -0
- quantbenchx/report.py +171 -0
- quantbenchx-0.3.0.dist-info/METADATA +213 -0
- quantbenchx-0.3.0.dist-info/RECORD +17 -0
- quantbenchx-0.3.0.dist-info/WHEEL +4 -0
quantbenchx/__init__.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""quantbenchx — Quantization quality analyzer."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
__version__ = "0.3.0"
|
|
5
|
+
|
|
6
|
+
from quantbenchx._types import (
|
|
7
|
+
DType,
|
|
8
|
+
LayerInfo,
|
|
9
|
+
ModelProfile,
|
|
10
|
+
QualityEstimate,
|
|
11
|
+
QuantbenchError,
|
|
12
|
+
QuantFormat,
|
|
13
|
+
QuantMethod,
|
|
14
|
+
QuantProfile,
|
|
15
|
+
TensorInfo,
|
|
16
|
+
)
|
|
17
|
+
from quantbenchx.bandwidth import (
|
|
18
|
+
KNOWN_GPUS,
|
|
19
|
+
BandwidthEstimate,
|
|
20
|
+
BandwidthEstimator,
|
|
21
|
+
GPUSpec,
|
|
22
|
+
compare_gpus,
|
|
23
|
+
format_bandwidth_report,
|
|
24
|
+
)
|
|
25
|
+
from quantbenchx.compare import (
|
|
26
|
+
compare_formats,
|
|
27
|
+
compare_profiles,
|
|
28
|
+
)
|
|
29
|
+
from quantbenchx.imatrix import (
|
|
30
|
+
ImatrixAnalysis,
|
|
31
|
+
ImatrixData,
|
|
32
|
+
ImatrixEntry,
|
|
33
|
+
analyze_imatrix,
|
|
34
|
+
format_imatrix_report,
|
|
35
|
+
parse_imatrix,
|
|
36
|
+
)
|
|
37
|
+
from quantbenchx.layerwise import (
|
|
38
|
+
analyze_layers,
|
|
39
|
+
layer_sensitivity,
|
|
40
|
+
recommend_mixed_quant,
|
|
41
|
+
)
|
|
42
|
+
from quantbenchx.matrix import (
|
|
43
|
+
KNOWN_FORMATS,
|
|
44
|
+
ComparisonMatrix,
|
|
45
|
+
FormatComparison,
|
|
46
|
+
QuantFormatSpec,
|
|
47
|
+
format_comparison_table,
|
|
48
|
+
)
|
|
49
|
+
from quantbenchx.perplexity import (
|
|
50
|
+
PerplexityDelta,
|
|
51
|
+
PerplexityScore,
|
|
52
|
+
estimate_perplexity_delta,
|
|
53
|
+
format_quality_report,
|
|
54
|
+
perplexity_from_logprobs,
|
|
55
|
+
)
|
|
56
|
+
from quantbenchx.perplexity import (
|
|
57
|
+
quality_score as perplexity_quality_score,
|
|
58
|
+
)
|
|
59
|
+
from quantbenchx.predict import (
|
|
60
|
+
estimate_quality,
|
|
61
|
+
perplexity_delta,
|
|
62
|
+
)
|
|
63
|
+
from quantbenchx.profile import profile_from_dict, profile_gguf, profile_safetensors
|
|
64
|
+
from quantbenchx.recommend import (
|
|
65
|
+
Recommendation,
|
|
66
|
+
format_recommendation,
|
|
67
|
+
recommend,
|
|
68
|
+
)
|
|
69
|
+
from quantbenchx.report import (
|
|
70
|
+
format_markdown,
|
|
71
|
+
format_report_rich,
|
|
72
|
+
format_report_text,
|
|
73
|
+
load_json,
|
|
74
|
+
report_to_dict,
|
|
75
|
+
save_json,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
# Explicit public API: every name re-exported by the imports above.
# Kept roughly alphabetical (types first, then functions), with the
# matrix and bandwidth groups appended at the end.
__all__ = [
    "DType",
    "ImatrixAnalysis",
    "ImatrixData",
    "ImatrixEntry",
    "LayerInfo",
    "ModelProfile",
    "QuantFormat",
    "QuantMethod",
    "QuantProfile",
    "QuantbenchError",
    "QualityEstimate",
    "PerplexityDelta",
    "PerplexityScore",
    "Recommendation",
    "TensorInfo",
    "analyze_imatrix",
    "analyze_layers",
    "compare_formats",
    "compare_profiles",
    "estimate_perplexity_delta",
    "estimate_quality",
    "format_imatrix_report",
    "format_markdown",
    "format_quality_report",
    "format_recommendation",
    "format_report_rich",
    "format_report_text",
    "layer_sensitivity",
    "load_json",
    "parse_imatrix",
    "perplexity_delta",
    "perplexity_from_logprobs",
    "perplexity_quality_score",
    "profile_from_dict",
    "profile_gguf",
    "profile_safetensors",
    "recommend",
    "recommend_mixed_quant",
    "report_to_dict",
    "save_json",
    # matrix
    "ComparisonMatrix",
    "FormatComparison",
    "KNOWN_FORMATS",
    "QuantFormatSpec",
    "format_comparison_table",
    # bandwidth
    "BandwidthEstimate",
    "BandwidthEstimator",
    "GPUSpec",
    "KNOWN_GPUS",
    "compare_gpus",
    "format_bandwidth_report",
]
|
quantbenchx/_types.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Core types for quantbenchx."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QuantbenchError(Exception):
    """Base exception for quantbenchx.

    Package code should raise (and callers catch) subclasses of this type
    rather than bare :class:`Exception`.
    """
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class QuantFormat(Enum):
    """Quantization file format.

    Identifies the on-disk container a model came from; ``UNKNOWN`` is
    used when the format cannot be determined.
    """

    GGUF = "gguf"
    SAFETENSORS = "safetensors"
    PYTORCH = "pytorch"
    UNKNOWN = "unknown"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class QuantMethod(Enum):
    """Quantization method/algorithm.

    Mixes quantization algorithms (GPTQ, AWQ, ...) with plain storage
    precisions (FP16/BF16/FP32) so a single field can describe any model.
    """

    GPTQ = "gptq"
    AWQ = "awq"
    GGML = "ggml"
    BITSANDBYTES = "bitsandbytes"
    FP16 = "fp16"
    BF16 = "bf16"
    FP32 = "fp32"
    INT8 = "int8"
    INT4 = "int4"
    UNKNOWN = "unknown"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Approximate bits-per-weight per dtype value.  Hoisted to module level so
# the DType.bits_per_weight property does not rebuild the dict literal on
# every access.  Fractional values account for per-block quantization
# overhead (scales/mins) — NOTE(review): confirm against the upstream
# quantization format tables.
_BITS_PER_WEIGHT: Dict[str, float] = {
    "f32": 32.0, "f16": 16.0, "bf16": 16.0,
    "q8_0": 8.5, "q6_k": 6.5625, "q5_k_m": 5.5,
    "q5_k_s": 5.5, "q5_1": 5.5, "q5_0": 5.5,
    "q4_k_m": 4.85, "q4_k_s": 4.5, "q4_1": 4.5,
    "q4_0": 4.5, "q3_k_m": 3.9, "q3_k_s": 3.5,
    "q3_k_l": 4.3, "q2_k": 3.35,
    "iq4_xs": 4.25, "iq3_xxs": 3.06,
    "iq2_xxs": 2.06, "iq1_s": 1.56,
    "unknown": 16.0,
}


class DType(Enum):
    """Data type for tensors (full precision and quantized variants)."""

    F32 = "f32"
    F16 = "f16"
    BF16 = "bf16"
    Q8_0 = "q8_0"
    Q6_K = "q6_k"
    Q5_K_M = "q5_k_m"
    Q5_K_S = "q5_k_s"
    Q5_1 = "q5_1"
    Q5_0 = "q5_0"
    Q4_K_M = "q4_k_m"
    Q4_K_S = "q4_k_s"
    Q4_1 = "q4_1"
    Q4_0 = "q4_0"
    Q3_K_M = "q3_k_m"
    Q3_K_S = "q3_k_s"
    Q3_K_L = "q3_k_l"
    Q2_K = "q2_k"
    IQ4_XS = "iq4_xs"
    IQ3_XXS = "iq3_xxs"
    IQ2_XXS = "iq2_xxs"
    IQ1_S = "iq1_s"
    UNKNOWN = "unknown"

    @property
    def bits_per_weight(self) -> float:
        """Approximate bits per weight for this dtype.

        Falls back to 16.0 for any value missing from the table (same
        default as the original inline dict).
        """
        return _BITS_PER_WEIGHT.get(self.value, 16.0)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class TensorInfo:
    """Information about a single tensor in a model.

    ``n_elements`` and ``size_bytes`` may be supplied explicitly (e.g. read
    from a file header); when left at 0 they are derived in
    ``__post_init__`` from ``shape`` and the dtype's bits-per-weight.
    """

    name: str
    shape: List[int]
    dtype: DType
    n_elements: int = 0  # total element count; 0 → derived from shape
    size_bytes: int = 0  # stored size; 0 → derived from n_elements × bpw

    def __post_init__(self) -> None:
        # Derive the element count from the shape when not supplied.
        if self.n_elements == 0 and self.shape:
            self.n_elements = math.prod(self.shape)
        # Derive the byte size from the dtype's effective bits-per-weight
        # (truncated to whole bytes, matching the original int() cast).
        if self.size_bytes == 0 and self.n_elements > 0:
            self.size_bytes = int(self.n_elements * self.dtype.bits_per_weight / 8)

    @property
    def bits_per_weight(self) -> float:
        """Effective bits per weight, delegated to the tensor's dtype."""
        return self.dtype.bits_per_weight

    @property
    def compression_ratio(self) -> float:
        """Compression ratio vs FP32 (32-bit) storage."""
        return 32.0 / self.bits_per_weight if self.bits_per_weight > 0 else 1.0
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@dataclass
class LayerInfo:
    """Information about a model layer (group of tensors)."""

    name: str
    tensors: List[TensorInfo] = field(default_factory=list)

    @property
    def n_params(self) -> int:
        """Total number of elements across all tensors in the layer."""
        return sum(tensor.n_elements for tensor in self.tensors)

    @property
    def size_bytes(self) -> int:
        """Total stored size of the layer in bytes."""
        return sum(tensor.size_bytes for tensor in self.tensors)

    @property
    def avg_bits_per_weight(self) -> float:
        """Element-weighted mean bits-per-weight (0.0 for an empty layer)."""
        n_total = self.n_params
        if not n_total:
            return 0.0
        weighted_bits = sum(
            tensor.n_elements * tensor.bits_per_weight for tensor in self.tensors
        )
        return weighted_bits / n_total

    @property
    def dominant_dtype(self) -> DType:
        """The dtype accounting for the most elements (UNKNOWN if empty)."""
        if not self.tensors:
            return DType.UNKNOWN
        element_counts: Dict[DType, int] = {}
        for tensor in self.tensors:
            element_counts[tensor.dtype] = (
                element_counts.get(tensor.dtype, 0) + tensor.n_elements
            )
        return max(element_counts, key=element_counts.get)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class QuantProfile:
    """Quantization profile summary.

    Aggregated view of how a model is quantized.  All fields default to
    "unknown"/zero/empty so a profile can be built incrementally.
    """

    method: QuantMethod = QuantMethod.UNKNOWN
    avg_bits_per_weight: float = 0.0
    # Per-dtype share of the model keyed by dtype name — NOTE(review):
    # whether values are element fractions or byte fractions is set by the
    # producer (see profile.py); confirm before relying on it.
    dtype_distribution: Dict[str, float] = field(default_factory=dict)
    n_quantized_layers: int = 0
    n_full_precision_layers: int = 0
    group_size: int = 0  # quantization group size; 0 when unknown/not applicable
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
class ModelProfile:
    """Full model profile with all tensor/layer information."""

    name: str
    format: QuantFormat
    total_params: int = 0
    total_size_bytes: int = 0
    tensors: List[TensorInfo] = field(default_factory=list)
    layers: List[LayerInfo] = field(default_factory=list)
    quant: QuantProfile = field(default_factory=QuantProfile)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def size_gb(self) -> float:
        """Model size in GiB (0.0 when the byte count is unset)."""
        if self.total_size_bytes <= 0:
            return 0.0
        return self.total_size_bytes / (1024 ** 3)

    @property
    def compression_ratio(self) -> float:
        """Size reduction relative to storing every parameter as FP32."""
        if self.total_size_bytes <= 0:
            return 1.0
        return (self.total_params * 4) / self.total_size_bytes

    def to_dict(self) -> Dict[str, Any]:
        """Serialize a JSON-friendly summary (tensor/layer lists become counts)."""
        quant_summary: Dict[str, Any] = {
            "method": self.quant.method.value,
            "avg_bits_per_weight": round(self.quant.avg_bits_per_weight, 2),
            "dtype_distribution": self.quant.dtype_distribution,
            "n_quantized_layers": self.quant.n_quantized_layers,
            "n_full_precision_layers": self.quant.n_full_precision_layers,
        }
        return {
            "name": self.name,
            "format": self.format.value,
            "total_params": self.total_params,
            "total_size_bytes": self.total_size_bytes,
            "size_gb": round(self.size_gb, 3),
            "compression_ratio": round(self.compression_ratio, 2),
            "quant": quant_summary,
            "n_tensors": len(self.tensors),
            "n_layers": len(self.layers),
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> ModelProfile:
        """Rebuild a profile from :meth:`to_dict` output.

        Tensor and layer details are not serialized, so the result carries
        empty ``tensors``/``layers`` lists.
        """
        raw_quant = d.get("quant", {})
        return cls(
            name=d["name"],
            format=QuantFormat(d.get("format", "unknown")),
            total_params=d.get("total_params", 0),
            total_size_bytes=d.get("total_size_bytes", 0),
            quant=QuantProfile(
                method=QuantMethod(raw_quant.get("method", "unknown")),
                avg_bits_per_weight=raw_quant.get("avg_bits_per_weight", 0.0),
                dtype_distribution=raw_quant.get("dtype_distribution", {}),
                n_quantized_layers=raw_quant.get("n_quantized_layers", 0),
                n_full_precision_layers=raw_quant.get("n_full_precision_layers", 0),
            ),
            metadata=d.get("metadata", {}),
        )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@dataclass
class QualityEstimate:
    """Estimated quality loss from quantization."""

    model_name: str
    method: str  # quantization method name (string form, not the enum)
    avg_bits_per_weight: float
    # Predicted perplexity change vs the unquantized model — NOTE(review):
    # sign/scale convention set by the producer in predict.py; confirm.
    estimated_perplexity_delta: float
    quality_score: float  # 0-1, higher = better
    risk_level: str  # low, medium, high, critical
    sensitive_layers: List[str] = field(default_factory=list)  # layer names flagged as sensitive
    recommendations: List[str] = field(default_factory=list)  # human-readable suggestions
|
quantbenchx/bandwidth.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""Memory bandwidth estimation and roofline analysis for quantized models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class GPUSpec:
    """Specification of a GPU's memory and compute capabilities.

    Values are vendor-datasheet peaks (see :data:`KNOWN_GPUS`); real
    workloads typically sustain only a fraction of them.
    """

    name: str
    memory_bandwidth_gbps: float  # GB/s
    memory_gb: float  # Total VRAM in GB
    compute_tflops: float  # Peak FP16/BF16 TFLOPS
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class BandwidthEstimate:
    """Result of a bandwidth estimation."""

    model_size_gb: float  # size of the weights being moved
    gpu: str  # human-readable GPU name
    transfer_time_ms: float  # estimated time for one transfer / decode step
    is_memory_bound: bool  # True when bandwidth (not compute) is the limit
    arithmetic_intensity: float  # FLOP/byte
    achievable_tflops: float  # roofline-limited throughput
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Pre-defined GPU specs (peak values from vendor datasheets)
# Keys are short CLI/config-friendly identifiers; each entry records peak
# memory bandwidth (GB/s), total VRAM (GB) and peak FP16/BF16 TFLOPS.
KNOWN_GPUS: Dict[str, GPUSpec] = {
    "H100_SXM": GPUSpec(
        name="H100 SXM",
        memory_bandwidth_gbps=3350.0,
        memory_gb=80.0,
        compute_tflops=989.5,
    ),
    "H100_PCIe": GPUSpec(
        name="H100 PCIe",
        memory_bandwidth_gbps=2039.0,
        memory_gb=80.0,
        compute_tflops=756.5,
    ),
    "A100_80GB": GPUSpec(
        name="A100 80GB SXM",
        memory_bandwidth_gbps=2039.0,
        memory_gb=80.0,
        compute_tflops=312.0,
    ),
    "A100_40GB": GPUSpec(
        name="A100 40GB",
        memory_bandwidth_gbps=1555.0,
        memory_gb=40.0,
        compute_tflops=312.0,
    ),
    "A10G": GPUSpec(
        name="A10G",
        memory_bandwidth_gbps=600.0,
        memory_gb=24.0,
        compute_tflops=125.0,
    ),
    "L4": GPUSpec(
        name="L4",
        memory_bandwidth_gbps=300.0,
        memory_gb=24.0,
        compute_tflops=121.0,
    ),
    "T4": GPUSpec(
        name="T4",
        memory_bandwidth_gbps=300.0,
        memory_gb=16.0,
        compute_tflops=65.0,
    ),
    "RTX_4090": GPUSpec(
        name="RTX 4090",
        memory_bandwidth_gbps=1008.0,
        memory_gb=24.0,
        compute_tflops=330.0,
    ),
    "V100": GPUSpec(
        name="V100",
        memory_bandwidth_gbps=900.0,
        memory_gb=16.0,
        compute_tflops=125.0,
    ),
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class BandwidthEstimator:
    """Estimate memory bandwidth requirements for model inference.

    All estimates evaluate a simple roofline model against one GPU's peak
    memory bandwidth and peak compute (see :class:`GPUSpec`).  The shared
    roofline math previously duplicated between ``estimate_inference`` and
    ``roofline_analysis`` now lives in the private ``_roofline`` helper.
    """

    def __init__(self, gpu: GPUSpec) -> None:
        """Bind the estimator to a single GPU spec."""
        self.gpu = gpu

    def _ridge_point(self) -> float:
        """Roofline ridge point in FLOP/byte.

        Workloads with arithmetic intensity below this value are
        memory-bound on this GPU; above it they are compute-bound.
        """
        return (self.gpu.compute_tflops * 1e12) / (
            self.gpu.memory_bandwidth_gbps * 1e9
        )

    def _roofline(
        self, bytes_read: float, flops: float
    ) -> tuple[bool, float, float, float]:
        """Shared roofline math for one workload.

        Returns ``(is_memory_bound, arithmetic_intensity,
        achievable_tflops, time_s)`` for a workload that reads
        *bytes_read* bytes from memory while performing *flops*
        floating-point operations.
        """
        arithmetic_intensity = flops / bytes_read if bytes_read > 0 else 0.0
        is_memory_bound = arithmetic_intensity < self._ridge_point()
        if is_memory_bound:
            # Bandwidth-limited: TFLOPS = AI × bandwidth, time = bytes / bandwidth.
            achievable_tflops = (
                arithmetic_intensity * self.gpu.memory_bandwidth_gbps * 1e9
            ) / 1e12
            time_s = bytes_read / (self.gpu.memory_bandwidth_gbps * 1e9)
        else:
            # Compute-limited: run at peak, time = flops / peak FLOP rate.
            achievable_tflops = self.gpu.compute_tflops
            time_s = flops / (self.gpu.compute_tflops * 1e12)
        return is_memory_bound, arithmetic_intensity, achievable_tflops, time_s

    def estimate_transfer(self, model_size_gb: float) -> BandwidthEstimate:
        """Estimate time to transfer model weights from HBM once.

        This is the lower bound for a single forward pass in a memory-bound
        regime — every weight must be read at least once.

        Raises:
            ValueError: if *model_size_gb* is not positive.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")

        transfer_time_s = model_size_gb / self.gpu.memory_bandwidth_gbps
        transfer_time_ms = transfer_time_s * 1000.0

        # A pure weight-read has zero compute → always memory-bound
        return BandwidthEstimate(
            model_size_gb=model_size_gb,
            gpu=self.gpu.name,
            transfer_time_ms=round(transfer_time_ms, 4),
            is_memory_bound=True,
            arithmetic_intensity=0.0,
            achievable_tflops=0.0,
        )

    def estimate_inference(
        self,
        model_size_gb: float,
        batch_size: int = 1,
        seq_length: int = 512,
    ) -> BandwidthEstimate:
        """Estimate inference bandwidth characteristics.

        Uses a simplified model:
        - Bytes transferred ≈ model_size (weights read once per token)
        - FLOPs per token ≈ 2 × parameters (one multiply + one add per param)
        - Parameters estimated from model_size assuming 2 bytes/param (fp16 baseline).

        *seq_length* is validated but does not otherwise affect the
        estimate (KV-cache traffic is ignored by this model).

        Raises:
            ValueError: on non-positive size or batch/seq values below 1.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if seq_length < 1:
            raise ValueError("seq_length must be >= 1")

        # NOTE(review): size is converted with binary GiB while bandwidth
        # uses decimal GB/s — preserved from the original implementation;
        # confirm the intended unit mix.
        model_size_bytes = model_size_gb * (1024 ** 3)

        # Estimate parameter count (assume 2 bytes / param as fp16 baseline)
        est_params = model_size_bytes / 2.0
        # FLOPs per token ≈ 2 × params (matmul dominated), scaled by batch.
        total_flops = 2.0 * est_params * batch_size
        # Weights are read from memory once per token-generation step.
        bytes_read = model_size_bytes

        is_memory_bound, ai, achievable_tflops, time_s = self._roofline(
            bytes_read, total_flops
        )

        return BandwidthEstimate(
            model_size_gb=model_size_gb,
            gpu=self.gpu.name,
            transfer_time_ms=round(time_s * 1000.0, 4),
            is_memory_bound=is_memory_bound,
            arithmetic_intensity=round(ai, 4),
            achievable_tflops=round(achievable_tflops, 4),
        )

    def roofline_analysis(
        self,
        model_size_gb: float,
        flops_per_token: float,
    ) -> Dict[str, object]:
        """Perform roofline analysis for a given workload.

        Returns a dict with ridge point, operational intensity, and
        whether the workload is compute- or memory-bound.

        Raises:
            ValueError: if either argument is not positive.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        if flops_per_token <= 0:
            raise ValueError("flops_per_token must be positive")

        model_size_bytes = model_size_gb * (1024 ** 3)
        is_memory_bound, ai, achievable_tflops, time_s = self._roofline(
            model_size_bytes, flops_per_token
        )
        tokens_per_second = 1.0 / time_s if time_s > 0 else 0.0

        return {
            "gpu": self.gpu.name,
            "model_size_gb": model_size_gb,
            "ridge_point": round(self._ridge_point(), 4),
            "arithmetic_intensity": round(ai, 4),
            "is_memory_bound": is_memory_bound,
            "achievable_tflops": round(achievable_tflops, 4),
            "time_per_token_ms": round(time_s * 1000.0, 4),
            "tokens_per_second": round(tokens_per_second, 2),
        }

    def fits_in_memory(self, model_size_gb: float) -> bool:
        """Check if the model fits in a single GPU's memory."""
        return model_size_gb <= self.gpu.memory_gb

    def required_gpus(self, model_size_gb: float) -> int:
        """Minimum number of GPUs required to hold the model weights.

        Raises:
            ValueError: if *model_size_gb* is not positive.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        return math.ceil(model_size_gb / self.gpu.memory_gb)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def compare_gpus(
    model_size_gb: float,
    gpus: Optional[List[GPUSpec]] = None,
) -> List[BandwidthEstimate]:
    """Compare bandwidth estimates across multiple GPUs.

    If *gpus* is ``None``, all :data:`KNOWN_GPUS` are used.
    """
    candidates = list(KNOWN_GPUS.values()) if gpus is None else gpus
    return [
        BandwidthEstimator(gpu).estimate_inference(model_size_gb)
        for gpu in candidates
    ]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def format_bandwidth_report(estimates: List[BandwidthEstimate]) -> str:
    """Format a list of bandwidth estimates as a human-readable report.

    Rows are sorted fastest-first by per-step time.
    """
    if not estimates:
        return "No estimates to report."

    header = f"{'GPU':<20} {'Time/tok(ms)':>13} {'Bound':>8} {'AI':>8} {'TFLOPS':>8}"
    lines: List[str] = [
        "Bandwidth Estimation Report",
        "=" * 60,
        f"Model size: {estimates[0].model_size_gb:.2f} GB",
        "",
        header,
        "-" * 60,
    ]

    for entry in sorted(estimates, key=lambda e: e.transfer_time_ms):
        regime = "MEM" if entry.is_memory_bound else "COMP"
        lines.append(
            f"{entry.gpu:<20} {entry.transfer_time_ms:>13.4f} {regime:>8} "
            f"{entry.arithmetic_intensity:>8.2f} {entry.achievable_tflops:>8.2f}"
        )

    lines.append("")
    return "\n".join(lines)
|