quantbenchx-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,132 @@
1
"""quantbenchx — Quantization quality analyzer.

Package entry point: re-exports the public API of every subpackage.
The supported surface is exactly the names listed in ``__all__``.
"""

from __future__ import annotations

__version__ = "0.3.0"

from quantbenchx._types import (
    DType,
    LayerInfo,
    ModelProfile,
    QualityEstimate,
    QuantbenchError,
    QuantFormat,
    QuantMethod,
    QuantProfile,
    TensorInfo,
)
from quantbenchx.bandwidth import (
    KNOWN_GPUS,
    BandwidthEstimate,
    BandwidthEstimator,
    GPUSpec,
    compare_gpus,
    format_bandwidth_report,
)
from quantbenchx.compare import (
    compare_formats,
    compare_profiles,
)
from quantbenchx.imatrix import (
    ImatrixAnalysis,
    ImatrixData,
    ImatrixEntry,
    analyze_imatrix,
    format_imatrix_report,
    parse_imatrix,
)
from quantbenchx.layerwise import (
    analyze_layers,
    layer_sensitivity,
    recommend_mixed_quant,
)
from quantbenchx.matrix import (
    KNOWN_FORMATS,
    ComparisonMatrix,
    FormatComparison,
    QuantFormatSpec,
    format_comparison_table,
)
from quantbenchx.perplexity import (
    PerplexityDelta,
    PerplexityScore,
    estimate_perplexity_delta,
    format_quality_report,
    perplexity_from_logprobs,
    quality_score as perplexity_quality_score,
)
from quantbenchx.predict import (
    estimate_quality,
    perplexity_delta,
)
from quantbenchx.profile import profile_from_dict, profile_gguf, profile_safetensors
from quantbenchx.recommend import (
    Recommendation,
    format_recommendation,
    recommend,
)
from quantbenchx.report import (
    format_markdown,
    format_report_rich,
    format_report_text,
    load_json,
    report_to_dict,
    save_json,
)

__all__ = [
    "DType",
    "ImatrixAnalysis",
    "ImatrixData",
    "ImatrixEntry",
    "LayerInfo",
    "ModelProfile",
    "QuantFormat",
    "QuantMethod",
    "QuantProfile",
    "QuantbenchError",
    "QualityEstimate",
    "PerplexityDelta",
    "PerplexityScore",
    "Recommendation",
    "TensorInfo",
    "analyze_imatrix",
    "analyze_layers",
    "compare_formats",
    "compare_profiles",
    "estimate_perplexity_delta",
    "estimate_quality",
    "format_imatrix_report",
    "format_markdown",
    "format_quality_report",
    "format_recommendation",
    "format_report_rich",
    "format_report_text",
    "layer_sensitivity",
    "load_json",
    "parse_imatrix",
    "perplexity_delta",
    "perplexity_from_logprobs",
    "perplexity_quality_score",
    "profile_from_dict",
    "profile_gguf",
    "profile_safetensors",
    "recommend",
    "recommend_mixed_quant",
    "report_to_dict",
    "save_json",
    # matrix
    "ComparisonMatrix",
    "FormatComparison",
    "KNOWN_FORMATS",
    "QuantFormatSpec",
    "format_comparison_table",
    # bandwidth
    "BandwidthEstimate",
    "BandwidthEstimator",
    "GPUSpec",
    "KNOWN_GPUS",
    "compare_gpus",
    "format_bandwidth_report",
]
quantbenchx/_types.py ADDED
@@ -0,0 +1,220 @@
1
+ """Core types for quantbenchx."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from enum import Enum
7
+ from typing import Any, Dict, List
8
+
9
+
10
class QuantbenchError(Exception):
    """Root of the quantbenchx exception hierarchy.

    Catching this single type handles any error the package raises.
    """
12
+
13
+
14
class QuantFormat(Enum):
    """On-disk serialization format of a model checkpoint."""

    GGUF = "gguf"
    SAFETENSORS = "safetensors"
    PYTORCH = "pytorch"
    # Fallback when the container format cannot be identified.
    UNKNOWN = "unknown"
20
+
21
+
22
class QuantMethod(Enum):
    """Quantization method/algorithm used to produce a checkpoint.

    Mixes true algorithms (gptq, awq, ggml, bitsandbytes) with plain
    storage precisions (fp16/bf16/fp32/int8/int4).
    """

    GPTQ = "gptq"
    AWQ = "awq"
    GGML = "ggml"
    BITSANDBYTES = "bitsandbytes"
    FP16 = "fp16"
    BF16 = "bf16"
    FP32 = "fp32"
    INT8 = "int8"
    INT4 = "int4"
    # Fallback when the method cannot be identified.
    UNKNOWN = "unknown"
34
+
35
+
36
class DType(Enum):
    """Tensor storage data type, including GGUF-style quantization codes."""

    F32 = "f32"
    F16 = "f16"
    BF16 = "bf16"
    Q8_0 = "q8_0"
    Q6_K = "q6_k"
    Q5_K_M = "q5_k_m"
    Q5_K_S = "q5_k_s"
    Q5_1 = "q5_1"
    Q5_0 = "q5_0"
    Q4_K_M = "q4_k_m"
    Q4_K_S = "q4_k_s"
    Q4_1 = "q4_1"
    Q4_0 = "q4_0"
    Q3_K_M = "q3_k_m"
    Q3_K_S = "q3_k_s"
    Q3_K_L = "q3_k_l"
    Q2_K = "q2_k"
    IQ4_XS = "iq4_xs"
    IQ3_XXS = "iq3_xxs"
    IQ2_XXS = "iq2_xxs"
    IQ1_S = "iq1_s"
    # Fallback when the dtype cannot be identified.
    UNKNOWN = "unknown"

    @property
    def bits_per_weight(self) -> float:
        """Approximate storage cost of this dtype, in bits per weight.

        The q*/iq* figures sit slightly above their nominal bit widths
        (presumably per-block scale/metadata overhead — values match the
        table below, not a formula). Unrecognized values fall back to
        16.0, i.e. an fp16 assumption.
        """
        # NOTE: kept inside the property on purpose — a plain dict in an
        # Enum class body would itself be turned into an enum member.
        table = {
            "f32": 32.0,
            "f16": 16.0,
            "bf16": 16.0,
            "q8_0": 8.5,
            "q6_k": 6.5625,
            "q5_k_m": 5.5,
            "q5_k_s": 5.5,
            "q5_1": 5.5,
            "q5_0": 5.5,
            "q4_k_m": 4.85,
            "q4_k_s": 4.5,
            "q4_1": 4.5,
            "q4_0": 4.5,
            "q3_k_m": 3.9,
            "q3_k_s": 3.5,
            "q3_k_l": 4.3,
            "q2_k": 3.35,
            "iq4_xs": 4.25,
            "iq3_xxs": 3.06,
            "iq2_xxs": 2.06,
            "iq1_s": 1.56,
            "unknown": 16.0,
        }
        return table.get(self.value, 16.0)
76
+
77
+
78
@dataclass
class TensorInfo:
    """Metadata for a single tensor in a model checkpoint.

    ``n_elements`` and ``size_bytes`` may be passed explicitly; when left
    at 0 they are derived from ``shape`` and ``dtype`` in ``__post_init__``.
    """

    name: str
    shape: List[int]
    dtype: DType
    n_elements: int = 0
    size_bytes: int = 0

    def __post_init__(self) -> None:
        # Derive the element count from the shape unless it was provided.
        if self.n_elements == 0 and self.shape:
            count = 1
            for dim in self.shape:
                count *= dim
            self.n_elements = count
        # Derive the byte size from the dtype's bits-per-weight estimate.
        if self.size_bytes == 0 and self.n_elements > 0:
            total_bits = self.n_elements * self.dtype.bits_per_weight
            self.size_bytes = int(total_bits / 8)

    @property
    def bits_per_weight(self) -> float:
        """Bits per weight implied by this tensor's dtype."""
        return self.dtype.bits_per_weight

    @property
    def compression_ratio(self) -> float:
        """Size reduction factor relative to FP32 (32-bit) storage."""
        if self.bits_per_weight > 0:
            return 32.0 / self.bits_per_weight
        return 1.0
104
+
105
+
106
@dataclass
class LayerInfo:
    """A named group of tensors treated as one model layer."""

    name: str
    tensors: List[TensorInfo] = field(default_factory=list)

    @property
    def n_params(self) -> int:
        """Total element count across all tensors in the layer."""
        return sum(tensor.n_elements for tensor in self.tensors)

    @property
    def size_bytes(self) -> int:
        """Total on-disk byte size across all tensors in the layer."""
        return sum(tensor.size_bytes for tensor in self.tensors)

    @property
    def avg_bits_per_weight(self) -> float:
        """Element-weighted mean of bits/weight; 0.0 for an empty layer."""
        total = self.n_params
        if total == 0:
            return 0.0
        weighted_bits = sum(
            tensor.n_elements * tensor.bits_per_weight for tensor in self.tensors
        )
        return weighted_bits / total

    @property
    def dominant_dtype(self) -> DType:
        """Dtype holding the largest share of the layer's elements.

        Returns ``DType.UNKNOWN`` when the layer has no tensors.
        """
        if not self.tensors:
            return DType.UNKNOWN
        elements_by_dtype: Dict[DType, int] = {}
        for tensor in self.tensors:
            previous = elements_by_dtype.get(tensor.dtype, 0)
            elements_by_dtype[tensor.dtype] = previous + tensor.n_elements
        return max(elements_by_dtype.items(), key=lambda kv: kv[1])[0]
136
+
137
+
138
@dataclass
class QuantProfile:
    """Summary of how a model checkpoint is quantized."""

    # Quantization algorithm detected for the checkpoint.
    method: QuantMethod = QuantMethod.UNKNOWN
    # Element-weighted average bits per weight across tensors.
    avg_bits_per_weight: float = 0.0
    # Mapping of dtype name -> share of the model (presumably a fraction
    # of elements — confirm against the profiler that fills this in).
    dtype_distribution: Dict[str, float] = field(default_factory=dict)
    n_quantized_layers: int = 0
    n_full_precision_layers: int = 0
    # Quantization group size; 0 when unknown or not applicable.
    group_size: int = 0
147
+
148
+
149
@dataclass
class ModelProfile:
    """Complete profile of a model: tensors, layers, and quantization summary."""

    name: str
    format: QuantFormat
    total_params: int = 0
    total_size_bytes: int = 0
    tensors: List[TensorInfo] = field(default_factory=list)
    layers: List[LayerInfo] = field(default_factory=list)
    quant: QuantProfile = field(default_factory=QuantProfile)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def size_gb(self) -> float:
        """Model size in GiB; 0.0 when the byte size is unknown."""
        if self.total_size_bytes <= 0:
            return 0.0
        return self.total_size_bytes / (1024 ** 3)

    @property
    def compression_ratio(self) -> float:
        """Ratio of the FP32 footprint (4 bytes/param) to the actual size.

        Returns 1.0 when the actual size is unknown.
        """
        if self.total_size_bytes <= 0:
            return 1.0
        return (self.total_params * 4) / self.total_size_bytes

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a JSON-friendly dict.

        Per-tensor / per-layer details are summarized as counts only.
        """
        quant_section = {
            "method": self.quant.method.value,
            "avg_bits_per_weight": round(self.quant.avg_bits_per_weight, 2),
            "dtype_distribution": self.quant.dtype_distribution,
            "n_quantized_layers": self.quant.n_quantized_layers,
            "n_full_precision_layers": self.quant.n_full_precision_layers,
        }
        return {
            "name": self.name,
            "format": self.format.value,
            "total_params": self.total_params,
            "total_size_bytes": self.total_size_bytes,
            "size_gb": round(self.size_gb, 3),
            "compression_ratio": round(self.compression_ratio, 2),
            "quant": quant_section,
            "n_tensors": len(self.tensors),
            "n_layers": len(self.layers),
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> ModelProfile:
        """Rebuild a profile from ``to_dict()`` output.

        Tensor and layer lists are not round-tripped; only the summary
        fields are restored. Raises ``KeyError`` when ``"name"`` is
        missing and ``ValueError`` for unrecognized format/method values.
        """
        quant_section = d.get("quant", {})
        restored_quant = QuantProfile(
            method=QuantMethod(quant_section.get("method", "unknown")),
            avg_bits_per_weight=quant_section.get("avg_bits_per_weight", 0.0),
            dtype_distribution=quant_section.get("dtype_distribution", {}),
            n_quantized_layers=quant_section.get("n_quantized_layers", 0),
            n_full_precision_layers=quant_section.get("n_full_precision_layers", 0),
        )
        return cls(
            name=d["name"],
            format=QuantFormat(d.get("format", "unknown")),
            total_params=d.get("total_params", 0),
            total_size_bytes=d.get("total_size_bytes", 0),
            quant=restored_quant,
            metadata=d.get("metadata", {}),
        )
208
+
209
+
210
@dataclass
class QualityEstimate:
    """Estimated quality impact of a quantization scheme."""

    model_name: str
    method: str
    avg_bits_per_weight: float
    estimated_perplexity_delta: float
    # Normalized 0-1 score; higher means better retained quality.
    quality_score: float
    # One of: low, medium, high, critical.
    risk_level: str
    # Layer names judged most sensitive to quantization error.
    sensitive_layers: List[str] = field(default_factory=list)
    # Human-readable suggestions for improving the result.
    recommendations: List[str] = field(default_factory=list)
@@ -0,0 +1,290 @@
1
+ """Memory bandwidth estimation and roofline analysis for quantized models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List, Optional
8
+
9
+
10
@dataclass
class GPUSpec:
    """Peak memory and compute characteristics of one GPU model."""

    name: str
    # Peak memory bandwidth in GB/s (vendor datasheet figure).
    memory_bandwidth_gbps: float
    # Total device memory in GB.
    memory_gb: float
    # Peak FP16/BF16 throughput in TFLOPS.
    compute_tflops: float
18
+
19
+
20
@dataclass
class BandwidthEstimate:
    """Outcome of a bandwidth/roofline estimation for one GPU."""

    model_size_gb: float
    # Display name of the GPU the estimate applies to.
    gpu: str
    # Estimated time per step in milliseconds.
    transfer_time_ms: float
    # True when bandwidth, not compute, is the bottleneck.
    is_memory_bound: bool
    # Arithmetic intensity of the workload in FLOP/byte.
    arithmetic_intensity: float
    achievable_tflops: float
30
+
31
+
32
# Pre-defined GPU specs (peak values from vendor datasheets).
# Positional fields: name, memory_bandwidth_gbps, memory_gb, compute_tflops.
KNOWN_GPUS: Dict[str, GPUSpec] = {
    "H100_SXM": GPUSpec("H100 SXM", 3350.0, 80.0, 989.5),
    "H100_PCIe": GPUSpec("H100 PCIe", 2039.0, 80.0, 756.5),
    "A100_80GB": GPUSpec("A100 80GB SXM", 2039.0, 80.0, 312.0),
    "A100_40GB": GPUSpec("A100 40GB", 1555.0, 40.0, 312.0),
    "A10G": GPUSpec("A10G", 600.0, 24.0, 125.0),
    "L4": GPUSpec("L4", 300.0, 24.0, 121.0),
    "T4": GPUSpec("T4", 300.0, 16.0, 65.0),
    "RTX_4090": GPUSpec("RTX 4090", 1008.0, 24.0, 330.0),
    "V100": GPUSpec("V100", 900.0, 16.0, 125.0),
}
89
+
90
+
91
class BandwidthEstimator:
    """Roofline-style bandwidth/latency estimator for a single GPU.

    All estimates assume every model weight is streamed from device
    memory once per decoding step.
    """

    def __init__(self, gpu: GPUSpec) -> None:
        self.gpu = gpu

    def _ridge(self) -> float:
        # Roofline ridge point (FLOP/byte): the arithmetic intensity at
        # which peak compute and peak bandwidth balance.
        return (self.gpu.compute_tflops * 1e12) / (
            self.gpu.memory_bandwidth_gbps * 1e9
        )

    def estimate_transfer(self, model_size_gb: float) -> BandwidthEstimate:
        """Lower-bound time to stream the weights from memory once.

        A bare weight read performs no arithmetic, so the result is
        always memory-bound with zero arithmetic intensity.
        Raises ValueError when *model_size_gb* is not positive.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")

        elapsed_ms = (model_size_gb / self.gpu.memory_bandwidth_gbps) * 1000.0
        return BandwidthEstimate(
            model_size_gb=model_size_gb,
            gpu=self.gpu.name,
            transfer_time_ms=round(elapsed_ms, 4),
            is_memory_bound=True,
            arithmetic_intensity=0.0,
            achievable_tflops=0.0,
        )

    def estimate_inference(
        self,
        model_size_gb: float,
        batch_size: int = 1,
        seq_length: int = 512,
    ) -> BandwidthEstimate:
        """Estimate per-step bandwidth characteristics for generation.

        Simplified model: weights are read once per step, FLOPs/token is
        2 x params, and params ~= size / 2 bytes (fp16 baseline).
        NOTE(review): *seq_length* is validated but does not influence
        the estimate in the current model — confirm intended semantics.
        Raises ValueError for non-positive size or out-of-range args.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        if seq_length < 1:
            raise ValueError("seq_length must be >= 1")

        size_bytes = model_size_gb * (1024 ** 3)
        approx_params = size_bytes / 2.0  # fp16 baseline: 2 bytes/param
        # One multiply + one add per parameter, scaled by batch.
        step_flops = (2.0 * approx_params) * batch_size
        intensity = step_flops / size_bytes if size_bytes > 0 else 0.0

        memory_bound = intensity < self._ridge()
        if memory_bound:
            peak = (intensity * self.gpu.memory_bandwidth_gbps * 1e9) / 1e12
            step_seconds = size_bytes / (self.gpu.memory_bandwidth_gbps * 1e9)
        else:
            peak = self.gpu.compute_tflops
            step_seconds = step_flops / (self.gpu.compute_tflops * 1e12)

        return BandwidthEstimate(
            model_size_gb=model_size_gb,
            gpu=self.gpu.name,
            transfer_time_ms=round(step_seconds * 1000.0, 4),
            is_memory_bound=memory_bound,
            arithmetic_intensity=round(intensity, 4),
            achievable_tflops=round(peak, 4),
        )

    def roofline_analysis(
        self,
        model_size_gb: float,
        flops_per_token: float,
    ) -> Dict[str, object]:
        """Classify a workload as compute- or memory-bound.

        Returns a dict containing the ridge point, the workload's
        arithmetic intensity, achievable TFLOPS, and derived per-token
        timing. Raises ValueError for non-positive inputs.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        if flops_per_token <= 0:
            raise ValueError("flops_per_token must be positive")

        size_bytes = model_size_gb * (1024 ** 3)
        intensity = flops_per_token / size_bytes
        ridge = self._ridge()
        memory_bound = intensity < ridge

        if memory_bound:
            peak = (intensity * self.gpu.memory_bandwidth_gbps * 1e9) / 1e12
            token_seconds = size_bytes / (self.gpu.memory_bandwidth_gbps * 1e9)
        else:
            peak = self.gpu.compute_tflops
            token_seconds = flops_per_token / (self.gpu.compute_tflops * 1e12)

        throughput = 1.0 / token_seconds if token_seconds > 0 else 0.0

        return {
            "gpu": self.gpu.name,
            "model_size_gb": model_size_gb,
            "ridge_point": round(ridge, 4),
            "arithmetic_intensity": round(intensity, 4),
            "is_memory_bound": memory_bound,
            "achievable_tflops": round(peak, 4),
            "time_per_token_ms": round(token_seconds * 1000.0, 4),
            "tokens_per_second": round(throughput, 2),
        }

    def fits_in_memory(self, model_size_gb: float) -> bool:
        """True when the weights fit on this single GPU's memory."""
        return model_size_gb <= self.gpu.memory_gb

    def required_gpus(self, model_size_gb: float) -> int:
        """Minimum GPU count needed to hold the model weights.

        Raises ValueError when *model_size_gb* is not positive.
        """
        if model_size_gb <= 0:
            raise ValueError("model_size_gb must be positive")
        return math.ceil(model_size_gb / self.gpu.memory_gb)
244
+
245
+
246
def compare_gpus(
    model_size_gb: float,
    gpus: Optional[List[GPUSpec]] = None,
) -> List[BandwidthEstimate]:
    """Run an inference estimate for *model_size_gb* on each GPU.

    When *gpus* is ``None``, every entry of :data:`KNOWN_GPUS` is used.
    """
    candidates = list(KNOWN_GPUS.values()) if gpus is None else gpus
    return [
        BandwidthEstimator(spec).estimate_inference(model_size_gb)
        for spec in candidates
    ]
263
+
264
+
265
def format_bandwidth_report(estimates: List[BandwidthEstimate]) -> str:
    """Render *estimates* as a fixed-width text table, fastest first.

    The model-size line is taken from the first estimate, so every entry
    is expected to describe the same model. An empty input yields a
    short placeholder string.
    """
    if not estimates:
        return "No estimates to report."

    out: List[str] = [
        "Bandwidth Estimation Report",
        "=" * 60,
        f"Model size: {estimates[0].model_size_gb:.2f} GB",
        "",
        f"{'GPU':<20} {'Time/tok(ms)':>13} {'Bound':>8} {'AI':>8} {'TFLOPS':>8}",
        "-" * 60,
    ]

    for item in sorted(estimates, key=lambda e: e.transfer_time_ms):
        regime = "MEM" if item.is_memory_bound else "COMP"
        out.append(
            f"{item.gpu:<20} {item.transfer_time_ms:>13.4f} {regime:>8} "
            f"{item.arithmetic_intensity:>8.2f} {item.achievable_tflops:>8.2f}"
        )

    out.append("")
    return "\n".join(out)