mfu-tracker 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mfu_tracker/__init__.py +18 -0
- mfu_tracker/flops.py +157 -0
- mfu_tracker/gpu.py +124 -0
- mfu_tracker/integrations/__init__.py +3 -0
- mfu_tracker/integrations/hf_trainer.py +152 -0
- mfu_tracker/optim.py +155 -0
- mfu_tracker/tracker.py +211 -0
- mfu_tracker-0.1.0.dist-info/METADATA +235 -0
- mfu_tracker-0.1.0.dist-info/RECORD +11 -0
- mfu_tracker-0.1.0.dist-info/WHEEL +4 -0
- mfu_tracker-0.1.0.dist-info/licenses/LICENSE +21 -0
mfu_tracker/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""mfu-tracker: lightweight MFU and MBU tracking for PyTorch models."""
|
|
2
|
+
from .flops import flash_attn_flops, param_bytes, profile_flops
|
|
3
|
+
from .gpu import GPUSpec, get_gpu_spec
|
|
4
|
+
from .optim import MFUOptimizerWrapper
|
|
5
|
+
from .tracker import UtilizationResult, compute_mbu, compute_mfu, track
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"track",
|
|
9
|
+
"compute_mfu",
|
|
10
|
+
"compute_mbu",
|
|
11
|
+
"profile_flops",
|
|
12
|
+
"flash_attn_flops",
|
|
13
|
+
"param_bytes",
|
|
14
|
+
"get_gpu_spec",
|
|
15
|
+
"GPUSpec",
|
|
16
|
+
"UtilizationResult",
|
|
17
|
+
"MFUOptimizerWrapper",
|
|
18
|
+
]
|
mfu_tracker/flops.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""FLOP counting — FlopCounterMode (PyTorch 2.1+) with thop fallback."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
import torch.nn as nn
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def flash_attn_flops(
    batch: int,
    seq_len: int,
    num_heads: int,
    head_dim: int,
    *,
    causal: bool = True,
    with_backward: bool = False,
) -> int:
    """
    Return an analytical FLOP estimate for a single flash-attention call.

    This is an escape hatch: ``profile_flops()`` already accounts for
    ``F.scaled_dot_product_attention`` through ``FlopCounterMode``. Reach for
    this helper only when the model invokes the ``flash_attn`` C extension
    (``flash_attn_func``) directly.

    Formula:
        causal=True  → 2 * batch * seq_len² * num_heads * head_dim FLOPs
        causal=False → 4 * batch * seq_len² * num_heads * head_dim FLOPs

    Args:
        batch: Batch size.
        seq_len: Sequence length (used squared).
        num_heads: Number of attention heads.
        head_dim: Per-head dimension.
        causal: When False the causal estimate is doubled.
        with_backward: When True the forward estimate is tripled
            (forward + 2× backward convention).

    Returns:
        The FLOP estimate for one call.
    """
    # Causal cost is the base quantity; the non-causal case is exactly twice it.
    causal_cost = 2 * batch * seq_len * seq_len * num_heads * head_dim
    forward = causal_cost if causal else causal_cost * 2
    if with_backward:
        return forward * 3
    return forward
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _profile_with_flop_counter(target: nn.Module, call_args: tuple) -> int:
|
|
38
|
+
"""Profile using torch.utils.flop_counter.FlopCounterMode (PyTorch 2.1+).
|
|
39
|
+
|
|
40
|
+
Works at the ATen dispatch level — counts F.scaled_dot_product_attention
|
|
41
|
+
(SDPA / native flash attention) on CUDA automatically. Returns -1 on failure.
|
|
42
|
+
"""
|
|
43
|
+
try:
|
|
44
|
+
from torch.utils.flop_counter import FlopCounterMode
|
|
45
|
+
except ImportError:
|
|
46
|
+
return -1
|
|
47
|
+
|
|
48
|
+
was_training = target.training
|
|
49
|
+
target.eval()
|
|
50
|
+
try:
|
|
51
|
+
with torch.no_grad(), FlopCounterMode(display=False) as flop_counter:
|
|
52
|
+
target(*call_args)
|
|
53
|
+
return int(flop_counter.get_total_flops())
|
|
54
|
+
except Exception:
|
|
55
|
+
return -1
|
|
56
|
+
finally:
|
|
57
|
+
target.train(was_training)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _profile_with_thop(target: nn.Module, call_args: tuple) -> int:
    """Count forward FLOPs with ``thop`` (fallback profiler).

    Args:
        target: Module to profile.
        call_args: Positional inputs handed to ``thop.profile`` as ``inputs``.

    Returns:
        Twice the first value returned by ``thop.profile`` (treated here as a
        multiply-accumulate count), truncated to ``int``.
    """
    import thop

    mac_count, _unused_params = thop.profile(target, inputs=call_args, verbose=False)
    return int(2 * mac_count)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def profile_flops(
    model: nn.Module,
    args: Optional[tuple] = None,
    kwargs: Optional[dict[str, Any]] = None,
    *,
    with_backward: bool = True,
) -> int:
    """
    Count FLOPs for one forward (or forward+backward) pass through *model*.

    Uses ``torch.utils.flop_counter.FlopCounterMode`` (PyTorch 2.1+), which
    hooks at the ATen operator level and automatically counts
    ``F.scaled_dot_product_attention`` (SDPA) on CUDA. Falls back to ``thop``
    when the flop counter is unavailable or fails.

    **Quantized models (bitsandbytes INT8 / NF4):**
    Both counters operate on the Python/ATen graph and cannot see inside opaque
    bitsandbytes CUDA kernels. In practice the FLOPs reported are close to
    correct because NF4 (used by QLoRA) dequantizes weights to fp16 before the
    matmul, so the actual computation is a standard fp16 GEMM. INT8 similarly
    performs an fp16-equivalent matmul after dequantization in most bitsandbytes
    kernels. Pass the appropriate ``dtype`` to ``track()`` / ``compute_mfu()``
    to select the correct peak TFLOPS ceiling::

        # QLoRA (NF4 base + fp16 LoRA adapters) — adapters run in fp16
        flops = profile_flops(model, kwargs=batch, with_backward=False)
        with track(flops, param_bytes(model), dtype="fp16", spec=spec) as r:
            ...

        # Pure INT8 inference
        with track(flops, param_bytes(model), dtype="int8", spec=spec) as r:
            ...

    **PEFT / LoRA MBU:**
    ``param_bytes(model)`` counts all parameters including the frozen base,
    which is correct for the forward pass (all weights are read from memory).
    For a backward-pass MBU estimate that excludes frozen weights, use
    ``param_bytes(model, trainable_only=True)``::

        # Backward MBU for a LoRA-finetuned model
        active_bytes = param_bytes(model, trainable_only=True)

    Args:
        model: The nn.Module to profile. Should be on CUDA for accurate
            SDPA counts.
        args: Positional inputs passed to ``model(*args)``. They are forwarded
            together with ``kwargs`` when both are given. For backward
            compatibility, if ``kwargs`` is given and ``args[0]`` is a dict,
            that dict is used as the keyword arguments and nothing is passed
            positionally.
        kwargs: Keyword inputs (e.g. HF models). Bound into a thin wrapper
            module because both profilers call ``target(*inputs)``.
        with_backward: Include backward-pass FLOPs (default True for training).
            Backward ≈ 2× forward → 3× total.

    Returns:
        Total integer FLOP count for one step.
    """
    if kwargs:
        if args and isinstance(args[0], dict):
            # Legacy calling convention: a dict in args[0] replaces kwargs.
            bound_kwargs: dict[str, Any] = args[0]
            call_args: tuple = ()
        else:
            bound_kwargs = kwargs
            call_args = tuple(args) if args else ()

        class _KwargsAdapter(nn.Module):
            """Positional-only facade that injects the bound keyword inputs."""

            def __init__(self, inner: nn.Module) -> None:
                super().__init__()
                # Registering the model as a child lets eval()/train() reach it
                # and lets module-walking profilers (thop) see its layers.
                self.inner = inner
                # Mirror the model's mode so a profiler that saves and restores
                # the adapter's flag restores the model's original mode.
                self.training = inner.training

            def forward(self, *inputs: Any) -> Any:
                return self.inner(*inputs, **bound_kwargs)

        target: nn.Module = _KwargsAdapter(model)
    else:
        target = model
        call_args = args if args is not None else ()

    forward_flops = _profile_with_flop_counter(target, call_args)
    if forward_flops < 0:
        # Negative means the flop counter was unavailable or failed.
        forward_flops = _profile_with_thop(target, call_args)

    return forward_flops * 3 if with_backward else forward_flops
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def param_bytes(model: nn.Module, *, trainable_only: bool = False) -> int:
    """
    Sum the storage size, in bytes, of the model's parameters (for MBU).

    Args:
        model: Module whose ``parameters()`` are measured.
        trainable_only: If True, skip parameters with ``requires_grad`` False.
            Useful for PEFT/LoRA backward-pass MBU estimates.

    Returns:
        ``numel() * element_size()`` accumulated over the selected parameters.
    """
    total = 0
    for tensor in model.parameters():
        if trainable_only and not tensor.requires_grad:
            continue
        total += tensor.numel() * tensor.element_size()
    return total
|
mfu_tracker/gpu.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""GPU capability querying and peak throughput derivation."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import warnings
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
|
|
10
|
+
# FP16 dense tensor-core FLOPs per SM per clock cycle.
|
|
11
|
+
# Keyed by (major, minor) compute capability.
|
|
12
|
+
# Values empirically validated against NVIDIA spec-sheet peaks:
|
|
13
|
+
# A100 SXM4 : 156 TFLOPS = 108 SMs × 1024 × 1.410 GHz ✓
|
|
14
|
+
# H100 SXM5 : 989 TFLOPS = 132 SMs × 3840 × 1.980 GHz ✓
|
|
15
|
+
# RTX 4090 : ~660 TFLOPS = 128 SMs × 2048 × 2.520 GHz ✓
|
|
16
|
+
#
|
|
17
|
+
# Data-centre vs. consumer Ampere (8.0 vs. 8.6) differ because GA100's
|
|
18
|
+
# tensor cores are physically larger than GA102's.
|
|
19
|
+
_FP16_FLOPS_PER_SM_PER_CLOCK: dict[tuple[int, int], int] = {
|
|
20
|
+
(7, 0): 1024, # Volta (V100)
|
|
21
|
+
(7, 5): 1024, # Turing (T4, RTX 20xx)
|
|
22
|
+
(8, 0): 1024, # Ampere DC (A100, A30)
|
|
23
|
+
(8, 6): 512, # Ampere cons. (A10, RTX 30xx)
|
|
24
|
+
(8, 7): 512, # Ampere Jetson (Orin)
|
|
25
|
+
(8, 9): 2048, # Ada Lovelace (RTX 40xx, L40S)
|
|
26
|
+
(9, 0): 3840, # Hopper (H100, H200)
|
|
27
|
+
(10, 0): 7680, # Blackwell (B100, B200) — estimated ≈ 2× Hopper
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
# Throughput multiplier relative to FP16 dense for each dtype.
|
|
31
|
+
# Only available from certain compute capabilities onward.
|
|
32
|
+
# (cc_major_min, multiplier)
|
|
33
|
+
_DTYPE_MULTIPLIER: dict[str, tuple[int, float]] = {
|
|
34
|
+
"fp16": (7, 1.0),
|
|
35
|
+
"bf16": (8, 1.0), # BF16 tensor cores added in Ampere
|
|
36
|
+
"int8": (7, 2.0), # INT8 tensor cores since Turing
|
|
37
|
+
"fp8": (9, 2.0), # FP8 via Transformer Engine; Ada (8.9) also supports it
|
|
38
|
+
"int4": (7, 4.0), # INT4 since Turing
|
|
39
|
+
"fp4": (10, 4.0), # FP4 native in Blackwell
|
|
40
|
+
}
|
|
41
|
+
# Ada also supports FP8 even though its major is 8
|
|
42
|
+
_ADA_FP8_MINOR = 9
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class GPUSpec:
    """Immutable record of a GPU's identity and derived peak throughput."""

    name: str
    compute_capability: tuple[int, int]
    num_sms: int
    clock_rate_hz: float
    memory_bandwidth_bytes_per_sec: float
    # Peak FLOP/s per dtype string; a missing key means "unsupported dtype".
    peak_flops_by_dtype: dict[str, float] = field(default_factory=dict)
    peak_memory_bandwidth_tbs: float = 0.0

    def peak_tflops(self, dtype: str = "fp16") -> float:
        """Return peak dense tensor-core TFLOPS for the given dtype.

        Raises:
            ValueError: If ``dtype`` has no entry in ``peak_flops_by_dtype``.
        """
        table = self.peak_flops_by_dtype
        if dtype in table:
            return table[dtype] / 1e12

        cc = self.compute_capability
        raise ValueError(
            f"dtype '{dtype}' not supported on {self.name} "
            f"(CC {cc[0]}.{cc[1]}). "
            f"Supported: {list(table)}"
        )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _warn_unknown(cc: tuple[int, int]) -> None:
|
|
68
|
+
warnings.warn(
|
|
69
|
+
f"Unknown compute capability {cc[0]}.{cc[1]}; MFU/MBU results may be inaccurate. "
|
|
70
|
+
"Please open an issue at https://github.com/your-repo/mfu-tracker.",
|
|
71
|
+
UserWarning,
|
|
72
|
+
stacklevel=3,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _fp8_supported(cc: tuple[int, int]) -> bool:
    """Return True when compute capability ``cc`` is treated as FP8-capable.

    That is any major version of 9 or above, or major 8 with a minor version
    of at least ``_ADA_FP8_MINOR``.
    """
    major, minor = cc
    if major >= 9:
        return True
    return major == 8 and minor >= _ADA_FP8_MINOR
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_gpu_spec(device: Optional[torch.device] = None) -> GPUSpec:
    """Query a CUDA device and derive its peak throughput figures.

    Args:
        device: Device to inspect; ``torch.device("cuda")`` when omitted.

    Returns:
        A ``GPUSpec`` with per-dtype peak FLOP/s and memory bandwidth computed
        from the device properties and the module's lookup tables.
    """
    target_device = torch.device("cuda") if device is None else device
    props = torch.cuda.get_device_properties(target_device)
    cc = (props.major, props.minor)

    per_sm_per_clock = _FP16_FLOPS_PER_SM_PER_CLOCK.get(cc)
    if per_sm_per_clock is None:
        # Unknown minor: borrow the first table entry sharing the major
        # version, or 1024 when the major version is unknown as well.
        same_major = [
            flops
            for (major, _minor), flops in _FP16_FLOPS_PER_SM_PER_CLOCK.items()
            if major == props.major
        ]
        _warn_unknown(cc)
        per_sm_per_clock = same_major[0] if same_major else 1024

    # clock_rate is reported in kHz
    clock_hz = props.clock_rate * 1_000
    sm_count = props.multi_processor_count
    fp16_peak = sm_count * per_sm_per_clock * clock_hz

    # Per-dtype peaks: FP16 peak scaled by each dtype's multiplier, keeping
    # only the dtypes this device qualifies for.
    peaks: dict[str, float] = {}
    for dtype_name, (min_major, factor) in _DTYPE_MULTIPLIER.items():
        if dtype_name == "fp8":
            available = _fp8_supported(cc)
        else:
            available = not (props.major < min_major)
        if available:
            peaks[dtype_name] = fp16_peak * factor

    # Memory bandwidth: memory_clock_rate in kHz, bus width in bits, DDR = ×2
    bus_bytes = props.memory_bus_width / 8
    mem_bw = props.memory_clock_rate * 1_000 * bus_bytes * 2

    return GPUSpec(
        name=props.name,
        compute_capability=cc,
        num_sms=sm_count,
        clock_rate_hz=clock_hz,
        memory_bandwidth_bytes_per_sec=mem_bw,
        peak_flops_by_dtype=peaks,
        peak_memory_bandwidth_tbs=mem_bw / 1e12,
    )
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""HuggingFace Trainer integration via TrainerCallback."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import warnings
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
import torch.nn as nn
|
|
9
|
+
from transformers import TrainerCallback
|
|
10
|
+
|
|
11
|
+
from ..flops import param_bytes, profile_flops
|
|
12
|
+
from ..gpu import GPUSpec, get_gpu_spec
|
|
13
|
+
from ..tracker import compute_mbu, compute_mfu
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MFUCallback(TrainerCallback):
    """
    TrainerCallback that logs MFU and MBU at every Trainer logging step.

    FLOPs per optimizer step are
    ``fwd_flops × (1 + backward_factor) × gradient_accumulation_steps`` where
    ``backward_factor`` defaults to 2.0 (standard 3× convention). Set it higher
    when using gradient checkpointing (typical: 3.0–4.0). The accumulation
    factor is read from the ``args`` object passed to ``on_log``, because the
    timed window (``on_step_begin`` → ``on_step_end``) spans every micro-batch
    of an optimizer step while ``sample_batch`` is a single micro-batch.

    Per-step cost is two non-blocking ``Event.record()`` calls (~10 μs CPU, no
    GPU stall). The single ``torch.cuda.synchronize()`` is deferred to ``on_log``
    and amortised across all steps in the logging interval.

    Usage::

        from mfu_tracker.integrations.hf_trainer import MFUCallback

        callback = MFUCallback(sample_batch=next(iter(train_dataloader)), dtype="bf16")
        trainer = Trainer(..., callbacks=[callback])

    **torch.compile**: profile_flops is called at ``on_train_begin``, before the
    first compiled step. This is correct — ``torch.compile`` does not change the
    FLOP count (same math, just faster execution). The MFU improvement from
    compilation is captured automatically in the CUDA event timing of real steps.
    Do not pass a compiled model to this callback directly; let HF Trainer compile
    after the callback is registered.

    **DDP / FSDP**: leave ``num_gpus=1`` — per-GPU MFU equals global MFU for
    data-parallel jobs. ``num_gpus`` is never auto-detected; it is used exactly
    as given.

    Args:
        sample_batch: A representative batch dict passed as ``**kwargs`` to the
            model. Used once at training start to profile forward FLOPs.
        dtype: Compute dtype for the peak ceiling — "fp16", "bf16", etc.
        num_gpus: GPUs in the peak ceiling (default 1).
        backward_factor: Multiplier for backward pass cost (default 2.0). Set
            higher when using gradient checkpointing (typical: 3.0–4.0).
        metric_prefix: Prefix for logged metric names (default ``"throughput"``).
            Results in ``throughput/mfu`` and ``throughput/mbu``, which
            WandB groups into its own section away from loss/lr. Set to
            ``""`` to log bare ``mfu`` / ``mbu`` keys.
        device: CUDA device to query. Defaults to current device.
    """

    def __init__(
        self,
        sample_batch: dict[str, Any],
        dtype: str = "bf16",
        num_gpus: int = 1,
        backward_factor: float = 2.0,
        metric_prefix: str = "throughput",
        device: Optional[torch.device] = None,
    ) -> None:
        self.sample_batch = sample_batch
        self.dtype = dtype
        self.num_gpus = num_gpus
        self.backward_factor = backward_factor
        self.metric_prefix = metric_prefix
        self.device = device

        self._model: Optional[nn.Module] = None
        self._spec: Optional[GPUSpec] = None
        self._fwd_flops: Optional[int] = None
        self._param_bytes: Optional[int] = None

        # Event pair of the step currently in flight (None between steps).
        self._pending_step: Optional[tuple] = None
        # Each entry: (e_start, e_end). Accumulated between on_log calls.
        self._pending: list[tuple] = []

    def _profile(self, model: nn.Module) -> None:
        """Query the GPU and profile forward FLOPs once; warn instead of raising."""
        self._spec = get_gpu_spec(self.device)
        self._param_bytes = param_bytes(model)
        try:
            # Move sample batch to the model's device (Trainer may have moved the model).
            model_device = next(model.parameters()).device
            batch = {
                k: v.to(model_device) if isinstance(v, torch.Tensor) else v
                for k, v in self.sample_batch.items()
            }
            self._fwd_flops = profile_flops(
                model, kwargs=batch, with_backward=False
            )
        except Exception as exc:
            # Best-effort: a profiling failure disables metric logging
            # (_fwd_flops stays None) but must not abort training.
            warnings.warn(
                f"mfu-tracker: FLOP profiling failed ({exc}); MFU will not be logged.",
                stacklevel=2,
            )

    # --- TrainerCallback protocol -------------------------------------------

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Profile the model once, if a model was supplied and CUDA is available."""
        if model is not None and torch.cuda.is_available():
            self._model = model
            self._profile(model)

    def on_step_begin(self, args, state, control, **kwargs):
        """Record the start event for the step about to run."""
        if self._fwd_flops is None or not torch.cuda.is_available():
            return

        e_start = torch.cuda.Event(enable_timing=True)
        e_end = torch.cuda.Event(enable_timing=True)
        e_start.record()
        self._pending_step = (e_start, e_end)

    def on_step_end(self, args, state, control, **kwargs):
        """Record the end event and queue the pair for the next ``on_log``."""
        in_flight = getattr(self, "_pending_step", None)
        if in_flight is None:
            return

        e_start, e_end = in_flight
        e_end.record()
        self._pending.append((e_start, e_end))
        self._pending_step = None

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Add ``mfu`` / ``mbu`` for the steps timed since the previous log call."""
        if logs is None or not self._pending or self._fwd_flops is None:
            return

        # Single sync amortised across all accumulated steps.
        torch.cuda.synchronize(self.device)

        n_steps = len(self._pending)
        total_ms = sum(e_start.elapsed_time(e_end) for e_start, e_end in self._pending)
        self._pending.clear()

        elapsed_sec = total_ms / 1000
        if elapsed_sec <= 0:
            return

        # Every timed step contains `gradient_accumulation_steps` forward and
        # backward passes, each the size of the profiled sample batch. (A
        # shorter final accumulation window in an epoch is not corrected for.)
        accumulation = getattr(args, "gradient_accumulation_steps", 1) or 1
        n_passes = n_steps * max(1, int(accumulation))

        total_flops = int(self._fwd_flops * n_passes * (1 + self.backward_factor))

        prefix = f"{self.metric_prefix}/" if self.metric_prefix else ""
        logs[f"{prefix}mfu"] = round(
            compute_mfu(total_flops, elapsed_sec, dtype=self.dtype, num_gpus=self.num_gpus, spec=self._spec), 4
        )
        logs[f"{prefix}mbu"] = round(
            compute_mbu(self._param_bytes * n_passes, elapsed_sec, num_gpus=self.num_gpus, spec=self._spec), 4
        )
|
mfu_tracker/optim.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Optimizer wrapper that measures MFU and MBU per training step."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import warnings
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from typing import Any, Generator, Optional
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import torch.nn as nn
|
|
10
|
+
|
|
11
|
+
from .flops import param_bytes, profile_flops
|
|
12
|
+
from .gpu import GPUSpec, get_gpu_spec
|
|
13
|
+
from .tracker import UtilizationResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class MFUOptimizerWrapper:
    """
    Wraps any ``torch.optim.Optimizer`` to automatically track MFU and MBU.

    FLOPs are profiled once on the uncompiled model and scaled by
    ``1 + backward_factor`` (default 2.0, the standard forward + 2× backward
    convention). Set ``backward_factor`` higher when using gradient checkpointing,
    which recomputes activations during backward (typical values: 3.0–4.0).

    ``zero_grad()`` is called automatically at the *start* of ``track_step()``.
    Call ``optimizer.step()`` **after** the block so it is excluded from the
    timing window::

        optimizer = MFUOptimizerWrapper(
            torch.optim.AdamW(model.parameters(), lr=1e-4),
            model=model,
            sample_batch=sample_batch,
            dtype="bf16",
        )

        for batch in dataloader:
            with optimizer.track_step() as result:
                output = model(**batch)
                loss = output.loss
                loss.backward()
            optimizer.step()
            print(f"MFU={result.mfu:.1%}  MBU={result.mbu:.1%}")

    **torch.compile**: profile the uncompiled model first, then compile::

        optimizer = MFUOptimizerWrapper(raw_optimizer, model, sample_batch, ...)
        optimizer.profile()            # profile before compile
        model = torch.compile(model)

    Attributes that are not defined on the wrapper are looked up on the
    wrapped optimizer.

    Args:
        optimizer: The optimizer to wrap.
        model: Model whose FLOPs and parameter bytes are measured.
        sample_batch: Batch dict passed as keyword inputs when profiling.
        dtype: Compute dtype used for the peak ceiling.
        num_gpus: Multiplier applied to the peak ceiling.
        backward_factor: Backward cost relative to forward (default 2.0).
        device: CUDA device to query; the default device when None.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        model: nn.Module,
        sample_batch: dict[str, Any],
        dtype: str = "bf16",
        num_gpus: int = 1,
        backward_factor: float = 2.0,
        device: Optional[torch.device] = None,
    ) -> None:
        self.optimizer = optimizer
        self._model = model
        self._sample_batch = sample_batch
        self._dtype = dtype
        self._num_gpus = num_gpus
        self._backward_factor = backward_factor
        self._device = device

        self._spec: Optional[GPUSpec] = None
        self._fwd_flops: Optional[int] = None
        self._param_bytes: Optional[int] = None

    def profile(self) -> None:
        """
        Explicitly profile FLOPs on the current (uncompiled) model.

        Call this before ``torch.compile`` so the FLOP count is measured on the
        original graph. If not called, profiling happens lazily on the first
        ``track_step()`` — which may be too late if the model is already compiled::

            optimizer = MFUOptimizerWrapper(raw_optimizer, model, sample_batch, dtype="bf16")
            optimizer.profile()           # profile uncompiled model
            model = torch.compile(model)  # compile after profiling
        """
        if self._spec is None:
            self._profile_once()

    def _profile_once(self) -> None:
        """Query the GPU, then profile forward FLOPs and parameter bytes."""
        self._spec = get_gpu_spec(self._device)
        try:
            self._fwd_flops = profile_flops(
                self._model,
                kwargs=self._sample_batch,
                with_backward=False,
            )
        except Exception as exc:
            # Best-effort: leave _fwd_flops as None rather than abort training.
            warnings.warn(
                f"mfu-tracker: profiling failed ({exc}); MFU will not be populated.",
                stacklevel=3,
            )
        self._param_bytes = param_bytes(self._model)

    @contextmanager
    def track_step(self) -> Generator[UtilizationResult, None, None]:
        """
        Context manager that wraps one training step and populates a
        :class:`~mfu_tracker.UtilizationResult` with MFU and MBU.

        ``optimizer.zero_grad()`` is called automatically at the *start* of the
        block. Call ``optimizer.step()`` **after** the block so it is excluded
        from the timing window::

            with wrapped.track_step() as result:
                out = model(**batch)
                out.loss.backward()
            wrapped.step()

        FLOPs are ``fwd_flops × (1 + backward_factor)`` where ``backward_factor``
        defaults to 2.0 (standard 3× convention). Set it higher when using
        gradient checkpointing (typical: 3.0–4.0).
        """
        if self._spec is None:
            self._profile_once()

        self.optimizer.zero_grad()

        e_start = torch.cuda.Event(enable_timing=True)
        e_end = torch.cuda.Event(enable_timing=True)

        result = UtilizationResult(dtype=self._dtype, gpu_spec=self._spec, num_gpus=self._num_gpus)

        e_start.record()
        try:
            yield result
        finally:
            # Only record the events and hand them over; the result object is
            # what later synchronises and converts them into numbers.
            e_end.record()
            result._e_start = e_start
            result._e_end = e_end
            result._total_flops = (
                int(self._fwd_flops * (1 + self._backward_factor))
                if self._fwd_flops is not None else None
            )
            result._param_bytes = self._param_bytes
            result._device = self._device

    # --- Proxy the underlying optimizer ------------------------------------

    def step(self, *args, **kwargs) -> Any:
        """Forward to the wrapped optimizer's ``step`` and return its result."""
        # Returning the value keeps closure-based usage working
        # (``loss = optimizer.step(closure)``).
        return self.optimizer.step(*args, **kwargs)

    def zero_grad(self, *args, **kwargs) -> None:
        """Forward to the wrapped optimizer's ``zero_grad``."""
        self.optimizer.zero_grad(*args, **kwargs)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes missing on the wrapper to the wrapped optimizer."""
        # __getattr__ runs only after normal lookup fails. If ``optimizer``
        # itself is not set yet (instance built via __new__, e.g. by copy or
        # pickle), reading self.optimizer would re-enter this method forever.
        try:
            wrapped = self.__dict__["optimizer"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None
        return getattr(wrapped, name)
|
mfu_tracker/tracker.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""MFU and MBU measurement — context manager and standalone functions."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import time
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Generator, Optional
|
|
8
|
+
|
|
9
|
+
import torch
|
|
10
|
+
|
|
11
|
+
from .gpu import GPUSpec, get_gpu_spec
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class UtilizationResult:
    """
    Result holder populated after a ``track()`` or ``track_step()`` block exits.

    Fields backed by CUDA events (from ``MFUOptimizerWrapper.track_step()``) are
    computed lazily on first access — the GPU sync is deferred until the value is
    actually needed. Values can also be assigned directly through the property
    setters, in which case no CUDA events are involved.

    Every public metric (``mfu``, ``mbu``, ``elapsed_sec``, ``achieved_tflops``,
    ``achieved_tbs``) is ``None`` until it has been either assigned or resolved.
    """

    dtype: str = "fp16"
    gpu_spec: Optional[GPUSpec] = None
    num_gpus: int = 1

    # Eagerly-set fields (track()) or lazily-set after _resolve() (track_step()).
    _mfu: Optional[float] = field(default=None, repr=False)
    _mbu: Optional[float] = field(default=None, repr=False)
    _elapsed_sec: Optional[float] = field(default=None, repr=False)
    _achieved_tflops: Optional[float] = field(default=None, repr=False)
    _achieved_tbs: Optional[float] = field(default=None, repr=False)

    # Set by MFUOptimizerWrapper for lazy resolution.
    _e_start: Optional[object] = field(default=None, repr=False)
    _e_end: Optional[object] = field(default=None, repr=False)
    _total_flops: Optional[int] = field(default=None, repr=False)
    _param_bytes: Optional[int] = field(default=None, repr=False)
    _device: Optional[torch.device] = field(default=None, repr=False)

    def _resolve(self) -> None:
        """Sync and compute the lazy fields from the stored CUDA events.

        Does nothing when there are no pending events, when no ``gpu_spec`` is
        attached, or when neither a FLOP count nor a byte count is available.
        MFU and MBU are computed independently, so a missing FLOP count still
        yields ``elapsed_sec`` and MBU. After a successful pass the start event
        is cleared so later accesses skip the sync.
        """
        if self._e_start is None or self._e_end is None:
            return
        if self.gpu_spec is None:
            # Without peak figures no ratio can be formed; checked before the
            # sync so an unconfigured result never touches CUDA.
            return
        if self._total_flops is None and self._param_bytes is None:
            return

        torch.cuda.synchronize(self._device)
        elapsed = self._e_start.elapsed_time(self._e_end) / 1000  # ms → s
        self._elapsed_sec = elapsed

        # A non-positive duration cannot yield a rate; leave the ratios None.
        if elapsed > 0:
            if self._total_flops is not None:
                peak_tflops = self.gpu_spec.peak_tflops(self.dtype) * self.num_gpus
                self._achieved_tflops = self._total_flops / elapsed / 1e12
                if peak_tflops > 0:
                    self._mfu = self._achieved_tflops / peak_tflops
            if self._param_bytes is not None:
                peak_tbs = self.gpu_spec.peak_memory_bandwidth_tbs * self.num_gpus
                self._achieved_tbs = self._param_bytes / elapsed / 1e12
                # peak_memory_bandwidth_tbs defaults to 0.0 on GPUSpec.
                if peak_tbs > 0:
                    self._mbu = self._achieved_tbs / peak_tbs

        self._e_start = None  # mark resolved

    @property
    def mfu(self) -> Optional[float]:
        """Model FLOPs utilisation as a fraction of peak, or None."""
        self._resolve()
        return self._mfu

    @mfu.setter
    def mfu(self, v: Optional[float]) -> None:
        self._mfu = v

    @property
    def mbu(self) -> Optional[float]:
        """Memory-bandwidth utilisation as a fraction of peak, or None."""
        self._resolve()
        return self._mbu

    @mbu.setter
    def mbu(self, v: Optional[float]) -> None:
        self._mbu = v

    @property
    def elapsed_sec(self) -> Optional[float]:
        """Measured duration of the block in seconds, or None."""
        self._resolve()
        return self._elapsed_sec

    @elapsed_sec.setter
    def elapsed_sec(self, v: Optional[float]) -> None:
        self._elapsed_sec = v

    @property
    def achieved_tflops(self) -> Optional[float]:
        """Achieved throughput in TFLOPS, or None."""
        self._resolve()
        return self._achieved_tflops

    @achieved_tflops.setter
    def achieved_tflops(self, v: Optional[float]) -> None:
        self._achieved_tflops = v

    @property
    def achieved_tbs(self) -> Optional[float]:
        """Achieved memory traffic in TB/s, or None."""
        self._resolve()
        return self._achieved_tbs

    @achieved_tbs.setter
    def achieved_tbs(self, v: Optional[float]) -> None:
        self._achieved_tbs = v
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@contextmanager
def track(
    flop_count: int,
    param_bytes: int,
    *,
    dtype: str = "fp16",
    num_gpus: int = 1,
    device: Optional[torch.device] = None,
    spec: Optional[GPUSpec] = None,
) -> Generator[UtilizationResult, None, None]:
    """
    Time an arbitrary block of GPU work and report its MFU and MBU.

    The device is synchronized immediately before and after the ``with``
    body, and the wall-clock time between the two synchronizations is used
    as the denominator for both metrics.

    Args:
        flop_count: Total FLOPs executed by the block (from
            ``flops.profile_flops`` or your own estimate).
        param_bytes: Bytes moved for weights (num_params * bytes_per_element).
        dtype: Compute dtype string handed to ``spec.peak_tflops`` — e.g.
            "fp16", "bf16", "int8", "fp8", "int4", "fp4".
        num_gpus: Multiplier applied to both peak ceilings (default 1). Keep
            it at 1 when *flop_count* comes from ``profile_flops``: that
            count is per-GPU, and per-GPU MFU equals global MFU for all
            standard parallelism types because the N factors cancel. Raise
            it only when *flop_count* is an analytically-derived
            *full-model* count (e.g. ``6 × params × tokens``).
        device: CUDA device to synchronize and measure against
            (default: current device).
        spec: Pre-queried GPUSpec; looked up via ``get_gpu_spec(device)``
            when omitted.

    Yields:
        A :class:`UtilizationResult` that is filled in only after the block
        exits normally. If the block raises, the exception propagates and
        the measurement fields are never assigned.

    Example — single GPU::

        flops = profile_flops(model, args=(sample,), with_backward=True)
        with track(flops, param_bytes(model), dtype="bf16") as result:
            loss = model(inputs)
            loss.backward()
            optimizer.step()
        print(f"MFU={result.mfu:.1%}  MBU={result.mbu:.1%}")

    Example — any parallelism strategy (DDP, FSDP, tensor, pipeline)::

        # profile_flops on whatever model shard the local rank holds gives
        # per-GPU FLOPs; per-GPU MFU == global MFU for all standard strategies.
        with track(flops, param_bytes(model), dtype="bf16") as r:
            ...
    """
    gpu = get_gpu_spec(device) if spec is None else spec

    # Resolve both ceilings up front so an unsupported dtype surfaces before
    # any user code is timed.
    flops_ceiling = gpu.peak_tflops(dtype) * num_gpus
    bandwidth_ceiling = gpu.peak_memory_bandwidth_tbs * num_gpus

    outcome = UtilizationResult(dtype=dtype, gpu_spec=gpu, num_gpus=num_gpus)

    # Wait for previously queued work so it is not billed to this block.
    torch.cuda.synchronize(device)
    started = time.perf_counter()
    yield outcome
    # Wait for the block's own kernels to finish before stopping the clock.
    torch.cuda.synchronize(device)
    wall_sec = time.perf_counter() - started

    outcome.elapsed_sec = wall_sec
    outcome.achieved_tflops = flop_count / wall_sec / 1e12
    outcome.achieved_tbs = param_bytes / wall_sec / 1e12
    outcome.mfu = outcome.achieved_tflops / flops_ceiling
    outcome.mbu = outcome.achieved_tbs / bandwidth_ceiling
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def compute_mfu(
    flop_count: int,
    elapsed_sec: float,
    *,
    dtype: str = "fp16",
    num_gpus: int = 1,
    device: Optional[torch.device] = None,
    spec: Optional[GPUSpec] = None,
) -> float:
    """
    Standalone MFU calculation without a context manager.

    Args:
        flop_count: Total FLOPs executed during *elapsed_sec*.
        elapsed_sec: Duration of the measured work in seconds; must be > 0.
        dtype: Compute dtype string handed to ``spec.peak_tflops``.
        num_gpus: GPUs in the peak ceiling; must be > 0. See :func:`track`
            for guidance.
        device: Device passed to ``get_gpu_spec`` when *spec* is omitted.
        spec: Pre-queried GPUSpec; looked up via ``get_gpu_spec(device)``
            when omitted.

    Returns:
        Achieved TFLOP/s divided by ``spec.peak_tflops(dtype) * num_gpus``.

    Raises:
        ValueError: If *elapsed_sec* or *num_gpus* is not strictly positive.
    """
    # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN, which would
    # otherwise flow through and yield a NaN utilization.
    if not elapsed_sec > 0:
        raise ValueError(f"elapsed_sec must be positive, got {elapsed_sec!r}")
    if not num_gpus > 0:
        raise ValueError(f"num_gpus must be positive, got {num_gpus!r}")
    if spec is None:
        spec = get_gpu_spec(device)
    achieved_tflops = flop_count / elapsed_sec / 1e12
    return achieved_tflops / (spec.peak_tflops(dtype) * num_gpus)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def compute_mbu(
    param_bytes: int,
    elapsed_sec: float,
    *,
    num_gpus: int = 1,
    device: Optional[torch.device] = None,
    spec: Optional[GPUSpec] = None,
) -> float:
    """
    Standalone MBU calculation without a context manager.

    Args:
        param_bytes: Bytes moved for weights during *elapsed_sec*.
        elapsed_sec: Duration of the measured work in seconds; must be > 0.
        num_gpus: GPUs in the peak ceiling; must be > 0. See :func:`track`
            for guidance.
        device: Device passed to ``get_gpu_spec`` when *spec* is omitted.
        spec: Pre-queried GPUSpec; looked up via ``get_gpu_spec(device)``
            when omitted.

    Returns:
        Achieved TB/s divided by
        ``spec.peak_memory_bandwidth_tbs * num_gpus``.

    Raises:
        ValueError: If *elapsed_sec* or *num_gpus* is not strictly positive.
    """
    # ``not x > 0`` (rather than ``x <= 0``) also rejects NaN, which would
    # otherwise flow through and yield a NaN utilization.
    if not elapsed_sec > 0:
        raise ValueError(f"elapsed_sec must be positive, got {elapsed_sec!r}")
    if not num_gpus > 0:
        raise ValueError(f"num_gpus must be positive, got {num_gpus!r}")
    if spec is None:
        spec = get_gpu_spec(device)
    achieved_tbs = param_bytes / elapsed_sec / 1e12
    return achieved_tbs / (spec.peak_memory_bandwidth_tbs * num_gpus)
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mfu-tracker
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight Model FLOPs Utilization and Bandwidth Utilization tracker for PyTorch
|
|
5
|
+
License: MIT License
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2026 Jeremias Lino Ferrao
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
SOFTWARE.
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Requires-Dist: numpy>=2.0.2
|
|
29
|
+
Requires-Dist: thop>=0.1.1.post2209072238
|
|
30
|
+
Requires-Dist: torch>=2.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
34
|
+
Provides-Extra: hf
|
|
35
|
+
Requires-Dist: transformers>=4.30; extra == 'hf'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# mfu-tracker
|
|
39
|
+
|
|
40
|
+
When profiling training runs, I found that most existing tools either lacked MFU/MBU support entirely or dragged in hundreds of megabytes of transitive dependencies. This library is an attempt at a self-contained alternative.
|
|
41
|
+
|
|
42
|
+
**mfu-tracker** is a PyTorch library for measuring Model FLOPs Utilization (MFU) and Model Bandwidth Utilization (MBU). It supports bare PyTorch training loops, an optimizer wrapper, and a HuggingFace Trainer callback.
|
|
43
|
+
|
|
44
|
+
- **Minimal dependencies** — PyTorch, NumPy, and `thop` only
|
|
45
|
+
- **Profiled FLOPs, not formula estimates** — uses `FlopCounterMode` to count the FLOPs your model actually executes rather than a formula like `6 × params × tokens`. For Mixture-of-Experts models this means only active experts are counted, giving a more accurate numerator than parameter-based estimates.
|
|
46
|
+
- **Three integration styles** — context manager, optimizer wrapper, HF Trainer callback
|
|
47
|
+
- **WandB / TensorBoard / MLflow** — metrics are logged through HF Trainer's existing pipeline when using `MFUCallback`
|
|
48
|
+
|
|
49
|
+
MFU as a training efficiency metric was introduced in the [PaLM paper](https://arxiv.org/abs/2204.02311) (Chowdhery et al., 2022).
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## What MFU and MBU measure
|
|
54
|
+
|
|
55
|
+
**MFU (Model FLOPs Utilization)** is the ratio of observed FLOP throughput to the GPU's theoretical peak for the given dtype. A value of 0.50 means the model is executing at half the GPU's rated peak. Well-optimized large models on modern hardware typically fall in the 0.40–0.60 range; small models often land much lower due to kernel dispatch overhead relative to compute time.
|
|
56
|
+
|
|
57
|
+
**MBU (Model Bandwidth Utilization)** as computed here is a proxy, not a direct DRAM measurement. It is defined as:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
MBU = (param_bytes / elapsed_sec) / peak_memory_bandwidth
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
where `param_bytes` is the total size of model parameters and `elapsed_sec` is wall time. This assumes one full pass through model weights per step and does not account for activation memory, gradients, optimizer state, or data layout effects. It is most useful as a relative indicator across runs rather than an absolute efficiency measure.
|
|
64
|
+
|
|
65
|
+
If both MFU and MBU are low simultaneously, the GPU is underutilized. Two common causes: kernel dispatch overhead (the CPU cannot issue kernels fast enough to keep the GPU busy — `torch.compile` reduces this by fusing operations), or CPU-side pipeline stalls (slow DataLoader, heavy host preprocessing, or host-to-device transfers in the hot path).
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Installation
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install mfu-tracker
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
HuggingFace Trainer integration needs `transformers`. If you are already running HF Trainer it is already available and no extra install is required; otherwise install it through the `hf` extra (`pip install "mfu-tracker[hf]"`). Import `MFUCallback` directly.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Usage
|
|
80
|
+
|
|
81
|
+
### Context manager (bare PyTorch)
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from mfu_tracker import track, profile_flops, param_bytes
|
|
85
|
+
|
|
86
|
+
# Profile once on the uncompiled model before training begins
|
|
87
|
+
sample = {"input_ids": batch["input_ids"][:1]}
|
|
88
|
+
flops = profile_flops(model, kwargs=sample, with_backward=True)
|
|
89
|
+
p_bytes = param_bytes(model)
|
|
90
|
+
|
|
91
|
+
for batch in dataloader:
|
|
92
|
+
optimizer.zero_grad()
|
|
93
|
+
with track(flops, p_bytes, dtype="bf16") as result:
|
|
94
|
+
loss = model(**batch).loss
|
|
95
|
+
loss.backward()
|
|
96
|
+
optimizer.step()
|
|
97
|
+
|
|
98
|
+
print(f"MFU: {result.mfu:.3f} MBU: {result.mbu:.3f} {result.elapsed_sec*1000:.0f} ms/step")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Optimizer wrapper
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from mfu_tracker import MFUOptimizerWrapper
|
|
105
|
+
|
|
106
|
+
base_optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
|
|
107
|
+
optimizer = MFUOptimizerWrapper(
|
|
108
|
+
base_optimizer, model,
|
|
109
|
+
sample_batch={"input_ids": sample_ids},
|
|
110
|
+
dtype="bf16",
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
# Profile before compiling — FlopCounterMode may not trace compiled graphs
|
|
114
|
+
optimizer.profile()
|
|
115
|
+
model = torch.compile(model)
|
|
116
|
+
|
|
117
|
+
for step, batch in enumerate(dataloader):
|
|
118
|
+
with optimizer.track_step() as result: # calls zero_grad() at block entry
|
|
119
|
+
loss = model(**batch).loss
|
|
120
|
+
loss.backward()
|
|
121
|
+
optimizer.step() # outside the timing window
|
|
122
|
+
|
|
123
|
+
if step % 10 == 0:
|
|
124
|
+
print(f"MFU {result.mfu:.3f} MBU {result.mbu:.3f}")
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### HuggingFace Trainer
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from mfu_tracker.integrations.hf_trainer import MFUCallback
|
|
131
|
+
|
|
132
|
+
sample_batch = {k: v[:batch_size] for k, v in next(iter(train_dataloader)).items()}
|
|
133
|
+
|
|
134
|
+
callback = MFUCallback(
|
|
135
|
+
sample_batch=sample_batch,
|
|
136
|
+
dtype="bf16",
|
|
137
|
+
metric_prefix="throughput", # logs throughput/mfu and throughput/mbu
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
trainer = Trainer(
|
|
141
|
+
model=model,
|
|
142
|
+
args=training_args,
|
|
143
|
+
train_dataset=train_dataset,
|
|
144
|
+
callbacks=[callback],
|
|
145
|
+
)
|
|
146
|
+
trainer.train()
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
`throughput/mfu` and `throughput/mbu` are added to the Trainer log dict at each logging step and forwarded automatically to any configured integrations (WandB, TensorBoard, MLflow). WandB groups metrics by the `/` separator, so these appear in a distinct "throughput" section rather than alongside loss and learning rate.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## FLOP counting
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from mfu_tracker import profile_flops, flash_attn_flops, param_bytes
|
|
157
|
+
|
|
158
|
+
# Standard models — FlopCounterMode counts SDPA automatically on CUDA
|
|
159
|
+
flops = profile_flops(model, kwargs=batch, with_backward=True)
|
|
160
|
+
|
|
161
|
+
# Models calling flash_attn_func directly (rare; older HF with use_flash_attention_2=True)
|
|
162
|
+
# need a manual correction since the C extension is opaque to FlopCounterMode:
|
|
163
|
+
flops += flash_attn_flops(batch=B, seq_len=S, num_heads=H, head_dim=D)
|
|
164
|
+
|
|
165
|
+
# PEFT / LoRA — restrict param_bytes to trainable parameters only
|
|
166
|
+
p_bytes = param_bytes(model, trainable_only=True)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
`with_backward=True` applies the standard 3× convention (1× forward + 2× backward). For gradient checkpointing, pass `backward_factor=3.0` or `4.0` to `MFUOptimizerWrapper` or `MFUCallback`.
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## GPU spec
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from mfu_tracker import get_gpu_spec
|
|
177
|
+
|
|
178
|
+
spec = get_gpu_spec()
|
|
179
|
+
print(spec.name) # e.g. "NVIDIA GeForce RTX 4080"
|
|
180
|
+
print(spec.peak_tflops("fp16")) # e.g. 97.6
|
|
181
|
+
print(spec.peak_tflops("fp8")) # Ada Lovelace (CC 8.9) and Hopper (CC 9.0)+
|
|
182
|
+
print(spec.peak_memory_bandwidth_tbs) # e.g. 0.717
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Supported dtypes: `fp32`, `fp16`, `bf16`, `int8`, `fp8`, `int4`, `fp4`. Unrecognized compute capabilities fall back to the nearest known major version with a `UserWarning`.
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Benchmark (RTX 4080, GPT-2 124M, fp16)
|
|
190
|
+
|
|
191
|
+
| Configuration | MFU | ms/step |
|
|
192
|
+
|---|---|---|
|
|
193
|
+
| batch=1 · eager | ~0.027 | ~40 ms |
|
|
194
|
+
| batch=8 · eager | ~0.09 | ~93 ms |
|
|
195
|
+
| batch=8 · sdpa | ~0.12 | ~74 ms |
|
|
196
|
+
| batch=8 · sdpa + compile | ~0.17 | ~50 ms |
|
|
197
|
+
| batch=16 · sdpa + compile | ~0.16 | ~104 ms |
|
|
198
|
+
|
|
199
|
+
GPT-2 (124M) is a small model relative to the compute capacity of a modern GPU, so low MFU is expected — the model spends a large fraction of step time waiting for kernel dispatch rather than doing arithmetic. Larger models (e.g. LLaMA-70B) typically reach 0.40–0.60 MFU. The improvement from `torch.compile` reflects kernel fusion reducing dispatch overhead. I'll add some testing on this later.
|
|
200
|
+
|
|
201
|
+
```bash
|
|
202
|
+
python examples/benchmark_mfu.py --help
|
|
203
|
+
python examples/hf_trainer_mfu.py --dtype bf16 --batch-size 16
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Multi-GPU
|
|
209
|
+
|
|
210
|
+
Leave `num_gpus=1` (the default) when using `profile_flops` as the FLOP source. For data-parallel strategies (DDP, FSDP), per-GPU FLOPs equal total FLOPs divided by N and wall time is the same on all ranks, so per-GPU MFU equals global MFU and the N factors cancel. Set `num_gpus > 1` only when pairing an analytically-derived full-model FLOP count (e.g. `6 × params × tokens`) with a total-job peak ceiling.
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Limitations
|
|
215
|
+
|
|
216
|
+
- **SDPA on CPU is not counted** — `FlopCounterMode` does not intercept flash attention dispatch on CPU. Profile with a CUDA model.
|
|
217
|
+
- **bitsandbytes quantized layers** — INT8/NF4 kernels are opaque to `FlopCounterMode`. NF4 dequantizes to fp16 before the matmul, so FLOP counts are approximately correct. Pass the appropriate dtype to use the right peak ceiling.
|
|
218
|
+
- **`flash_attn_func` direct calls** — models bypassing `F.scaled_dot_product_attention` need a manual `flash_attn_flops()` correction (see above).
|
|
219
|
+
- **Peak ceilings from spec sheets** — these are not independently measured. MFU > 1.0 indicates the ceiling is underestimated.
|
|
220
|
+
- **MBU is a proxy** — the formula uses parameter bytes as a stand-in for memory traffic; actual DRAM traffic (activations, gradients, optimizer state) is higher and not measured.
|
|
221
|
+
- I have not tested the library extensively yet; please open an issue if you encounter any bugs or unexpected behavior.
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Requirements
|
|
226
|
+
|
|
227
|
+
- Python 3.9+
|
|
228
|
+
- PyTorch 2.0+ (2.1+ recommended for `FlopCounterMode`)
|
|
229
|
+
- A CUDA GPU is required for meaningful results; CPU timing works but MFU will be near zero for any realistic model
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
mfu_tracker/__init__.py,sha256=gvoCPl6mEHBRV7Y_q_FLJjK16JQMsQ11nfgrpQiF8-0,503
|
|
2
|
+
mfu_tracker/flops.py,sha256=NwCrPXRNXPJMHSWEcb5SSWxEZEqkGocwdNY0NJ-ZeiU,5762
|
|
3
|
+
mfu_tracker/gpu.py,sha256=60ZSbSe1cAyfjPm-uqZR7ptRBYDFno6SJJO32VGePwA,4689
|
|
4
|
+
mfu_tracker/optim.py,sha256=kOfQfDMC7oPpWGGRHROs4Cwv9vxFAoe3GpYlV_2sGOw,5466
|
|
5
|
+
mfu_tracker/tracker.py,sha256=fDGpFm7XKEkwtSb6CHgWsrrS01ic2l8RxVOrvuDtaUY,7362
|
|
6
|
+
mfu_tracker/integrations/__init__.py,sha256=KmKZNbpniR3qqNG5Zloe6Z2tGwi7fgH1zCb6p_8a0po,63
|
|
7
|
+
mfu_tracker/integrations/hf_trainer.py,sha256=aP7ld5HGgOgDu_PQt0vcoOU2hx7ocqp__sHDPnmblgY,6164
|
|
8
|
+
mfu_tracker-0.1.0.dist-info/METADATA,sha256=wQ9f1PHsrcgYa-kB53wlTYht6luRnbS20PFkX-SR3Us,10365
|
|
9
|
+
mfu_tracker-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
10
|
+
mfu_tracker-0.1.0.dist-info/licenses/LICENSE,sha256=aXaBx4UYHF1tx67IIN-nh5BIggjU8q9WWYyN2HupHXA,1077
|
|
11
|
+
mfu_tracker-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jeremias Lino Ferrao
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|