mfu-tracker 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ """mfu-tracker: lightweight MFU and MBU tracking for PyTorch models."""
2
+ from .flops import flash_attn_flops, param_bytes, profile_flops
3
+ from .gpu import GPUSpec, get_gpu_spec
4
+ from .optim import MFUOptimizerWrapper
5
+ from .tracker import UtilizationResult, compute_mbu, compute_mfu, track
6
+
7
+ __all__ = [
8
+ "track",
9
+ "compute_mfu",
10
+ "compute_mbu",
11
+ "profile_flops",
12
+ "flash_attn_flops",
13
+ "param_bytes",
14
+ "get_gpu_spec",
15
+ "GPUSpec",
16
+ "UtilizationResult",
17
+ "MFUOptimizerWrapper",
18
+ ]
mfu_tracker/flops.py ADDED
@@ -0,0 +1,157 @@
1
+ """FLOP counting — FlopCounterMode (PyTorch 2.1+) with thop fallback."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any, Optional
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+
9
+
10
def flash_attn_flops(
    batch: int,
    seq_len: int,
    num_heads: int,
    head_dim: int,
    *,
    causal: bool = True,
    with_backward: bool = False,
) -> int:
    """
    Closed-form FLOP estimate for a single flash-attention call.

    This is an escape hatch: ``profile_flops()`` already accounts for
    ``F.scaled_dot_product_attention`` on CUDA through ``FlopCounterMode``.
    Reach for this helper only when the model invokes the ``flash_attn`` C
    extension directly (``flash_attn_func``), which is rare in modern
    codebases.

    Formula:
        causal=True   →  2 * batch * seq_len² * num_heads * head_dim  FLOPs
        causal=False  →  4 * batch * seq_len² * num_heads * head_dim  FLOPs

    Args:
        batch: Batch size.
        seq_len: Sequence length (enters the formula squared).
        num_heads: Number of attention heads.
        head_dim: Per-head embedding dimension.
        causal: Use the causal-mask cost; the non-causal cost is double.
        with_backward: Multiply the forward count by 3 to include backward.

    Returns:
        The estimated FLOP count as an integer.
    """
    causal_cost = 2 * batch * seq_len * seq_len * num_heads * head_dim
    # A full (non-causal) score matrix costs twice the causal baseline.
    mask_factor = 1 if causal else 2
    # forward + backward is charged as 3x forward.
    pass_factor = 3 if with_backward else 1
    return causal_cost * mask_factor * pass_factor
35
+
36
+
37
def _profile_with_flop_counter(target: nn.Module, call_args: tuple) -> int:
    """Count forward FLOPs with ``torch.utils.flop_counter.FlopCounterMode``.

    The counter (PyTorch 2.1+) works at the ATen dispatch level, so
    ``F.scaled_dot_product_attention`` (SDPA / native flash attention) on CUDA
    is included automatically.

    *target* is switched to eval mode for the measurement and
    ``target.train(<previous mode>)`` is called afterwards, whether or not the
    forward pass succeeded.

    Returns:
        The total FLOP count, or ``-1`` when the counter cannot be imported or
        the forward pass raises.
    """
    try:
        from torch.utils.flop_counter import FlopCounterMode
    except ImportError:
        return -1

    previous_mode = target.training
    total = -1
    target.eval()
    try:
        counter = FlopCounterMode(display=False)
        with torch.no_grad(), counter:
            target(*call_args)
        total = int(counter.get_total_flops())
    except Exception:
        # Best effort: the caller treats -1 as "try the next profiler".
        total = -1
    finally:
        target.train(previous_mode)
    return total
58
+
59
+
60
def _profile_with_thop(target: nn.Module, call_args: tuple) -> int:
    """Count forward FLOPs with ``thop``.

    Fallback for PyTorch < 2.1 or for models the FLOP counter cannot handle.
    ``thop`` is imported lazily so it is only needed when this path runs.
    The MAC count it reports is doubled to obtain FLOPs.
    """
    import thop

    mac_count, _param_count = thop.profile(target, inputs=call_args, verbose=False)
    return int(mac_count * 2)
66
+
67
+
68
def profile_flops(
    model: nn.Module,
    args: Optional[tuple] = None,
    kwargs: Optional[dict[str, Any]] = None,
    *,
    with_backward: bool = True,
) -> int:
    """
    Count FLOPs for one forward (or forward+backward) pass through *model*.

    Uses ``torch.utils.flop_counter.FlopCounterMode`` (PyTorch 2.1+), which
    hooks at the ATen operator level and automatically counts
    ``F.scaled_dot_product_attention`` (SDPA) on CUDA — covering virtually all
    modern transformer attention implementations without any manual correction.
    Falls back to ``thop`` on older PyTorch versions.

    The per-module ``training`` flags of *model* are snapshotted before
    profiling and restored afterwards, so a model that deliberately keeps some
    submodules in eval mode comes back unchanged.

    **Quantized models (bitsandbytes INT8 / NF4):**
    Both counters operate on the Python/ATen graph and cannot see inside opaque
    bitsandbytes CUDA kernels. In practice the FLOPs reported are close to
    correct because NF4 (used by QLoRA) dequantizes weights to fp16 before the
    matmul, so the actual computation is a standard fp16 GEMM. INT8 similarly
    performs an fp16-equivalent matmul after dequantization in most bitsandbytes
    kernels. Pass the appropriate ``dtype`` to ``track()`` / ``compute_mfu()``
    to select the correct peak TFLOPS ceiling::

        # QLoRA (NF4 base + fp16 LoRA adapters) — adapters run in fp16
        flops = profile_flops(model, kwargs=batch, with_backward=False)
        with track(flops, param_bytes(model), dtype="fp16", spec=spec) as r:
            ...

        # Pure INT8 inference
        with track(flops, param_bytes(model), dtype="int8", spec=spec) as r:
            ...

    **PEFT / LoRA MBU:**
    ``param_bytes(model)`` counts all parameters including the frozen base,
    which is correct for the forward pass (all weights are read from memory).
    For a backward-pass MBU estimate that excludes frozen weights, use
    ``param_bytes(model, trainable_only=True)``::

        # Backward MBU for a LoRA-finetuned model
        active_bytes = param_bytes(model, trainable_only=True)

    Args:
        model:         The nn.Module to profile. Should be on CUDA for accurate
                       SDPA counts.
        args:          Positional inputs passed to ``model(*args)``. They are
                       forwarded together with *kwargs* when both are given.
                       Legacy convention: if *kwargs* is given and ``args[0]``
                       is a dict, that dict is used as the keyword arguments
                       and everything else is ignored.
        kwargs:        Keyword inputs (e.g. HF models). Baked into a thin
                       wrapper since both profilers call ``model(*inputs)``.
        with_backward: Include backward-pass FLOPs (default True for training).
                       Backward ≈ 2× forward → 3× total.

    Returns:
        Total integer FLOP count for one step.
    """
    positional = args if args is not None else ()

    if kwargs:
        if positional and isinstance(positional[0], dict):
            # Legacy behaviour kept for compatibility: a batch dict passed
            # positionally wins over *kwargs*.
            call_positional, call_keywords = (), positional[0]
        else:
            call_positional, call_keywords = positional, kwargs

        class _KwargsAdapter(nn.Module):
            """Zero-argument module so the profilers can call ``target()``."""

            def __init__(self, inner: nn.Module) -> None:
                super().__init__()
                # Registering the model as a submodule lets ``eval()`` reach
                # it and lets thop (which walks child modules) see its layers.
                self.inner = inner
                # Mirror the model's mode without touching its children.
                self.training = getattr(inner, "training", True)

            def forward(self):
                return self.inner(*call_positional, **call_keywords)

        target: nn.Module = _KwargsAdapter(model)
        call_args: tuple = ()
    else:
        target = model
        call_args = positional

    # The profilers call eval()/train(), which rewrites every submodule's
    # flag; remember the exact per-module state so it can be put back.
    saved_modes = (
        [(module, module.training) for module in model.modules()]
        if isinstance(model, nn.Module)
        else []
    )
    try:
        forward_flops = _profile_with_flop_counter(target, call_args)
        if forward_flops < 0:
            forward_flops = _profile_with_thop(target, call_args)
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    return forward_flops * 3 if with_backward else forward_flops
142
+
143
+
144
def param_bytes(model: nn.Module, *, trainable_only: bool = False) -> int:
    """
    Sum the storage size, in bytes, of *model*'s parameters (for MBU).

    Each parameter contributes ``numel() * element_size()``.

    Args:
        model: Module whose ``parameters()`` are measured.
        trainable_only: When True, skip parameters with
            ``requires_grad=False``. Useful for PEFT/LoRA backward-pass MBU
            estimates.

    Returns:
        Total parameter bytes; 0 for a module without parameters.
    """
    total = 0
    for tensor in model.parameters():
        if trainable_only and not tensor.requires_grad:
            continue
        total += tensor.numel() * tensor.element_size()
    return total
mfu_tracker/gpu.py ADDED
@@ -0,0 +1,124 @@
1
+ """GPU capability querying and peak throughput derivation."""
2
+ from __future__ import annotations
3
+
4
+ import warnings
5
+ from dataclasses import dataclass, field
6
+ from typing import Optional
7
+
8
+ import torch
9
+
10
# FP16 dense tensor-core FLOPs per SM per clock cycle.
# Keyed by (major, minor) compute capability.
# Cross-checked against NVIDIA's published *dense* (non-sparse) FP16 peaks:
#   A100 SXM4 : 312 TFLOPS ≈ 108 SMs × 2048 × 1.410 GHz
#   H100 SXM5 : 989 TFLOPS ≈ 132 SMs × 3840 × 1.980 GHz  (= 1004, within 2 %)
#   RTX 4090  : 330 TFLOPS ≈ 128 SMs × 1024 × 2.520 GHz
#
# Two figures that are easy to pick up by mistake, and that both skew MFU by
# exactly 2×:
#   * 156 TFLOPS on A100 is the TF32 peak, not FP16.
#   * ~660 TFLOPS on RTX 4090 is the "with sparsity" FP16 peak, not dense.
#
# Data-centre vs. consumer Ampere (8.0 vs. 8.6) differ because GA100's
# tensor cores are physically larger than GA102's.
# NOTE(review): 512 for (8, 6) matches the GeForce FP32-accumulate rate; the
# A10 datasheet figure (125 TFLOPS, 72 SMs) implies 1024 — confirm which
# convention this entry should follow.
_FP16_FLOPS_PER_SM_PER_CLOCK: dict[tuple[int, int], int] = {
    (7, 0): 1024,   # Volta         (V100)
    (7, 5): 1024,   # Turing        (T4, RTX 20xx)
    (8, 0): 2048,   # Ampere DC     (A100, A30)
    (8, 6): 512,    # Ampere cons.  (A10, RTX 30xx)
    (8, 7): 512,    # Ampere Jetson (Orin)
    (8, 9): 1024,   # Ada Lovelace  (RTX 40xx, L40S)
    (9, 0): 3840,   # Hopper        (H100, H200)
    (10, 0): 7680,  # Blackwell     (B100, B200) — estimated ≈ 2× Hopper
}
29
+
30
# Throughput multiplier relative to FP16 dense for each dtype.
# Maps dtype name -> (cc_major_min, multiplier). get_gpu_spec() includes a
# dtype in the per-GPU peak table only when the device's compute-capability
# major is >= cc_major_min, and sets its peak to fp16_peak * multiplier.
# "fp8" is the exception: its availability is decided by _fp8_supported(),
# not by cc_major_min.
# NOTE(review): the gate compares the major version only, so CC 7.0 (Volta)
# also passes the "since Turing" int8/int4 entries below — confirm intended.
_DTYPE_MULTIPLIER: dict[str, tuple[int, float]] = {
    "fp16": (7, 1.0),
    "bf16": (8, 1.0),   # BF16 tensor cores added in Ampere
    "int8": (7, 2.0),   # INT8 tensor cores since Turing
    "fp8":  (9, 2.0),   # FP8 via Transformer Engine; Ada (8.9) also supports it
    "int4": (7, 4.0),   # INT4 since Turing
    "fp4":  (10, 4.0),  # FP4 native in Blackwell
}
# Ada also supports FP8 even though its major is 8: with major == 8, a minor
# version of at least this value counts as FP8-capable.
_ADA_FP8_MINOR = 9
43
+
44
+
45
@dataclass(frozen=True)
class GPUSpec:
    """Immutable description of one GPU and its derived peak throughput."""

    # Device name.
    name: str
    # (major, minor) compute capability.
    compute_capability: tuple[int, int]
    # Number of streaming multiprocessors.
    num_sms: int
    # Core clock in Hz.
    clock_rate_hz: float
    # Memory bandwidth in bytes per second.
    memory_bandwidth_bytes_per_sec: float
    # Peak FLOPS (not TFLOPS) keyed by dtype name; only supported dtypes appear.
    peak_flops_by_dtype: dict[str, float] = field(default_factory=dict)
    # Memory bandwidth expressed in TB/s.
    peak_memory_bandwidth_tbs: float = 0.0

    def peak_tflops(self, dtype: str = "fp16") -> float:
        """Look up the peak dense tensor-core TFLOPS for *dtype*.

        Raises:
            ValueError: If *dtype* has no entry in ``peak_flops_by_dtype``.
        """
        table = self.peak_flops_by_dtype
        if dtype in table:
            return table[dtype] / 1e12

        known = list(table)
        major = self.compute_capability[0]
        minor = self.compute_capability[1]
        raise ValueError(
            f"dtype '{dtype}' not supported on {self.name} "
            f"(CC {major}.{minor}). "
            f"Supported: {known}"
        )
65
+
66
+
67
def _warn_unknown(cc: tuple[int, int]) -> None:
    """Emit a ``UserWarning`` that compute capability *cc* is not recognised.

    ``stacklevel=3`` attributes the warning two frames above this helper.
    """
    message = (
        f"Unknown compute capability {cc[0]}.{cc[1]}; MFU/MBU results may be inaccurate. "
        "Please open an issue at https://github.com/your-repo/mfu-tracker."
    )
    warnings.warn(message, category=UserWarning, stacklevel=3)
74
+
75
+
76
def _fp8_supported(cc: tuple[int, int]) -> bool:
    """Tell whether compute capability *cc* is treated as FP8-capable.

    True for any major version of 9 or above, and for major 8 when the minor
    version is at least ``_ADA_FP8_MINOR``.
    """
    major, minor = cc
    if major >= 9:
        return True
    return major == 8 and minor >= _ADA_FP8_MINOR
79
+
80
+
81
def get_gpu_spec(device: Optional[torch.device] = None) -> GPUSpec:
    """Build a :class:`GPUSpec` for *device* from its CUDA device properties.

    Peak FP16 FLOPS is computed as ``SM count × FLOPs per SM per clock × clock
    rate``; every other dtype is that peak scaled by its entry in
    ``_DTYPE_MULTIPLIER``. Memory bandwidth is computed from the memory clock
    and bus width.

    Args:
        device: CUDA device to query; ``torch.device("cuda")`` when omitted.

    Returns:
        The populated spec. When the compute capability is not in the lookup
        table a ``UserWarning`` is emitted, and the per-SM rate falls back to
        the first table entry with the same major version, or to 1024.
    """
    target_device = torch.device("cuda") if device is None else device
    props = torch.cuda.get_device_properties(target_device)
    cc = (props.major, props.minor)

    per_sm_rate = _FP16_FLOPS_PER_SM_PER_CLOCK.get(cc)
    if per_sm_rate is None:
        # Unknown minor version: borrow the first known entry of this major.
        same_major = [
            rate
            for (major, _minor), rate in _FP16_FLOPS_PER_SM_PER_CLOCK.items()
            if major == props.major
        ]
        _warn_unknown(cc)
        per_sm_rate = same_major[0] if same_major else 1024

    # clock_rate is reported in kHz
    clock_hz = props.clock_rate * 1_000
    fp16_peak = props.multi_processor_count * per_sm_rate * clock_hz

    def _available(dtype_name: str, min_major: int) -> bool:
        # FP8 has its own rule; everything else is gated on the major version.
        if dtype_name == "fp8":
            return _fp8_supported(cc)
        return props.major >= min_major

    # Per-dtype peak FLOPS table for this GPU (unsupported dtypes omitted).
    peak_table: dict[str, float] = {
        dtype_name: fp16_peak * multiplier
        for dtype_name, (min_major, multiplier) in _DTYPE_MULTIPLIER.items()
        if _available(dtype_name, min_major)
    }

    # Memory bandwidth: memory_clock_rate in kHz, bus width in bits, DDR = ×2
    bandwidth = props.memory_clock_rate * 1_000 * (props.memory_bus_width / 8) * 2

    return GPUSpec(
        name=props.name,
        compute_capability=cc,
        num_sms=props.multi_processor_count,
        clock_rate_hz=clock_hz,
        memory_bandwidth_bytes_per_sec=bandwidth,
        peak_flops_by_dtype=peak_table,
        peak_memory_bandwidth_tbs=bandwidth / 1e12,
    )
@@ -0,0 +1,3 @@
1
+ from .hf_trainer import MFUCallback
2
+
3
+ __all__ = ["MFUCallback"]
@@ -0,0 +1,152 @@
1
+ """HuggingFace Trainer integration via TrainerCallback."""
2
+ from __future__ import annotations
3
+
4
+ import warnings
5
+ from typing import Any, Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from transformers import TrainerCallback
10
+
11
+ from ..flops import param_bytes, profile_flops
12
+ from ..gpu import GPUSpec, get_gpu_spec
13
+ from ..tracker import compute_mbu, compute_mfu
14
+
15
+
16
class MFUCallback(TrainerCallback):
    """
    TrainerCallback that logs MFU and MBU at every Trainer logging step.

    FLOPs per micro-batch are ``fwd_flops × (1 + backward_factor)`` where
    ``backward_factor`` defaults to 2.0 (standard 3× convention). Set it higher
    when using gradient checkpointing (typical: 3.0–4.0).

    **Gradient accumulation**: the Trainer fires ``on_step_begin`` /
    ``on_step_end`` once per *optimizer* step, so each timed interval contains
    ``args.gradient_accumulation_steps`` micro-batches. FLOPs and parameter
    bytes are scaled by that factor; without it MFU/MBU would be understated.

    Per-step cost is two non-blocking ``Event.record()`` calls (~10 μs CPU, no
    GPU stall). The single ``torch.cuda.synchronize()`` is deferred to ``on_log``
    and amortised across all steps in the logging interval.

    Usage::

        from mfu_tracker.integrations.hf_trainer import MFUCallback

        callback = MFUCallback(sample_batch=next(iter(train_dataloader)), dtype="bf16")
        # num_gpus defaults to 1 — see the DDP / FSDP note below
        trainer = Trainer(..., callbacks=[callback])

    **torch.compile**: profile_flops is called at ``on_train_begin``, before the
    first compiled step. This is correct — ``torch.compile`` does not change the
    FLOP count (same math, just faster execution). The MFU improvement from
    compilation is captured automatically in the CUDA event timing of real steps.
    Do not pass a compiled model to this callback directly; let HF Trainer compile
    after the callback is registered.

    **DDP / FSDP**: leave ``num_gpus=1`` — per-GPU MFU equals global MFU for
    data-parallel jobs. ``num_gpus`` is never auto-detected.

    Args:
        sample_batch:    A representative batch dict passed as ``**kwargs`` to the
                         model. Used once at training start to profile forward FLOPs.
        dtype:           Compute dtype for the peak ceiling — "fp16", "bf16", etc.
        num_gpus:        GPUs in the peak ceiling (default 1).
        backward_factor: Multiplier for backward pass cost (default 2.0). Set
                         higher when using gradient checkpointing (typical: 3.0–4.0).
        metric_prefix:   Prefix for logged metric names (default ``"throughput"``).
                         Results in ``throughput/mfu`` and ``throughput/mbu``, which
                         WandB groups into its own section away from loss/lr. Set to
                         ``""`` to log bare ``mfu`` / ``mbu`` keys.
        device:          CUDA device to query. Defaults to current device.
    """

    def __init__(
        self,
        sample_batch: dict[str, Any],
        dtype: str = "bf16",
        num_gpus: int = 1,
        backward_factor: float = 2.0,
        metric_prefix: str = "throughput",
        device: Optional[torch.device] = None,
    ) -> None:
        self.sample_batch = sample_batch
        self.dtype = dtype
        self.num_gpus = num_gpus
        self.backward_factor = backward_factor
        self.metric_prefix = metric_prefix
        self.device = device

        self._model: Optional[nn.Module] = None
        self._spec: Optional[GPUSpec] = None
        self._fwd_flops: Optional[int] = None
        self._param_bytes: Optional[int] = None

        # Finished steps waiting for the next on_log; each entry is an
        # (e_start, e_end) pair of CUDA events.
        self._pending: list[tuple] = []
        # Events of the step currently in flight; None between steps.
        self._pending_step: Optional[tuple] = None

    def _profile(self, model: nn.Module) -> None:
        """Query the GPU and profile forward FLOPs once for *model*.

        On profiling failure a warning is emitted and ``_fwd_flops`` stays
        None, which disables logging in the step/log hooks.
        """
        self._spec = get_gpu_spec(self.device)
        self._param_bytes = param_bytes(model)
        try:
            # Move sample batch to the model's device (Trainer may have moved the model).
            model_device = next(model.parameters()).device
            batch = {
                k: v.to(model_device) if isinstance(v, torch.Tensor) else v
                for k, v in self.sample_batch.items()
            }
            self._fwd_flops = profile_flops(
                model, kwargs=batch, with_backward=False
            )
        except Exception as exc:
            warnings.warn(
                f"mfu-tracker: FLOP profiling failed ({exc}); MFU will not be logged.",
                stacklevel=2,
            )

    @staticmethod
    def _micro_batches_per_step(args) -> int:
        """Number of forward/backward passes inside one optimizer step."""
        accumulation = getattr(args, "gradient_accumulation_steps", 1)
        try:
            return max(1, int(accumulation))
        except (TypeError, ValueError):
            return 1

    # --- TrainerCallback protocol -------------------------------------------

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        """Profile the model once, provided a model and CUDA are available."""
        if model is not None and torch.cuda.is_available():
            self._model = model
            self._profile(model)

    def on_step_begin(self, args, state, control, **kwargs):
        """Record the start event for the step that is about to run."""
        if self._fwd_flops is None or not torch.cuda.is_available():
            return

        e_start = torch.cuda.Event(enable_timing=True)
        e_end = torch.cuda.Event(enable_timing=True)
        e_start.record()
        self._pending_step = (e_start, e_end)

    def on_step_end(self, args, state, control, **kwargs):
        """Record the end event and queue the pair for the next ``on_log``."""
        if self._pending_step is None:
            return

        e_start, e_end = self._pending_step
        self._pending_step = None
        e_end.record()
        self._pending.append((e_start, e_end))

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Add ``mfu`` / ``mbu`` for the steps accumulated since the last log.

        Mutates *logs* in place. Does nothing when there is no log dict, no
        finished step, no FLOP profile, or the measured time is not positive.
        """
        if logs is None or not self._pending or self._fwd_flops is None:
            return

        # Single sync amortised across all accumulated steps.
        torch.cuda.synchronize(self.device)

        n_steps = len(self._pending)
        total_ms = sum(e_start.elapsed_time(e_end) for e_start, e_end in self._pending)
        self._pending.clear()

        elapsed_sec = total_ms / 1000
        if elapsed_sec <= 0:
            return

        # Every timed optimizer step ran this many forward/backward passes.
        micro_batches = n_steps * self._micro_batches_per_step(args)
        total_flops = int(self._fwd_flops * micro_batches * (1 + self.backward_factor))

        prefix = f"{self.metric_prefix}/" if self.metric_prefix else ""
        logs[f"{prefix}mfu"] = round(
            compute_mfu(total_flops, elapsed_sec, dtype=self.dtype, num_gpus=self.num_gpus, spec=self._spec), 4
        )
        logs[f"{prefix}mbu"] = round(
            compute_mbu(self._param_bytes * micro_batches, elapsed_sec, num_gpus=self.num_gpus, spec=self._spec), 4
        )
mfu_tracker/optim.py ADDED
@@ -0,0 +1,155 @@
1
+ """Optimizer wrapper that measures MFU and MBU per training step."""
2
+ from __future__ import annotations
3
+
4
+ import warnings
5
+ from contextlib import contextmanager
6
+ from typing import Any, Generator, Optional
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from .flops import param_bytes, profile_flops
12
+ from .gpu import GPUSpec, get_gpu_spec
13
+ from .tracker import UtilizationResult
14
+
15
+
16
class MFUOptimizerWrapper:
    """
    Wraps any ``torch.optim.Optimizer`` to automatically track MFU and MBU.

    FLOPs are profiled once on the uncompiled model and scaled by
    ``1 + backward_factor`` (default 2.0, the standard forward + 2× backward
    convention). Set ``backward_factor`` higher when using gradient checkpointing,
    which recomputes activations during backward (typical values: 3.0–4.0).

    ``zero_grad()`` is called automatically at the *start* of ``track_step()``.
    Call ``optimizer.step()`` **after** the block so it is excluded from the
    timing window::

        optimizer = MFUOptimizerWrapper(
            torch.optim.AdamW(model.parameters(), lr=1e-4),
            model=model,
            sample_batch=sample_batch,
            dtype="bf16",
        )

        for batch in dataloader:
            with optimizer.track_step() as result:
                output = model(**batch)
                loss = output.loss
                loss.backward()
            optimizer.step()
            print(f"MFU={result.mfu:.1%}  MBU={result.mbu:.1%}")

    **torch.compile**: profile the uncompiled model first, then compile::

        optimizer = MFUOptimizerWrapper(raw_optimizer, model=raw_model, ...)
        optimizer.profile()          # profile before compile
        model = torch.compile(raw_model)

    Attributes that this wrapper does not define itself are looked up on the
    wrapped optimizer.

    Args:
        optimizer:       The optimizer to wrap.
        model:           Model whose forward FLOPs and parameter bytes are profiled.
        sample_batch:    Batch dict passed as ``**kwargs`` to the model when profiling.
        dtype:           Compute dtype used for the peak ceiling.
        num_gpus:        GPUs in the peak ceiling (default 1).
        backward_factor: Backward cost as a multiple of forward (default 2.0).
        device:          CUDA device to query; current device when None.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        model: nn.Module,
        sample_batch: dict[str, Any],
        dtype: str = "bf16",
        num_gpus: int = 1,
        backward_factor: float = 2.0,
        device: Optional[torch.device] = None,
    ) -> None:
        self.optimizer = optimizer
        self._model = model
        self._sample_batch = sample_batch
        self._dtype = dtype
        self._num_gpus = num_gpus
        self._backward_factor = backward_factor
        self._device = device

        # Populated by _profile_once(); _spec doubles as the "profiled" flag.
        self._spec: Optional[GPUSpec] = None
        self._fwd_flops: Optional[int] = None
        self._param_bytes: Optional[int] = None

    def profile(self) -> None:
        """
        Explicitly profile FLOPs on the current (uncompiled) model.

        Call this before ``torch.compile`` so the FLOP count is measured on the
        original graph. If not called, profiling happens lazily on the first
        ``track_step()`` — which may be too late if the model is already compiled::

            optimizer = MFUOptimizerWrapper(raw_optimizer, model, sample_batch, dtype="bf16")
            optimizer.profile()              # profile uncompiled model
            model = torch.compile(model)     # compile after profiling

        Does nothing if profiling has already happened.
        """
        if self._spec is None:
            self._profile_once()

    def _profile_once(self) -> None:
        """Query the GPU, profile forward FLOPs and measure parameter bytes.

        A profiling failure is downgraded to a warning; ``_fwd_flops`` then
        stays None and results carry no FLOP total.
        """
        self._spec = get_gpu_spec(self._device)
        try:
            self._fwd_flops = profile_flops(
                self._model,
                kwargs=self._sample_batch,
                with_backward=False,
            )
        except Exception as exc:
            warnings.warn(
                f"mfu-tracker: profiling failed ({exc}); MFU will not be populated.",
                stacklevel=3,
            )
        self._param_bytes = param_bytes(self._model)

    @contextmanager
    def track_step(self) -> Generator[UtilizationResult, None, None]:
        """
        Context manager that wraps one training step and populates a
        :class:`~mfu_tracker.UtilizationResult` with MFU and MBU.

        ``optimizer.zero_grad()`` is called automatically at the *start* of the
        block. Call ``optimizer.step()`` **after** the block so it is excluded
        from the timing window::

            with wrapped.track_step() as result:
                out = model(**batch)
                out.loss.backward()
            wrapped.step()

        FLOPs are ``fwd_flops × (1 + backward_factor)`` where ``backward_factor``
        defaults to 2.0 (standard 3× convention). Set it higher when using
        gradient checkpointing (typical: 3.0–4.0).

        The end event is recorded even if the block raises. No GPU sync happens
        here; the result resolves its values on first access.
        """
        if self._spec is None:
            self._profile_once()

        self.optimizer.zero_grad()

        e_start = torch.cuda.Event(enable_timing=True)
        e_end = torch.cuda.Event(enable_timing=True)

        result = UtilizationResult(dtype=self._dtype, gpu_spec=self._spec, num_gpus=self._num_gpus)

        e_start.record()
        try:
            yield result
        finally:
            e_end.record()
            # Hand the events and totals to the result for lazy resolution.
            result._e_start = e_start
            result._e_end = e_end
            result._total_flops = (
                int(self._fwd_flops * (1 + self._backward_factor))
                if self._fwd_flops is not None else None
            )
            result._param_bytes = self._param_bytes
            result._device = self._device

    # --- Proxy the underlying optimizer ------------------------------------

    def step(self, *args, **kwargs) -> None:
        """Forward to the wrapped optimizer's ``step``; its return value is discarded."""
        self.optimizer.step(*args, **kwargs)

    def zero_grad(self, *args, **kwargs) -> None:
        """Forward to the wrapped optimizer's ``zero_grad``."""
        self.optimizer.zero_grad(*args, **kwargs)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes not found on the wrapper to the optimizer.

        Raises:
            AttributeError: If the wrapped optimizer lacks *name*, or if the
                wrapper has no ``optimizer`` attribute yet.
        """
        # __getattr__ only runs after normal lookup has failed. If "optimizer"
        # itself is missing (copy and pickle build the instance without calling
        # __init__), reading self.optimizer below would re-enter this method
        # and recurse until RecursionError.
        if name == "optimizer":
            raise AttributeError(name)
        return getattr(self.optimizer, name)
mfu_tracker/tracker.py ADDED
@@ -0,0 +1,211 @@
1
+ """MFU and MBU measurement — context manager and standalone functions."""
2
+ from __future__ import annotations
3
+
4
+ import time
5
+ from contextlib import contextmanager
6
+ from dataclasses import dataclass, field
7
+ from typing import Generator, Optional
8
+
9
+ import torch
10
+
11
+ from .gpu import GPUSpec, get_gpu_spec
12
+
13
+
14
@dataclass
class UtilizationResult:
    """
    Result holder populated after a ``track()`` or ``track_step()`` block exits.

    Fields backed by CUDA events (from ``MFUOptimizerWrapper.track_step()``) are
    computed lazily on first access — the GPU sync is deferred until the value is
    actually needed. Fields from the CPU-timed ``track()`` context manager are
    populated eagerly since ``torch.cuda.synchronize`` is already called there.

    Every public value (``mfu``, ``mbu``, ``elapsed_sec``, ``achieved_tflops``,
    ``achieved_tbs``) is ``None`` until it has been set or resolved. On the
    lazy path a value also stays ``None`` when its input is unavailable (no
    FLOP total → no ``mfu`` / ``achieved_tflops``) or when the measured
    elapsed time is not positive.
    """

    dtype: str = "fp16"
    gpu_spec: Optional[GPUSpec] = None
    num_gpus: int = 1

    # Eagerly-set fields (track()) or lazily-set after _resolve() (track_step()).
    _mfu: Optional[float] = field(default=None, repr=False)
    _mbu: Optional[float] = field(default=None, repr=False)
    _elapsed_sec: Optional[float] = field(default=None, repr=False)
    _achieved_tflops: Optional[float] = field(default=None, repr=False)
    _achieved_tbs: Optional[float] = field(default=None, repr=False)

    # Set by MFUOptimizerWrapper for lazy resolution.
    _e_start: Optional[object] = field(default=None, repr=False)
    _e_end: Optional[object] = field(default=None, repr=False)
    _total_flops: Optional[int] = field(default=None, repr=False)
    _param_bytes: Optional[int] = field(default=None, repr=False)
    _device: Optional[torch.device] = field(default=None, repr=False)

    def _resolve(self) -> None:
        """Sync and compute fields from the CUDA events, at most once.

        No-op when there are no pending events or when neither a FLOP total
        nor a byte total is available. Otherwise elapsed time is always
        recorded, and each utilisation figure is computed from whichever
        total is present — a failed FLOP profile no longer hides MBU.
        """
        if self._e_start is None:
            return
        if self._total_flops is None and self._param_bytes is None:
            return
        torch.cuda.synchronize(self._device)
        elapsed = self._e_start.elapsed_time(self._e_end) / 1000
        self._elapsed_sec = elapsed

        # A non-positive interval would divide by zero; leave the rates unset.
        if elapsed > 0:
            if self._total_flops is not None:
                peak_tflops = self.gpu_spec.peak_tflops(self.dtype) * self.num_gpus
                self._achieved_tflops = self._total_flops / elapsed / 1e12
                self._mfu = self._achieved_tflops / peak_tflops
            if self._param_bytes is not None:
                peak_tbs = self.gpu_spec.peak_memory_bandwidth_tbs * self.num_gpus
                self._achieved_tbs = self._param_bytes / elapsed / 1e12
                self._mbu = self._achieved_tbs / peak_tbs

        self._e_start = None  # mark resolved

    @property
    def mfu(self) -> Optional[float]:
        """Model FLOPs utilisation as a fraction of peak, or None."""
        self._resolve()
        return self._mfu

    @mfu.setter
    def mfu(self, v: Optional[float]) -> None:
        self._mfu = v

    @property
    def mbu(self) -> Optional[float]:
        """Memory bandwidth utilisation as a fraction of peak, or None."""
        self._resolve()
        return self._mbu

    @mbu.setter
    def mbu(self, v: Optional[float]) -> None:
        self._mbu = v

    @property
    def elapsed_sec(self) -> Optional[float]:
        """Measured duration of the tracked block in seconds, or None."""
        self._resolve()
        return self._elapsed_sec

    @elapsed_sec.setter
    def elapsed_sec(self, v: Optional[float]) -> None:
        self._elapsed_sec = v

    @property
    def achieved_tflops(self) -> Optional[float]:
        """Achieved compute throughput in TFLOPS, or None."""
        self._resolve()
        return self._achieved_tflops

    @achieved_tflops.setter
    def achieved_tflops(self, v: Optional[float]) -> None:
        self._achieved_tflops = v

    @property
    def achieved_tbs(self) -> Optional[float]:
        """Achieved memory throughput in TB/s, or None."""
        self._resolve()
        return self._achieved_tbs

    @achieved_tbs.setter
    def achieved_tbs(self, v: Optional[float]) -> None:
        self._achieved_tbs = v
106
+
107
+
108
+ @contextmanager
109
+ def track(
110
+ flop_count: int,
111
+ param_bytes: int,
112
+ *,
113
+ dtype: str = "fp16",
114
+ num_gpus: int = 1,
115
+ device: Optional[torch.device] = None,
116
+ spec: Optional[GPUSpec] = None,
117
+ ) -> Generator[UtilizationResult, None, None]:
118
+ """
119
+ Context manager that measures MFU and MBU for an arbitrary compute block.
120
+
121
+ Args:
122
+ flop_count: Total FLOPs for the block (use flops.profile_flops or your own).
123
+ param_bytes: Bytes transferred for weights (num_params * bytes_per_element).
124
+ dtype: Compute dtype string — "fp16", "bf16", "int8", "fp8", "int4", "fp4".
125
+ num_gpus: Multiplier applied to the peak ceiling (default 1). When using
126
+ ``profile_flops`` as the source of *flop_count*, leave this at 1
127
+ regardless of parallelism strategy — ``profile_flops`` returns
128
+ per-GPU FLOPs, and per-GPU MFU equals global MFU for all standard
129
+ parallelism types (the N factors cancel). Only set this when
130
+ *flop_count* is the analytically-derived *full-model* FLOP count
131
+ (e.g. ``6 × params × tokens``) rather than a profiled per-GPU count.
132
+ device: CUDA device to measure against (default: current device).
133
+ spec: Pre-queried GPUSpec; fetched once if not provided.
134
+
135
+ Yields a :class:`UtilizationResult` whose fields are ``None`` until the block
136
+ exits, then populated with measured values.
137
+
138
+ Example — single GPU::
139
+
140
+ flops = profile_flops(model, args=(sample,), with_backward=True)
141
+ with track(flops, param_bytes(model), dtype="bf16") as result:
142
+ loss = model(inputs)
143
+ loss.backward()
144
+ optimizer.step()
145
+ print(f"MFU={result.mfu:.1%} MBU={result.mbu:.1%}")
146
+
147
+ Example — any parallelism strategy (DDP, FSDP, tensor, pipeline)::
148
+
149
+ # profile_flops on whatever model shard the local rank holds gives
150
+ # per-GPU FLOPs; per-GPU MFU == global MFU for all standard strategies.
151
+ with track(flops, param_bytes(model), dtype="bf16") as r:
152
+ ...
153
+ """
154
+ if spec is None:
155
+ spec = get_gpu_spec(device)
156
+
157
+ peak_tflops = spec.peak_tflops(dtype) * num_gpus
158
+ peak_tbs = spec.peak_memory_bandwidth_tbs * num_gpus
159
+
160
+ result = UtilizationResult(dtype=dtype, gpu_spec=spec, num_gpus=num_gpus)
161
+
162
+ torch.cuda.synchronize(device)
163
+ t0 = time.perf_counter()
164
+ yield result
165
+ torch.cuda.synchronize(device)
166
+ elapsed = time.perf_counter() - t0
167
+
168
+ result.elapsed_sec = elapsed
169
+ result.achieved_tflops = flop_count / elapsed / 1e12
170
+ result.achieved_tbs = param_bytes / elapsed / 1e12
171
+ result.mfu = result.achieved_tflops / peak_tflops
172
+ result.mbu = result.achieved_tbs / peak_tbs
173
+
174
+
175
def compute_mfu(
    flop_count: int,
    elapsed_sec: float,
    *,
    dtype: str = "fp16",
    num_gpus: int = 1,
    device: Optional[torch.device] = None,
    spec: Optional[GPUSpec] = None,
) -> float:
    """
    Compute MFU directly from a FLOP count and a duration.

    Args:
        flop_count: FLOPs executed during the interval.
        elapsed_sec: Length of the interval in seconds.
        dtype: Compute dtype whose peak is used as the ceiling.
        num_gpus: GPUs in the peak ceiling. See :func:`track` for guidance.
        device: Device queried when *spec* is not supplied.
        spec: Pre-queried GPUSpec; looked up via ``get_gpu_spec`` when None.

    Returns:
        Achieved TFLOPS divided by ``peak TFLOPS × num_gpus``.
    """
    gpu = spec if spec is not None else get_gpu_spec(device)
    achieved_tflops = flop_count / elapsed_sec / 1e12
    ceiling_tflops = gpu.peak_tflops(dtype) * num_gpus
    return achieved_tflops / ceiling_tflops
193
+
194
+
195
def compute_mbu(
    param_bytes: int,
    elapsed_sec: float,
    *,
    num_gpus: int = 1,
    device: Optional[torch.device] = None,
    spec: Optional[GPUSpec] = None,
) -> float:
    """
    Compute MBU directly from a byte count and a duration.

    Args:
        param_bytes: Bytes moved during the interval.
        elapsed_sec: Length of the interval in seconds.
        num_gpus: GPUs in the peak ceiling. See :func:`track` for guidance.
        device: Device queried when *spec* is not supplied.
        spec: Pre-queried GPUSpec; looked up via ``get_gpu_spec`` when None.

    Returns:
        Achieved TB/s divided by ``peak TB/s × num_gpus``.
    """
    gpu = spec if spec is not None else get_gpu_spec(device)
    achieved_tbs = param_bytes / elapsed_sec / 1e12
    ceiling_tbs = gpu.peak_memory_bandwidth_tbs * num_gpus
    return achieved_tbs / ceiling_tbs
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.4
2
+ Name: mfu-tracker
3
+ Version: 0.1.0
4
+ Summary: Lightweight Model FLOPs Utilization and Bandwidth Utilization tracker for PyTorch
5
+ License: MIT License
6
+
7
+ Copyright (c) 2026 Jeremias Lino Ferrao
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+ License-File: LICENSE
27
+ Requires-Python: >=3.9
28
+ Requires-Dist: numpy>=2.0.2
29
+ Requires-Dist: thop>=0.1.1.post2209072238
30
+ Requires-Dist: torch>=2.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest-cov; extra == 'dev'
33
+ Requires-Dist: pytest>=7.0; extra == 'dev'
34
+ Provides-Extra: hf
35
+ Requires-Dist: transformers>=4.30; extra == 'hf'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # mfu-tracker
39
+
40
+ When profiling training runs, I found that most existing tools either lacked MFU/MBU support entirely or dragged in hundreds of megabytes of transitive dependencies. This library is an attempt at a self-contained alternative.
41
+
42
+ **mfu-tracker** is a PyTorch library for measuring Model FLOPs Utilization (MFU) and Model Bandwidth Utilization (MBU). It supports bare PyTorch training loops, an optimizer wrapper, and a HuggingFace Trainer callback.
43
+
44
+ - **Minimal dependencies** — PyTorch, NumPy, and `thop` only
45
+ - **Profiled FLOPs, not formula estimates** — uses `FlopCounterMode` to count the FLOPs your model actually executes rather than a formula like `6 × params × tokens`. For Mixture-of-Experts models this means only active experts are counted, giving a more accurate numerator than parameter-based estimates.
46
+ - **Three integration styles** — context manager, optimizer wrapper, HF Trainer callback
47
+ - **WandB / TensorBoard / MLflow** — metrics are logged through HF Trainer's existing pipeline when using `MFUCallback`
48
+
49
+ MFU as a training efficiency metric was introduced in the [PaLM paper](https://arxiv.org/abs/2204.02311) (Chowdhery et al., 2022).
50
+
51
+ ---
52
+
53
+ ## What MFU and MBU measure
54
+
55
+ **MFU (Model FLOPs Utilization)** is the ratio of observed FLOP throughput to the GPU's theoretical peak for the given dtype. A value of 0.50 means the model is executing at half the GPU's rated peak. Well-optimized large models on modern hardware typically fall in the 0.40–0.60 range; small models often land much lower due to kernel dispatch overhead relative to compute time.
56
+
57
+ **MBU (Model Bandwidth Utilization)** as computed here is a proxy, not a direct DRAM measurement. It is defined as:
58
+
59
+ ```
60
+ MBU = (param_bytes / elapsed_sec) / peak_memory_bandwidth
61
+ ```
62
+
63
+ where `param_bytes` is the total size of model parameters and `elapsed_sec` is wall time. This assumes one full pass through model weights per step and does not account for activation memory, gradients, optimizer state, or data layout effects. It is most useful as a relative indicator across runs rather than an absolute efficiency measure.
64
+
65
+ If both MFU and MBU are low simultaneously, the GPU is underutilized. Two common causes: kernel dispatch overhead (the CPU cannot issue kernels fast enough to keep the GPU busy — `torch.compile` reduces this by fusing operations), or CPU-side pipeline stalls (slow DataLoader, heavy host preprocessing, or host-to-device transfers in the hot path).
66
+
67
+ ---
68
+
69
+ ## Installation
70
+
71
+ ```bash
72
+ pip install mfu-tracker
73
+ ```
74
+
75
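+ HuggingFace Trainer integration requires no extra install — if you are already running HF Trainer, `transformers` is already available. Import `MFUCallback` directly. If `transformers` is not installed yet, the optional `hf` extra pulls it in: `pip install "mfu-tracker[hf]"`.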
76
+
77
+ ---
78
+
79
+ ## Usage
80
+
81
+ ### Context manager (bare PyTorch)
82
+
83
+ ```python
84
+ from mfu_tracker import track, profile_flops, param_bytes
85
+
86
+ # Profile once on the uncompiled model before training begins
87
+ sample = {"input_ids": next(iter(dataloader))["input_ids"]}  # full batch, so the FLOP count matches one training step
88
+ flops = profile_flops(model, kwargs=sample, with_backward=True)
89
+ p_bytes = param_bytes(model)
90
+
91
+ for batch in dataloader:
92
+ optimizer.zero_grad()
93
+ with track(flops, p_bytes, dtype="bf16") as result:
94
+ loss = model(**batch).loss
95
+ loss.backward()
96
+ optimizer.step()
97
+
98
+ print(f"MFU: {result.mfu:.3f} MBU: {result.mbu:.3f} {result.elapsed_sec*1000:.0f} ms/step")
99
+ ```
100
+
101
+ ### Optimizer wrapper
102
+
103
+ ```python
104
+ from mfu_tracker import MFUOptimizerWrapper
105
+
106
+ base_optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
107
+ optimizer = MFUOptimizerWrapper(
108
+ base_optimizer, model,
109
+ sample_batch={"input_ids": sample_ids},
110
+ dtype="bf16",
111
+ )
112
+
113
+ # Profile before compiling — FlopCounterMode may not trace compiled graphs
114
+ optimizer.profile()
115
+ model = torch.compile(model)
116
+
117
+ for step, batch in enumerate(dataloader):
118
+ with optimizer.track_step() as result: # calls zero_grad() at block entry
119
+ loss = model(**batch).loss
120
+ loss.backward()
121
+ optimizer.step() # outside the timing window
122
+
123
+ if step % 10 == 0:
124
+ print(f"MFU {result.mfu:.3f} MBU {result.mbu:.3f}")
125
+ ```
126
+
127
+ ### HuggingFace Trainer
128
+
129
+ ```python
130
+ from mfu_tracker.integrations.hf_trainer import MFUCallback
131
+
132
+ sample_batch = {k: v[:batch_size] for k, v in next(iter(train_dataloader)).items()}
133
+
134
+ callback = MFUCallback(
135
+ sample_batch=sample_batch,
136
+ dtype="bf16",
137
+ metric_prefix="throughput", # logs throughput/mfu and throughput/mbu
138
+ )
139
+
140
+ trainer = Trainer(
141
+ model=model,
142
+ args=training_args,
143
+ train_dataset=train_dataset,
144
+ callbacks=[callback],
145
+ )
146
+ trainer.train()
147
+ ```
148
+
149
+ `throughput/mfu` and `throughput/mbu` are added to the Trainer log dict at each logging step and forwarded automatically to any configured integrations (WandB, TensorBoard, MLflow). WandB groups metrics by the `/` separator, so these appear in a distinct "throughput" section rather than alongside loss and learning rate.
150
+
151
+ ---
152
+
153
+ ## FLOP counting
154
+
155
+ ```python
156
+ from mfu_tracker import profile_flops, flash_attn_flops, param_bytes
157
+
158
+ # Standard models — FlopCounterMode counts SDPA automatically on CUDA
159
+ flops = profile_flops(model, kwargs=batch, with_backward=True)
160
+
161
+ # Models calling flash_attn_func directly (rare; older HF with use_flash_attention_2=True)
162
+ # need a manual correction since the C extension is opaque to FlopCounterMode:
163
+ flops += num_layers * flash_attn_flops(batch=B, seq_len=S, num_heads=H, head_dim=D, with_backward=True)
164
+
165
+ # PEFT / LoRA — restrict param_bytes to trainable parameters only
166
+ p_bytes = param_bytes(model, trainable_only=True)
167
+ ```
168
+
169
+ `with_backward=True` applies the standard 3× convention (1× forward + 2× backward). For gradient checkpointing, pass `backward_factor=3.0` or `4.0` to `MFUOptimizerWrapper` or `MFUCallback`.
170
+
171
+ ---
172
+
173
+ ## GPU spec
174
+
175
+ ```python
176
+ from mfu_tracker import get_gpu_spec
177
+
178
+ spec = get_gpu_spec()
179
+ print(spec.name) # e.g. "NVIDIA GeForce RTX 4080"
180
+ print(spec.peak_tflops("fp16")) # e.g. 97.6
181
+ print(spec.peak_tflops("fp8")) # Ada Lovelace (CC 8.9) and Hopper (CC 9.0)+
182
+ print(spec.peak_memory_bandwidth_tbs) # e.g. 0.717
183
+ ```
184
+
185
+ Supported dtypes: `fp32`, `fp16`, `bf16`, `int8`, `fp8`, `int4`, `fp4`. Unrecognized compute capabilities fall back to the nearest known major version with a `UserWarning`.
186
+
187
+ ---
188
+
189
+ ## Benchmark (RTX 4080, GPT-2 124M, fp16)
190
+
191
+ | Configuration | MFU | ms/step |
192
+ |---|---|---|
193
+ | batch=1 · eager | ~0.027 | ~40 ms |
194
+ | batch=8 · eager | ~0.09 | ~93 ms |
195
+ | batch=8 · sdpa | ~0.12 | ~74 ms |
196
+ | batch=8 · sdpa + compile | ~0.17 | ~50 ms |
197
+ | batch=16 · sdpa + compile | ~0.16 | ~104 ms |
198
+
199
+ GPT-2 (124M) is a small model relative to the compute capacity of a modern GPU, so low MFU is expected — the model spends a large fraction of step time waiting for kernel dispatch rather than doing arithmetic. Larger models (e.g. LLaMA-70B) typically reach 0.40–0.60 MFU. The improvement from `torch.compile` reflects kernel fusion reducing dispatch overhead. I'll add some testing on this later.
200
+
201
+ ```bash
202
+ python examples/benchmark_mfu.py --help
203
+ python examples/hf_trainer_mfu.py --dtype bf16 --batch-size 16
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Multi-GPU
209
+
210
+ Leave `num_gpus=1` (the default) when using `profile_flops` as the FLOP source. For data-parallel strategies (DDP, FSDP), per-GPU FLOPs equal total FLOPs divided by N and wall time is the same on all ranks, so per-GPU MFU equals global MFU and the N factors cancel. Set `num_gpus > 1` only when pairing an analytically-derived full-model FLOP count (e.g. `6 × params × tokens`) with a total-job peak ceiling.
211
+
212
+ ---
213
+
214
+ ## Limitations
215
+
216
+ - **SDPA on CPU is not counted** — `FlopCounterMode` does not intercept flash attention dispatch on CPU. Profile with a CUDA model.
217
+ - **bitsandbytes quantized layers** — INT8/NF4 kernels are opaque to `FlopCounterMode`. NF4 dequantizes to fp16 before the matmul, so FLOP counts are approximately correct. Pass the appropriate dtype to use the right peak ceiling.
218
+ - **`flash_attn_func` direct calls** — models bypassing `F.scaled_dot_product_attention` need a manual `flash_attn_flops()` correction (see above).
219
+ - **Peak ceilings from spec sheets** — these are not independently measured. MFU > 1.0 indicates the ceiling is underestimated.
220
+ - **MBU is a proxy** — the formula uses parameter bytes as a stand-in for memory traffic; actual DRAM traffic (activations, gradients, optimizer state) is higher and not measured.
221
+ - I have not tested the library extensively yet; please open an issue if you encounter any bugs or unexpected behavior.
222
+
223
+ ---
224
+
225
+ ## Requirements
226
+
227
+ - Python 3.9+
228
+ - PyTorch 2.0+ (2.1+ recommended for `FlopCounterMode`)
229
+ - A CUDA GPU is required for meaningful results; CPU timing works but MFU will be near zero for any realistic model
230
+
231
+ ---
232
+
233
+ ## License
234
+
235
+ MIT
@@ -0,0 +1,11 @@
1
+ mfu_tracker/__init__.py,sha256=gvoCPl6mEHBRV7Y_q_FLJjK16JQMsQ11nfgrpQiF8-0,503
2
+ mfu_tracker/flops.py,sha256=NwCrPXRNXPJMHSWEcb5SSWxEZEqkGocwdNY0NJ-ZeiU,5762
3
+ mfu_tracker/gpu.py,sha256=60ZSbSe1cAyfjPm-uqZR7ptRBYDFno6SJJO32VGePwA,4689
4
+ mfu_tracker/optim.py,sha256=kOfQfDMC7oPpWGGRHROs4Cwv9vxFAoe3GpYlV_2sGOw,5466
5
+ mfu_tracker/tracker.py,sha256=fDGpFm7XKEkwtSb6CHgWsrrS01ic2l8RxVOrvuDtaUY,7362
6
+ mfu_tracker/integrations/__init__.py,sha256=KmKZNbpniR3qqNG5Zloe6Z2tGwi7fgH1zCb6p_8a0po,63
7
+ mfu_tracker/integrations/hf_trainer.py,sha256=aP7ld5HGgOgDu_PQt0vcoOU2hx7ocqp__sHDPnmblgY,6164
8
+ mfu_tracker-0.1.0.dist-info/METADATA,sha256=wQ9f1PHsrcgYa-kB53wlTYht6luRnbS20PFkX-SR3Us,10365
9
+ mfu_tracker-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
10
+ mfu_tracker-0.1.0.dist-info/licenses/LICENSE,sha256=aXaBx4UYHF1tx67IIN-nh5BIggjU8q9WWYyN2HupHXA,1077
11
+ mfu_tracker-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jeremias Lino Ferrao
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.