pytscope 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. pytscope-0.2.1/.gitignore +13 -0
  2. pytscope-0.2.1/CHANGELOG.md +162 -0
  3. pytscope-0.2.1/LICENSE +21 -0
  4. pytscope-0.2.1/PKG-INFO +367 -0
  5. pytscope-0.2.1/README.md +324 -0
  6. pytscope-0.2.1/docs/README.md +20 -0
  7. pytscope-0.2.1/docs/VALIDATION.md +120 -0
  8. pytscope-0.2.1/docs/architecture.md +109 -0
  9. pytscope-0.2.1/docs/diagnostics.md +140 -0
  10. pytscope-0.2.1/docs/usage.md +197 -0
  11. pytscope-0.2.1/docs/validation-runs/2026-06-08-kaggle-2xT4/RESULTS.md +113 -0
  12. pytscope-0.2.1/docs/validation-runs/2026-06-08-kaggle-2xT4/exp1_straggler.txt +40 -0
  13. pytscope-0.2.1/docs/validation-runs/2026-06-08-kaggle-2xT4/exp2_bad_overlap.txt +12 -0
  14. pytscope-0.2.1/docs/validation-runs/2026-06-08-kaggle-2xT4/exp2_good_overlap.txt +12 -0
  15. pytscope-0.2.1/docs/validation-runs/2026-06-08-kaggle-2xT4/exp3_mfu.txt +30 -0
  16. pytscope-0.2.1/docs/validation-runs/README.md +75 -0
  17. pytscope-0.2.1/docs/validation-runs/kaggle_2xT4.ipynb +272 -0
  18. pytscope-0.2.1/examples/auto.py +43 -0
  19. pytscope-0.2.1/examples/cross_signal.py +48 -0
  20. pytscope-0.2.1/examples/ddp_gloo.py +126 -0
  21. pytscope-0.2.1/examples/efficiency_mfu.py +66 -0
  22. pytscope-0.2.1/examples/exposed_comm.py +61 -0
  23. pytscope-0.2.1/examples/manual_loop.py +34 -0
  24. pytscope-0.2.1/examples/pytorch_real.py +102 -0
  25. pytscope-0.2.1/pyproject.toml +92 -0
  26. pytscope-0.2.1/pytscope/__init__.py +57 -0
  27. pytscope-0.2.1/pytscope/analyzers/__init__.py +15 -0
  28. pytscope-0.2.1/pytscope/analyzers/convergence.py +77 -0
  29. pytscope-0.2.1/pytscope/analyzers/distributed.py +230 -0
  30. pytscope-0.2.1/pytscope/analyzers/efficiency.py +149 -0
  31. pytscope-0.2.1/pytscope/analyzers/memory.py +58 -0
  32. pytscope-0.2.1/pytscope/analyzers/pipeline.py +149 -0
  33. pytscope-0.2.1/pytscope/analyzers/repro.py +205 -0
  34. pytscope-0.2.1/pytscope/analyzers/stats.py +110 -0
  35. pytscope-0.2.1/pytscope/analyzers/timing.py +109 -0
  36. pytscope-0.2.1/pytscope/analyzers/trace.py +226 -0
  37. pytscope-0.2.1/pytscope/auto.py +290 -0
  38. pytscope-0.2.1/pytscope/cli.py +203 -0
  39. pytscope-0.2.1/pytscope/collectors/__init__.py +3 -0
  40. pytscope-0.2.1/pytscope/collectors/memory.py +40 -0
  41. pytscope-0.2.1/pytscope/core/__init__.py +27 -0
  42. pytscope-0.2.1/pytscope/core/distributed.py +49 -0
  43. pytscope-0.2.1/pytscope/core/events.py +76 -0
  44. pytscope-0.2.1/pytscope/core/provenance.py +47 -0
  45. pytscope-0.2.1/pytscope/core/store.py +101 -0
  46. pytscope-0.2.1/pytscope/diagnosis/__init__.py +12 -0
  47. pytscope-0.2.1/pytscope/diagnosis/engine.py +70 -0
  48. pytscope-0.2.1/pytscope/diagnosis/rules.py +129 -0
  49. pytscope-0.2.1/pytscope/diagnosis/rules_convergence.py +54 -0
  50. pytscope-0.2.1/pytscope/diagnosis/rules_cross.py +116 -0
  51. pytscope-0.2.1/pytscope/diagnosis/rules_distributed.py +184 -0
  52. pytscope-0.2.1/pytscope/diagnosis/rules_efficiency.py +65 -0
  53. pytscope-0.2.1/pytscope/diagnosis/rules_memory.py +71 -0
  54. pytscope-0.2.1/pytscope/hardware.py +93 -0
  55. pytscope-0.2.1/pytscope/integrations/__init__.py +7 -0
  56. pytscope-0.2.1/pytscope/integrations/huggingface.py +45 -0
  57. pytscope-0.2.1/pytscope/integrations/lightning.py +77 -0
  58. pytscope-0.2.1/pytscope/profiler.py +252 -0
  59. pytscope-0.2.1/pytscope/py.typed +0 -0
  60. pytscope-0.2.1/pytscope/report/__init__.py +3 -0
  61. pytscope-0.2.1/pytscope/report/cli_report.py +394 -0
  62. pytscope-0.2.1/tests/test_auto.py +148 -0
  63. pytscope-0.2.1/tests/test_cli_report.py +79 -0
  64. pytscope-0.2.1/tests/test_diagnosis.py +33 -0
  65. pytscope-0.2.1/tests/test_distributed.py +118 -0
  66. pytscope-0.2.1/tests/test_distributed_gloo.py +78 -0
  67. pytscope-0.2.1/tests/test_edge_cases.py +101 -0
  68. pytscope-0.2.1/tests/test_efficiency.py +116 -0
  69. pytscope-0.2.1/tests/test_overhead.py +76 -0
  70. pytscope-0.2.1/tests/test_pipeline.py +58 -0
  71. pytscope-0.2.1/tests/test_precision.py +110 -0
  72. pytscope-0.2.1/tests/test_profiler.py +77 -0
  73. pytscope-0.2.1/tests/test_repro.py +73 -0
  74. pytscope-0.2.1/tests/test_scenarios.py +147 -0
  75. pytscope-0.2.1/tests/test_store.py +22 -0
  76. pytscope-0.2.1/tests/test_timing.py +33 -0
  77. pytscope-0.2.1/tests/test_trace.py +156 -0
  78. pytscope-0.2.1/tests/test_verticals.py +120 -0
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ .eggs/
5
+ build/
6
+ dist/
7
+ .pytest_cache/
8
+ .coverage
9
+ htmlcov/
10
+ .venv/
11
+ venv/
12
+ runs/
13
+ .DS_Store
@@ -0,0 +1,162 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
5
+ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [Unreleased]
8
+
9
+ ## [0.2.1] - 2026-06-08
10
+
11
+ ### Changed
12
+ - **Project renamed from `trainscope` to `pytscope`.** Both `trainscope` and
13
+ `tscope` were already taken on PyPI; `pytscope` was available. The import
14
+ path, CLI command, PyPI distribution name, and GitHub repository
15
+ (`Sumu004/pytscope`) are now all `pytscope`. No functional changes —
16
+ this release exists solely to publish under the new name (the v0.2.0 tag
17
+ was cut from the pre-rename tree, so its build still carried the old name).
18
+
19
+ ## [0.2.0] - 2026-06-08
20
+
21
+ ### Added
22
+ - **Amber-LED "hardware panel" terminal reports.** `pytscope analyze`/`diff`
23
+ now render every section as a lit panel — a colored `●` indicator (red /
24
+ amber / green by what it's reporting: a stalling step-time breakdown, a
25
+ named straggler, a low MFU, …) in front of each heading — plus gradient
26
+ meter bars rendered as lit/unlit block segments (`█`/`░`), severity-coded
27
+ findings, and unicode sparklines for step-time and loss trends. The whole
28
+ grammar (LED color ↔ severity ↔ bar gradient) is consistent across every
29
+ vertical, and the run summary is condensed onto one heading line (steps ·
30
+ ms/step · throughput · median/p95/CV) with the old double-blank-line
31
+ padding between sections removed — same information, faster to scan.
32
+ Pure ANSI escapes, no new deps: auto-detects whether the terminal can
33
+ render color (honoring `NO_COLOR`/`FORCE_COLOR` and the new `--color
34
+ {auto,always,never}` flag), and degrades to byte-identical plain text when
35
+ piped to a file, redirected in CI, or explicitly disabled — nothing is
36
+ ever written to disk beyond the run itself (`tests/test_cli_report.py`).
37
+
38
+ ### Validated
39
+ - **Real multi-GPU NCCL run (Kaggle, 2× T4, 2026-06-08)** — closes two of the
40
+ three "needs hardware" gaps in `docs/VALIDATION.md`:
41
+ - **Straggler attribution: exact pass.** `DIST.STRAGGLER` named the injected
42
+ rank correctly (z=14.1, 100% critical-path share vs 50% expected by chance,
43
+ 27.5% wall lost to imbalance).
44
+ - **Exposed communication: directionally confirmed**, plus a genuine finding
45
+ about the hardware itself — on PCIe-only GPU pairs (no NVLink) the
46
+ all-reduce is link-bandwidth-bound, so absolute exposed-comm time stays
47
+ roughly constant across batch sizes (`DIST.EXPOSED_COMM` correctly fired
48
+ HIGH for both the small- and large-batch configs: 72% vs 62% exposed,
49
+ overlap improving in the predicted direction). Documented as an
50
+ interconnect-topology caveat in `docs/VALIDATION.md`.
51
+ - Full report, raw console captures, and analysis:
52
+ [`docs/validation-runs/2026-06-08-kaggle-2xT4/RESULTS.md`](docs/validation-runs/2026-06-08-kaggle-2xT4/RESULTS.md).
53
+
54
+ ### Fixed
55
+ - **`examples/efficiency_mfu.py` never selected CUDA** — its device probe only
56
+ checked `mps`/`cpu`, so on a CUDA box it silently profiled the CPU and
57
+ reported a meaningless ~0% MFU anchored against an A100 peak it never
58
+ touched (caught by the Kaggle validation run above). Now checks `cuda` first
59
+ and, when on CUDA, leaves `peak_flops` unset so `AutoProfiler` looks the
60
+ actual device up in the hardware peak table instead of hard-coding an
61
+ A100 anchor.
62
+
63
+ ### Added
64
+ - **`docs/validation-runs/`** — a ready-to-run Kaggle notebook
65
+ (`kaggle_2xT4.ipynb`) plus step-by-step instructions that execute the full
66
+ multi-GPU validation protocol from `docs/VALIDATION.md` (straggler
67
+ attribution, exposed-comm overlap, MFU sanity) on **real NCCL/CUDA, for
68
+ free**, using Kaggle's 2× T4 notebook tier — no paid GPU rental needed.
69
+
70
+ ### Fixed
71
+ - **CLI crashed on Windows** (`UnicodeEncodeError: 'charmap' codec can't encode
72
+ character 'Δ'`) whenever a report containing `Δ`/`—`/`•` was printed
73
+ with the console in its default `cp1252` encoding — this is what every CI
74
+ run on `windows-latest` was hitting (`pytscope diff`'s metrics table header
75
+ uses `Δ`). `pytscope.cli.main` now re-points stdout/stderr at a UTF-8
76
+ encoder with `errors="replace"` on entry; a no-op on platforms already UTF-8.
77
+
78
+ ### Changed
79
+ - **AutoProfiler is now correct on real models.** A forward re-entrancy guard
80
+ means gradient accumulation records one step per `optimizer.step` (not per
81
+ micro-batch), and activation checkpointing (forward recomputed during backward)
82
+ no longer corrupts the step structure. Tested with both.
83
+ - Added `docs/VALIDATION.md` (what's validated where + a multi-GPU protocol) and
84
+ made `examples/ddp_gloo.py` run under `torchrun` on NCCL/CUDA as well as CPU
85
+ gloo, so the GPU validation is executable.
86
+
87
+ ### Added
88
+ - **Training Efficiency Budget (MFU)** — a single accounting identity that
89
+ decomposes attributed wall time into named line items (useful compute /
90
+ compute overhead / data stall / communication / other) that sum **exactly** to
91
+ wall, anchored at the top by Model FLOPs Utilization. FLOPs are counted
92
+ automatically (`AutoProfiler(measure_flops=True)` via torch FlopCounterMode);
93
+ peak comes from a built-in GPU table (`hardware.peak_flops_for`) or
94
+ `--peak-tflops`. Recoverable line items are ranked by payoff; `EFFICIENCY.LOW_MFU`
95
+ (anchored) / `EFFICIENCY.RECOVERABLE` (no anchor) rules point at the biggest win.
96
+ CLI: `analyze --flops-per-step --peak-tflops`.
97
+ - **Exposed-communication analysis** — ingest a `torch.profiler`/Kineto trace
98
+ (`analyze --trace`, or auto-detected `trace.json[.gz]` in the run dir) and
99
+ compute, via exact interval arithmetic, how much collective communication
100
+ overlaps compute vs is *exposed* (on the critical path). Reports overlap
101
+ efficiency and a per-collective breakdown; new `DIST.EXPOSED_COMM` rule. The
102
+ overlap math is exact (tested on synthetic traces with known answers); the
103
+ parser is validated against a real `torch.profiler` export.
104
+ - **Automatic instrumentation** — `AutoProfiler(run_dir, model, optimizer)`
105
+ captures the full phase timeline (data / forward / backward / optimizer, plus
106
+ synchronous `comm`) with **zero changes to the training loop**, via PyTorch
107
+ forward hooks + an `optimizer.step` wrapper + collective patching. The step is
108
+ held open for post-step `log(loss=…)`. All hooks/patches are restored on
109
+ `finish()`. Initially assumed one forward/backward per step; gradient
110
+ accumulation is now handled too (see the AutoProfiler entry under "Changed" above).
111
+ - **Distributed vertical (headline)** — multi-rank critical-path analysis for
112
+ data-parallel training. `Profiler(distributed=True)` records every rank to
113
+ `run_dir/rank{k}/`; the analyzer aligns ranks on one timeline and computes
114
+ critical-path wall loss, communication fraction, sync skew, and load imbalance.
115
+ - **Statistical straggler detection** — identifies a *persistent* straggler rank
116
+ via a binomial-persistence test (is one rank consistently the critical path,
117
+ beyond chance?), not a fixed threshold. Rules: `DIST.STRAGGLER`,
118
+ `DIST.LOAD_IMBALANCE`, `DIST.COMM_BOUND`.
119
+ - **Pipeline-bubble analyzer** — measures achieved bubble from a per-stage
120
+ schedule and compares it to the inherent GPipe minimum `(p-1)/(m+p-1)`, so it
121
+ flags only *excess* bubble (`DIST.PIPELINE_BUBBLE`). Reproduces the closed form
122
+ exactly (tested across p, m).
123
+ - **`comm()` context manager** to attribute collective time to a `comm` phase.
124
+ - **Real `examples/ddp_gloo.py`** — runs genuine multi-process gloo DDP (CPU, no
125
+ GPU needed) with an injectable straggler; `pytscope analyze` then identifies
126
+ it. Backed by a real multi-process integration test.
127
+ - CLI `analyze` auto-detects multi-rank run directories.
128
+
129
+ ### Changed
130
+ - Packaging: version is now single-sourced from `pytscope.__version__` (Hatch
131
+ dynamic version); `docs/` added to the sdist.
132
+
133
+ ## [0.1.0] - 2026-06-06
134
+
135
+ First public beta. One telemetry backbone feeding four analysis verticals plus a
136
+ cross-signal diagnosis engine.
137
+
138
+ ### Added
139
+ - **Profiler** — live, integer-nanosecond timing core with `step()` / `mark()`
140
+ primitives, `iter_data()` dataloader timing, scalar logging, and optional
141
+ device-memory capture. ~3 µs/step overhead.
142
+ - **Integrations** — one-line PyTorch Lightning and Hugging Face `Trainer`
143
+ callbacks; DDP rank-aware (non-zero ranks no-op by default).
144
+ - **Timing vertical** — per-step attribution to data / forward / backward /
145
+ optimizer with median/p95 and rules for dataloader-bound, backward-heavy,
146
+ optimizer-heavy, and jitter.
147
+ - **Memory vertical** — CUDA + Apple MPS capture; fragmentation and
148
+ leak/growth detection.
149
+ - **Convergence vertical** — loss trend, divergence (NaN/Inf), and robust
150
+ local-window spike detection for loss and grad-norm.
151
+ - **Cross-signal rule** — correlates spikes across loss / grad-norm / step-time /
152
+ memory on one aligned timeline (the headline diagnostic).
153
+ - **Reproducibility vertical** — `pytscope diff A B` compares provenance,
154
+ config, and outcomes; diagnoses nondeterminism and finds the first divergence
155
+ step.
156
+ - **CLI** — `pytscope analyze` and `pytscope diff`.
157
+ - Pure-stdlib core; CUDA/MPS/CPU examples; 58 tests.
158
+
159
+ [Unreleased]: https://github.com/Sumu004/pytscope/compare/v0.2.1...HEAD
160
+ [0.2.1]: https://github.com/Sumu004/pytscope/releases/tag/v0.2.1
161
+ [0.2.0]: https://github.com/Sumu004/pytscope/releases/tag/v0.2.0
162
+ [0.1.0]: https://github.com/Sumu004/pytscope/releases/tag/v0.1.0
pytscope-0.2.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sumukh Chaluvaraju
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,367 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytscope
3
+ Version: 0.2.1
4
+ Summary: An intelligence layer for ML training: live profiling + post-training diagnosis.
5
+ Project-URL: Homepage, https://github.com/Sumu004/pytscope
6
+ Project-URL: Repository, https://github.com/Sumu004/pytscope
7
+ Project-URL: Documentation, https://github.com/Sumu004/pytscope/tree/main/docs
8
+ Project-URL: Issues, https://github.com/Sumu004/pytscope/issues
9
+ Project-URL: Changelog, https://github.com/Sumu004/pytscope/blob/main/CHANGELOG.md
10
+ Author-email: Sumukh Chaluvaraju <sumukhchaluvaraj@gmail.com>
11
+ Maintainer-email: Sumukh Chaluvaraju <sumukhchaluvaraj@gmail.com>
12
+ License: MIT
13
+ License-File: LICENSE
14
+ Keywords: deep-learning,machine-learning,mlops,observability,performance,profiling,pytorch,training
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Science/Research
18
+ Classifier: License :: OSI Approved :: MIT License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Programming Language :: Python :: 3.13
26
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
27
+ Classifier: Topic :: System :: Monitoring
28
+ Classifier: Typing :: Typed
29
+ Requires-Python: >=3.9
30
+ Provides-Extra: dev
31
+ Requires-Dist: build; extra == 'dev'
32
+ Requires-Dist: pytest-cov; extra == 'dev'
33
+ Requires-Dist: pytest>=7; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Requires-Dist: twine; extra == 'dev'
36
+ Provides-Extra: huggingface
37
+ Requires-Dist: transformers>=4.30; extra == 'huggingface'
38
+ Provides-Extra: lightning
39
+ Requires-Dist: lightning>=2.0; extra == 'lightning'
40
+ Provides-Extra: torch
41
+ Requires-Dist: torch>=2.0; extra == 'torch'
42
+ Description-Content-Type: text/markdown
43
+
44
+ <div align="center">
45
+
46
+ # pytscope
47
+
48
+ **An intelligence layer for ML training — go beyond collecting metrics to *explaining* them.**
49
+
50
+ [![CI](https://github.com/Sumu004/pytscope/actions/workflows/ci.yml/badge.svg)](https://github.com/Sumu004/pytscope/actions/workflows/ci.yml)
51
+ [![PyPI version](https://img.shields.io/pypi/v/pytscope.svg)](https://pypi.org/project/pytscope/)
52
+ [![Python versions](https://img.shields.io/pypi/pyversions/pytscope.svg)](https://pypi.org/project/pytscope/)
53
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
54
+
55
+ [Quickstart](#quickstart) · [Why it's different](#why-its-different) ·
56
+ [Demos](#try-the-demos) · [Validation](#status--validation) · [Docs](#documentation)
57
+
58
+ </div>
59
+
60
+ Standard profilers hand you a 50 MB trace and leave the "so what do I change?"
61
+ to you. `pytscope` captures **timing, memory, convergence signals, and
62
+ provenance on one aligned per-step timeline**, then runs a diagnosis engine that
63
+ turns the raw numbers into ranked, actionable findings.
64
+
65
+ ```
66
+ ● TIMING — 95 steps · 23.1 ms/step · 43.3 steps/s (median 22.0 · p95 28.4 ms · CV 0.18)
67
+ step time ▃▄▅▃▂▃▄▆▃▂▃▄▅▃▂ (low→high)
68
+ data ████████████████░░░░░░░░░░░░░░ 52.0% 12.01 ms
69
+ forward █████████░░░░░░░░░░░░░░░░░░░░░ 17.3% 4.00 ms
70
+ backward ██████████████░░░░░░░░░░░░░░░░ 26.0% 6.01 ms
71
+ optimizer █░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ 4.3% 1.00 ms
72
+
73
+ ● FINDINGS (1)
74
+ [HIGH] Input pipeline is a bottleneck (TIMING.DATALOADER_BOUND)
75
+ 52% of step time is spent fetching data (12.0 ms/step). The
76
+ accelerator is stalling on the dataloader.
77
+ -> Raise DataLoader num_workers, set persistent_workers=True and
78
+ pin_memory=True, prefetch, or move heavy transforms off the hot path.
79
+ ```
80
+
81
+ In a real terminal, each `●` is a lit indicator that's colored red, amber, or
82
+ green by what it's reporting — the same "hardware panel" grammar carried
83
+ through every section, gradient meter bar, and severity tag.
84
+
85
+ ## The headline: a Training Efficiency Budget
86
+
87
+ Most profilers hand you a list of findings. pytscope also gives you a single
88
+ **accounting identity** — every second of training, decomposed into named line
89
+ items that provably sum to your measured wall time, anchored to hardware peak
90
+ (**MFU**, Model FLOPs Utilization):
91
+
92
+ ```
93
+ ● EFFICIENCY BUDGET — wall-time decomposition
94
+ MFU 38.0% · useful compute 38.0% of 142.00s wall
95
+ useful_compute ███████████░░░░░░░░░░░░░░░░░░ 38.0% 53.96s
96
+ compute_overhead █████░░░░░░░░░░░░░░░░░░░░░░░░ 16.0% 22.72s (recoverable)
97
+ data_stall ████████░░░░░░░░░░░░░░░░░░░░░ 27.0% 38.34s (recoverable)
98
+ communication █████░░░░░░░░░░░░░░░░░░░░░░░░ 19.0% 26.98s (recoverable)
99
+
100
+ [HIGH] MFU is 38% — 62% of wall is recoverable (EFFICIENCY.LOW_MFU)
101
+ Biggest recoverable line: data_stall at 27% of wall.
102
+ -> Start with data_stall: raise num_workers, persistent_workers, prefetch.
103
+ ```
104
+
105
+ Because the phase timeline partitions each step, the decomposition is **exact** —
106
+ the line items sum to wall with no fudge factor, which makes the model
107
+ falsifiable. And every recoverable line is *seconds you can win back*, so fixes
108
+ rank themselves by payoff. FLOPs are counted automatically
109
+ (`AutoProfiler(measure_flops=True)`); peak comes from a built-in GPU table or
110
+ `--peak-tflops`.
111
+
112
+ ```bash
113
+ python examples/efficiency_mfu.py && pytscope analyze runs/mfu
114
+ ```
115
+
116
+ ## Why it's different
117
+
118
+ One backbone, six lenses. Every analyzer reads the same `StepRecord` timeline,
119
+ so findings can **cross-correlate signals no single existing tool aligns**:
120
+
121
+ | Vertical | Status |
122
+ |----------|--------|
123
+ | **Distributed** — multi-rank critical-path, straggler & comm/pipeline-bubble analysis | ✅ |
124
+ | **Timing** — attribute step time to data / fwd / bwd / optimizer | ✅ |
125
+ | **Convergence** — loss/grad-norm trend, divergence, spikes | ✅ |
126
+ | **Memory** — peak attribution, fragmentation, leak/growth | ✅ |
127
+ | **Cross-signal** — correlate spikes across all axes on one timeline | ✅ |
128
+ | **Reproducibility** — provenance capture + run-vs-run diff & drift diagnosis | ✅ |
129
+
130
+ The core is **pure-stdlib** — no heavy deps to profile your training.
131
+
132
+ ### The headline: a finding no single-axis tool can make
133
+
134
+ ```
135
+ ● FINDINGS (1)
136
+ [HIGH] Correlated instability at steps 70–72 (CROSS.CORRELATED_INSTABILITY)
137
+ At steps 70–72, 3 independent axes spike simultaneously (grad_norm,
138
+ loss, step_time): loss=3.579, grad_norm=45, step_time=25.6ms.
139
+ Co-occurrence across axes is strong evidence of a real optimization
140
+ event, not noise.
141
+ -> Inspect the LR schedule, gradient clipping, and the batch around
142
+ steps 70–72. A simultaneous loss + grad-norm spike usually means
143
+ the update blew up (LR too high / bad batch).
144
+ ```
145
+
146
+ HTA sees only timing; Cockpit only gradients; W&B only logged scalars. pytscope
147
+ sees them **on one clock** and reports the correlation. Reproduce it with
148
+ `python examples/cross_signal.py && pytscope analyze runs/cross`.
149
+
150
+ ### Distributed: the straggler no single-rank profiler can name
151
+
152
+ In synchronous data-parallel training every rank waits at the gradient
153
+ all-reduce for the **slowest** rank. That idle time is pure waste, and it's
154
+ invisible to any single-rank profiler — you only see it by putting all ranks on
155
+ one timeline. pytscope does, and uses a **statistical persistence test** (not a
156
+ threshold) to tell a genuine bad node from noise:
157
+
158
+ ```
159
+ ● DISTRIBUTED — 4 ranks, 60 aligned steps
160
+ wall lost to imbalance 18.6% · median sync skew 4.7 ms/step
161
+ rank 0: 10.0 ms · 0% (z=-4.5)
162
+ rank 2: 12.0 ms · 99% (z=+13.4) <- straggler
163
+
164
+ ● FINDINGS (1)
165
+ [HIGH] Rank 2 is a persistent straggler (DIST.STRAGGLER)
166
+ Rank 2 is the slowest (critical-path) rank in 99% of steps across 4
167
+ ranks (expected 25% by chance; z=13.4) and runs 20% slower than the
168
+ median rank. Synchronous all-reduce makes every other rank wait for it
169
+ — 18.6% of wall time is lost to this imbalance.
170
+ -> Investigate rank 2's device/host: thermal throttling, a slower GPU,
171
+ NUMA placement, or an unbalanced data shard.
172
+ ```
173
+
174
+ This is a **real** distributed system — reproduce it on your laptop (CPU, no GPU)
175
+ with genuine multi-process gloo all-reduce:
176
+
177
+ ```bash
178
+ pip install -e ".[torch]"
179
+ python examples/ddp_gloo.py --ranks 4 --straggler-rank 2
180
+ pytscope analyze runs/ddp_gloo
181
+ ```
182
+
183
+ For **pipeline parallelism**, pytscope measures the achieved bubble and compares
184
+ it to the inherent GPipe minimum `(p-1)/(m+p-1)`, so it flags only the *excess*
185
+ bubble you can actually fix — not the bubble that's just the cost of your `p`
186
+ and `m`.
187
+
188
+ ### Exposed communication: the metric that decides large-scale efficiency
189
+
190
+ Gradient all-reduce *can* run concurrently with backward compute — the part that
191
+ overlaps is free, the part that doesn't is **exposed** and sits on the critical
192
+ path. pytscope ingests a `torch.profiler`/Kineto trace and computes the split
193
+ exactly (interval arithmetic over the kernel timeline):
194
+
195
+ ```
196
+ ● COMMUNICATION OVERLAP — from kernel trace
197
+ comm 36.0 ms · overlapped 67% · exposed 12.0 ms (20% of wall)
198
+
199
+ [HIGH] Communication is not overlapped with compute (DIST.EXPOSED_COMM)
200
+ 20% of wall time is exposed communication. Only 67% of the 36.0 ms of
201
+ communication is hidden behind compute.
202
+ -> DDP gradient bucketing (bucket_cap_mb), overlap optimizer/all-reduce,
203
+ or increase per-GPU compute so backward hides the all-reduce.
204
+ ```
205
+
206
+ ```bash
207
+ pytscope analyze runs/job --trace trace.json # from torch.profiler
208
+ python examples/exposed_comm.py && pytscope analyze runs/trace_demo # no GPU
209
+ ```
210
+
211
+ ---
212
+
213
+ ## Overhead
214
+
215
+ Measured by `tests/test_overhead.py` (run `pytest -s`):
216
+
217
+ | Path | Cost |
218
+ |------|------|
219
+ | Pure instrumentation (begin/mark×3/end) | **~0.7 µs/step** |
220
+ | End-to-end incl. JSONL disk write | **~3 µs/step** |
221
+ | Disabled DDP rank (no-op) | **~0.06 µs/step** |
222
+
223
+ On a 50 ms training step that's **~0.006% overhead** — versus trace-dumping
224
+ profilers (Kineto/HTA) that add real overhead and emit multi-MB artifacts.
225
+ Memory bounded (live writer retains nothing); batched flushes; DDP-safe.
226
+
227
+ ## Install
228
+
229
+ ```bash
230
+ pip install -e ".[dev]" # core + tests
231
+ pip install -e ".[torch,lightning,huggingface]" # framework integrations
232
+ ```
233
+
234
+ ## Quickstart
235
+
236
+ **Automatic — zero changes to your loop (recommended):**
237
+
238
+ ```python
239
+ from pytscope.auto import AutoProfiler
240
+
241
+ prof = AutoProfiler("runs/exp1", model, optimizer, warmup=10)
242
+ prof.start()
243
+ for x, y in loader: # <- your loop, untouched
244
+ loss = loss_fn(model(x), y)
245
+ loss.backward()
246
+ optimizer.step(); optimizer.zero_grad()
247
+ prof.finish()
248
+ ```
249
+
250
+ `AutoProfiler` registers PyTorch hooks (forward, `optimizer.step`, and
251
+ synchronous collectives) to attribute **data / forward / backward / optimizer /
252
+ comm** automatically — no `mark()` calls anywhere in your training code. All
253
+ hooks/patches are removed on `finish()`.
254
+
255
+ **Manual loop** (full control, or gradient accumulation):
256
+
257
+ ```python
258
+ from pytscope import Profiler
259
+
260
+ prof = Profiler("runs/exp1", warmup=10)
261
+ prof.start()
262
+ for batch in prof.iter_data(loader): # times data fetch
263
+ with prof.step():
264
+ loss = loss_fn(model(batch))
265
+ prof.mark("forward")
266
+ loss.backward(); prof.mark("backward")
267
+ opt.step(); opt.zero_grad(); prof.mark("optimizer")
268
+ prof.finish()
269
+ ```
270
+
271
+ **Lightning (one line):**
272
+
273
+ ```python
274
+ from pytscope.integrations.lightning import PytscopeCallback
275
+ trainer = pl.Trainer(callbacks=[PytscopeCallback("runs/exp1")])
276
+ ```
277
+
278
+ **Hugging Face (one line):**
279
+
280
+ ```python
281
+ from pytscope.integrations.huggingface import PytscopeCallback
282
+ trainer = Trainer(..., callbacks=[PytscopeCallback("runs/exp1")])
283
+ ```
284
+
285
+ **Then analyze, or compare two runs:**
286
+
287
+ ```bash
288
+ pytscope analyze runs/exp1
289
+ pytscope diff runs/exp1 runs/exp2 # reproducibility / drift: why do they differ?
290
+ ```
291
+
292
+ Reports lean into a compact, amber-LED hardware-panel aesthetic — every
293
+ section is a "lit panel" (a colored ● indicator that reads red/amber/green by
294
+ severity), with gradient meter bars, severity-coded findings, and
295
+ loss/step-time sparklines. They auto-colorize in a real terminal and degrade
296
+ to byte-identical plain text when piped, in CI, or under
297
+ `NO_COLOR`/`--color=never` — never garbled, either way, and nothing written
298
+ to disk besides the run itself.
299
+
300
+ ## Try the demos
301
+
302
+ No ML deps:
303
+
304
+ ```bash
305
+ python examples/manual_loop.py && pytscope analyze runs/demo # timing
306
+ python examples/cross_signal.py && pytscope analyze runs/cross # cross-signal
307
+ ```
308
+
309
+ Real PyTorch (CUDA / Apple MPS / CPU, auto-detected), with real device timing
310
+ and memory:
311
+
312
+ ```bash
313
+ pip install -e ".[torch]"
314
+ python examples/pytorch_real.py && pytscope analyze runs/pytorch # healthy
315
+ python examples/pytorch_real.py --leak && pytscope analyze runs/pytorch # catches the leak
316
+ ```
317
+
318
+ The `--leak` run reports `MEMORY.GROWTH [HIGH]` from genuinely captured device
319
+ memory. (Memory attribution is most accurate on CUDA, which exposes true in-step
320
+ peaks; on MPS we sample resident memory at the step boundary.)
321
+
322
+ ---
323
+
324
+ ## Architecture
325
+
326
+ ```
327
+ training loop → collectors → RunStore (aligned timeline)
328
+
329
+ analyzers (timing | memory | convergence | repro | distributed | efficiency)
330
+
331
+ diagnosis engine (ranked, cross-signal findings)
332
+
333
+ reporters (CLI — amber-LED hardware-panel terminal report)
334
+ ```
335
+
336
+ Adding a heuristic is one decorated function (`@rule`); adding a vertical is one
337
+ analyzer over the existing timeline.
338
+
339
+ ## Status & validation
340
+
341
+ **v0.2, validated on real multi-GPU NCCL hardware** — straggler attribution and
342
+ exposed-comm now have a clean run on 2× T4 (Kaggle, free tier, no paid rental):
343
+ an exact pass on straggler detection (`z=14.1`, named the injected rank
344
+ correctly) and a directionally-correct exposed-comm read that also surfaced a
345
+ genuine finding about PCIe-only interconnects. MFU-on-GPU is the last gap —
346
+ unblocked (a demo bug found and fixed) with a rerun pending.
347
+ [Full report →](docs/validation-runs/2026-06-08-kaggle-2xT4/RESULTS.md) ·
348
+ [Validation matrix & protocol →](docs/VALIDATION.md)
349
+
350
+ DDP is first-class; FSDP, tensor, and pipeline parallelism are not first-class yet.
351
+
352
+ ## Documentation
353
+
354
+ - [Usage guide](docs/usage.md) — install, instrument, and the CLI.
355
+ - [Architecture](docs/architecture.md) — the one-timeline design.
356
+ - [Diagnostics reference](docs/diagnostics.md) — every finding and its fix.
357
+ - [Validation](docs/VALIDATION.md) — what's proven, and the multi-GPU protocol.
358
+
359
+ ## Contributing
360
+
361
+ Contributions are welcome — adding a diagnosis rule is the most approachable
362
+ first PR. See [CONTRIBUTING.md](CONTRIBUTING.md) and the
363
+ [Code of Conduct](CODE_OF_CONDUCT.md). Releases follow [RELEASING.md](RELEASING.md).
364
+
365
+ ## License
366
+
367
+ [MIT](LICENSE) © 2026 Sumukh Chaluvaraju