PyPI - glasstrace - Versions diffs - 0.2.0__py3-none-any.whl - Mend

glasstrace 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

glasstrace/__init__.py +7 -0
glasstrace/hooks.py +190 -0
glasstrace/profiler.py +61 -0
glasstrace/report.py +133 -0
glasstrace-0.2.0.dist-info/METADATA +77 -0
glasstrace-0.2.0.dist-info/RECORD +8 -0
glasstrace-0.2.0.dist-info/WHEEL +4 -0
glasstrace-0.2.0.dist-info/licenses/LICENSE +21 -0

glasstrace/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""glasstrace — per-layer profiler for transformer inference."""
+from glasstrace.profiler import ProfileResult, profile
+__version__ = "0.2.0"
+__all__ = ["__version__", "profile", "ProfileResult"]

glasstrace/hooks.py ADDED Viewed

@@ -0,0 +1,190 @@
+"""Forward hooks that record per-module timing and shape info during inference."""
+from __future__ import annotations
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+import torch
+import torch.nn as nn
+class Phase(str, Enum):
+    """Inference phase assigned to a recorded forward pass.
+    Members also subclass ``str``, so each compares equal to its plain
+    string value (e.g. ``Phase.DECODE == "decode"``)."""
+    # Input had a sequence dimension greater than 1.
+    PREFILL = "prefill"
+    # Input had a sequence dimension of exactly 1.
+    DECODE = "decode"
+    # No usable input shape was available to classify the pass.
+    UNKNOWN = "unknown"
+@dataclass
+class ModuleEvent:
+    """A single recorded forward pass through one module.
+    Instances are created by the tracer's post-forward hook, one per
+    completed call of a hooked module."""
+    module_path: str           # e.g. "model.layers.0.self_attn.q_proj"
+    module_type: str           # class name of the module, e.g. "Linear"
+    input_shape: tuple | None  # shape of the first tensor input, if any
+    output_shape: tuple | None # shape of the output tensor, if any
+    duration_ms: float         # how long the forward pass took, in milliseconds
+    device: str                # torch device type, e.g. "cuda", "mps", or "cpu"
+    phase: Phase = Phase.UNKNOWN  # prefill/decode label inferred from input_shape
+@dataclass
+class ModuleTracer:
+    """Registers forward hooks on a model and collects per-module timing.
+    Uses CUDA events for accurate GPU timing when the traced input is on a
+    CUDA device, and falls back to wall-clock time otherwise. CPU/MPS
+    wall-clock timing is approximate but fine for development.
+    Call attach() to start recording and detach() to remove every hook;
+    results accumulate in ``events`` and ``memory_samples``."""
+    # Module classes that receive timing hooks.
+    target_types: tuple[type, ...] = (nn.Linear, nn.LayerNorm)
+    # One ModuleEvent per completed forward pass of a hooked module.
+    events: list[ModuleEvent] = field(default_factory=list)
+    # Dicts with "pass", "phase" and "memory_bytes" keys; only recorded
+    # when torch.cuda.is_available() at attach time.
+    memory_samples: list[dict] = field(default_factory=list)
+    # Hook handles, kept so detach() can remove them.
+    _handles: list[Any] = field(default_factory=list)
+    # Timing state of forwards that have started but not yet finished,
+    # keyed by id(module).
+    _pending: dict[int, dict[str, Any]] = field(default_factory=dict)
+    # Number of memory samples taken so far; stored as each sample's "pass".
+    _pass_count: int = 0
+    def attach(self, model: nn.Module) -> None:
+        """Walk the model and register hooks on every module of a target type.
+        Each matching module gets a pre-forward hook (starts the timer) and
+        a forward hook (stops it and appends a ModuleEvent). The memory
+        sampler is attached afterwards.
+        """
+        for name, module in model.named_modules():
+            if not isinstance(module, self.target_types):
+                continue
+            type_name = type(module).__name__
+            self._handles.append(
+                module.register_forward_pre_hook(self._make_pre_hook(name, type_name))
+            )
+            self._handles.append(
+                module.register_forward_hook(self._make_post_hook(name, type_name))
+            )
+        self._attach_memory_sampler(model)
+    def _attach_memory_sampler(self, model: nn.Module) -> None:
+        """Sample allocated CUDA memory each time the first nn.Linear runs.
+        Does nothing when CUDA is unavailable or the model contains no
+        nn.Linear. Only the first nn.Linear found is hooked, so one sample
+        is appended per call of that module.
+        """
+        if not torch.cuda.is_available():
+            return
+        first_linear = next(
+            (m for m in model.modules() if isinstance(m, nn.Linear)), None
+        )
+        if first_linear is None:
+            return
+        def memory_hook(mod: nn.Module, inputs: tuple) -> None:
+            mem_bytes = torch.cuda.memory_allocated()
+            phase = self._detect_phase(
+                self._shape_of(inputs[0]) if inputs else None
+            )
+            self.memory_samples.append({
+                "pass": self._pass_count,
+                "phase": phase.value,
+                "memory_bytes": mem_bytes,
+            })
+            self._pass_count += 1
+        self._handles.append(first_linear.register_forward_pre_hook(memory_hook))
+    def detach(self) -> None:
+        """Remove all registered hooks and drop any unfinished timings."""
+        for handle in self._handles:
+            handle.remove()
+        self._handles.clear()
+        self._pending.clear()
+    def _make_pre_hook(self, module_path: str, module_type: str):
+        """Build the pre-forward hook that starts timing for one module."""
+        def pre_hook(module: nn.Module, inputs: tuple) -> None:
+            device = self._device_of(inputs, module)
+            timing: dict[str, Any] = {
+                "module_path": module_path,
+                "module_type": module_type,
+                "input_shape": self._shape_of(inputs[0]) if inputs else None,
+                "device": device,
+            }
+            if device == "cuda":
+                # Both events are created up front; the end event is
+                # recorded later by the matching post hook.
+                start = torch.cuda.Event(enable_timing=True)
+                end = torch.cuda.Event(enable_timing=True)
+                start.record()
+                timing["cuda_start"] = start
+                timing["cuda_end"] = end
+            else:
+                timing["wall_start"] = time.perf_counter()
+            self._pending[id(module)] = timing
+        return pre_hook
+    def _make_post_hook(self, module_path: str, module_type: str):
+        """Build the forward hook that stops timing and records a ModuleEvent."""
+        def post_hook(module: nn.Module, inputs: tuple, output: Any) -> None:
+            timing = self._pending.pop(id(module), None)
+            if timing is None:
+                # No matching pre hook ran (e.g. hooks were detached mid-call).
+                return
+            output_shape = self._shape_of(output)
+            if timing["device"] == "cuda":
+                end_event = timing["cuda_end"]
+                end_event.record()
+                # Wait only for this event rather than stalling the whole
+                # device; elapsed_time() needs both events to have completed.
+                end_event.synchronize()
+                duration_ms = timing["cuda_start"].elapsed_time(end_event)
+            else:
+                duration_ms = (time.perf_counter() - timing["wall_start"]) * 1000.0
+            self.events.append(
+                ModuleEvent(
+                    module_path=timing["module_path"],
+                    module_type=timing["module_type"],
+                    input_shape=timing["input_shape"],
+                    output_shape=output_shape,
+                    duration_ms=duration_ms,
+                    device=timing["device"],
+                    # Phase is inferred from the input's sequence dimension.
+                    phase=self._detect_phase(timing["input_shape"]),
+                )
+            )
+        return post_hook
+    @staticmethod
+    def _shape_of(x: Any) -> tuple | None:
+        """Return the shape of ``x`` as a tuple.
+        ``x`` may be a tensor or a non-empty list/tuple whose first item is
+        a tensor; anything else yields None.
+        """
+        if isinstance(x, torch.Tensor):
+            return tuple(x.shape)
+        if isinstance(x, (list, tuple)) and len(x) > 0 and isinstance(x[0], torch.Tensor):
+            return tuple(x[0].shape)
+        return None
+    @staticmethod
+    def _device_of(inputs: tuple, module: nn.Module) -> str:
+        """Return the device type string used to pick the timing method.
+        Prefers the first input's device, then the module's first
+        parameter's device, and finally defaults to "cpu".
+        """
+        if inputs and isinstance(inputs[0], torch.Tensor):
+            return inputs[0].device.type
+        first_param = next(module.parameters(), None)
+        if first_param is not None:
+            return first_param.device.type
+        return "cpu"
+    @staticmethod
+    def _detect_phase(input_shape: tuple | None) -> Phase:
+        """Infer prefill vs decode from the sequence dimension of the input.
+        For decoder-only transformers: seq_len > 1 means prefill (processing
+        the full prompt). seq_len == 1 means decode (one new token per pass).
+        Shapes with fewer than three dimensions are reported as UNKNOWN.
+        """
+        # Only inputs shaped at least (batch, seq_len, hidden_dim) carry a
+        # sequence axis at index 1. For a 2-D (batch, hidden_dim) input that
+        # index is the feature size and says nothing about the phase.
+        if input_shape is None or len(input_shape) < 3:
+            return Phase.UNKNOWN
+        seq_len = input_shape[1]
+        if seq_len == 1:
+            return Phase.DECODE
+        if seq_len > 1:
+            return Phase.PREFILL
+        return Phase.UNKNOWN

glasstrace/profiler.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""The user-facing profile() context manager."""
+from __future__ import annotations
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from typing import Callable
+import torch.nn as nn
+from glasstrace.hooks import ModuleEvent, ModuleTracer
+from glasstrace.report import format_report
+@dataclass
+class ProfileResult:
+    """Holds events and memory samples from a profile() block.
+    profile() constructs this with the tracer's own lists, so both lists
+    fill in as hooks fire while the with-block runs."""
+    # One ModuleEvent per recorded forward pass of a hooked module.
+    events: list[ModuleEvent] = field(default_factory=list)
+    # Memory sample dicts collected by the tracer.
+    memory_samples: list[dict] = field(default_factory=list)
+    def report(self, top_n: int = 20) -> str:
+        """Return a formatted two-section text report.
+        Args:
+            top_n: forwarded to format_report() as its ``top_n`` argument.
+        """
+        return format_report(self.events, self.memory_samples, top_n=top_n)
+    def __len__(self) -> int:
+        # The length of a result is the number of recorded events.
+        return len(self.events)
+@contextmanager
+def profile(model: nn.Module, warmup: Callable[[], None] | None = None):
+    """Profile a model's forward passes within a with-block.
+    Args:
+        model: the model to instrument.
+        warmup: optional zero-arg callable run once before profiling starts,
+            with its events discarded. Strongly recommended on CUDA to avoid
+            cold-start timing artifacts.
+    Yields:
+        A ProfileResult that shares its event and memory-sample lists with
+        the tracer, so it is populated as the block executes.
+    Example:
+        def warmup():
+            model.generate(**inputs, max_new_tokens=5)
+        with glasstrace.profile(model, warmup=warmup) as p:
+            model.generate(**inputs, max_new_tokens=50)
+        print(p.report())
+    """
+    if warmup is not None:
+        import torch
+        # Warmup runs before any hook is attached, so nothing it does is recorded.
+        with torch.no_grad():
+            warmup()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+    tracer = ModuleTracer()
+    result = ProfileResult(events=tracer.events, memory_samples=tracer.memory_samples)
+    try:
+        # attach() sits inside the try so that hooks registered before a
+        # failure partway through the model walk are still removed by detach().
+        tracer.attach(model)
+        yield result
+    finally:
+        tracer.detach()

glasstrace/report.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Text-table report generation from ModuleEvent lists."""
+from __future__ import annotations
+from collections import defaultdict
+from typing import Iterable
+from tabulate import tabulate
+from glasstrace.hooks import ModuleEvent, Phase
+def _aggregate(events: list[ModuleEvent]) -> list[dict]:
+    """Group events by module path and rank the groups by total time.
+    Each returned dict has the keys "path", "calls", "total_ms",
+    "module_type" and "device"; the list is ordered by "total_ms",
+    largest first. "module_type" and "device" come from the last event
+    seen for that path.
+    """
+    per_path: dict[str, dict] = {}
+    for event in events:
+        bucket = per_path.setdefault(
+            event.module_path,
+            {"calls": 0, "total_ms": 0.0, "module_type": "", "device": ""},
+        )
+        bucket["calls"] += 1
+        bucket["total_ms"] += event.duration_ms
+        bucket["module_type"] = event.module_type
+        bucket["device"] = event.device
+    ranked = sorted(
+        per_path.items(), key=lambda item: item[1]["total_ms"], reverse=True
+    )
+    return [{"path": path, **stats} for path, stats in ranked]
+def _section_table(rows: list[dict], total_ms: float, extra_col: str | None = None) -> str:
+    """Render aggregated module rows as a plain-text table.
+    Args:
+        rows: dicts with "path", "module_type", "calls" and "total_ms" keys.
+        total_ms: the phase total used as the denominator of "% of phase".
+        extra_col: accepted but not used by this function.
+    Returns:
+        The table followed by a newline, or a "(no events)" placeholder
+        line when ``rows`` is empty.
+    """
+    if not rows:
+        return "  (no events)\n"
+    def share_of_phase(ms: float) -> str:
+        # A non-positive phase total cannot yield a percentage; show a dash.
+        if total_ms > 0:
+            return f"{ms / total_ms * 100:.1f}"
+        return "—"
+    table_rows = [
+        {
+            "Module": entry["path"],
+            "Type": entry["module_type"],
+            "Calls": entry["calls"],
+            "Total ms": f"{entry['total_ms']:.2f}",
+            "Per-call ms": f"{entry['total_ms'] / entry['calls']:.2f}",
+            "% of phase": share_of_phase(entry["total_ms"]),
+        }
+        for entry in rows
+    ]
+    rendered = tabulate(table_rows, headers="keys", tablefmt="simple")
+    return rendered + "\n"
+def _count_passes(phase_events: list[ModuleEvent]) -> int:
+    """Count forward passes as the number of events sharing the first event's module path."""
+    if not phase_events:
+        return 0
+    first_path = phase_events[0].module_path
+    return sum(1 for e in phase_events if e.module_path == first_path)
+def format_report(
+    events: Iterable[ModuleEvent],
+    memory_samples: list[dict] | None = None,
+    top_n: int = 20,
+) -> str:
+    """Produce a two-section report: prefill and decode.
+    Args:
+        events: recorded module events; consumed once and copied to a list.
+        memory_samples: optional dicts with "phase" and "memory_bytes" keys.
+            When decode-phase samples are present, the spread between their
+            minimum and maximum is reported as kv-cache growth.
+        top_n: maximum number of module rows shown in each section.
+    Returns:
+        The report text: a header, the prefill table, the decode table and,
+        if any events could not be classified, a one-line unclassified
+        summary. A short notice is returned instead when ``events`` is empty.
+    """
+    events = list(events)
+    if not events:
+        return (
+            "glasstrace: no events recorded.\n"
+            "(Was the model actually run inside the profile() block?)"
+        )
+    device = events[0].device
+    prefill = [e for e in events if e.phase == Phase.PREFILL]
+    decode = [e for e in events if e.phase == Phase.DECODE]
+    unknown = [e for e in events if e.phase == Phase.UNKNOWN]
+    prefill_ms = sum(e.duration_ms for e in prefill)
+    decode_ms = sum(e.duration_ms for e in decode)
+    total_ms = sum(e.duration_ms for e in events)
+    # Every hooked module fires once per pass, so counting the events of a
+    # single module gives the pass count. This is a plain count: the earlier
+    # `path and len(...)` form returned "" for an empty module path (a root
+    # module that is itself hooked) and then failed on the `> 0` comparison.
+    prefill_passes = _count_passes(prefill)
+    decode_passes = _count_passes(decode)
+    per_token_ms = decode_ms / decode_passes if decode_passes > 0 else 0.0
+    # Memory summary
+    mem_summary = ""
+    if memory_samples:
+        decode_samples = [s for s in memory_samples if s["phase"] == "decode"]
+        if decode_samples:
+            min_mem = min(s["memory_bytes"] for s in decode_samples)
+            max_mem = max(s["memory_bytes"] for s in decode_samples)
+            kv_growth_mb = (max_mem - min_mem) / (1024 ** 2)
+            mem_summary = f"  kv-cache growth during decode: {kv_growth_mb:.1f} MB\n"
+    header = (
+        f"\nglasstrace report\n"
+        f"  modules profiled: {len({e.module_path for e in events})}\n"
+        f"  total events: {len(events)}\n"
+        f"  total measured time: {total_ms:.2f} ms\n"
+        f"  device: {device}\n"
+        + mem_summary
+    )
+    # Prefill section; the pass count is measured rather than assumed to be 1.
+    prefill_pass_label = "pass" if prefill_passes == 1 else "passes"
+    prefill_header = (
+        f"\n── prefill ({prefill_passes} {prefill_pass_label}, {prefill_ms:.1f} ms total) "
+        + "─" * 40 + "\n"
+    )
+    prefill_rows = _aggregate(prefill)[:top_n]
+    prefill_table = _section_table(prefill_rows, prefill_ms)
+    # Decode section
+    decode_header = (
+        f"\n── decode ({decode_passes} passes, {decode_ms:.1f} ms total"
+        + (f", {per_token_ms:.1f} ms/token avg" if per_token_ms > 0 else "")
+        + ") " + "─" * 20 + "\n"
+    )
+    decode_rows = _aggregate(decode)[:top_n]
+    decode_table = _section_table(decode_rows, decode_ms)
+    # Unknown section (should be empty for standard transformer runs)
+    unknown_section = ""
+    if unknown:
+        unknown_ms = sum(e.duration_ms for e in unknown)
+        unknown_section = (
+            f"\n── unclassified ({len(unknown)} events, {unknown_ms:.1f} ms) ──\n"
+        )
+    return (
+        header
+        + prefill_header
+        + prefill_table
+        + decode_header
+        + decode_table
+        + unknown_section
+    )

glasstrace-0.2.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,77 @@
+Metadata-Version: 2.4
+Name: glasstrace
+Version: 0.2.0
+Summary: Per-layer latency and memory profiler for transformer inference.
+Project-URL: Homepage, https://github.com/manu-j3400/glasstrace
+Project-URL: Repository, https://github.com/manu-j3400/glasstrace
+Project-URL: Issues, https://github.com/manu-j3400/glasstrace/issues
+Author-email: Manu <therealmanujawahar@gmail.com>
+License: MIT
+License-File: LICENSE
+Keywords: inference,llm,profiler,pytorch,transformers
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Requires-Dist: tabulate>=0.9
+Requires-Dist: torch>=2.0
+Requires-Dist: transformers>=4.40
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Requires-Dist: ruff>=0.5; extra == 'dev'
+Description-Content-Type: text/markdown
+# glasstrace
+[![CI](https://github.com/manu-j3400/glasstrace/actions/workflows/ci.yml/badge.svg)](https://github.com/manu-j3400/glasstrace/actions/workflows/ci.yml)
+> Per-layer latency and memory profiler for transformer inference.
+`glasstrace` shows you where time actually goes inside your LLM. It decomposes inference cost by layer, operation, and inference phase (prefill vs. decode).
+## Why
+When you call `model.generate()`, you get a number: total latency. That's not enough to make anything faster. `glasstrace` turns the black box into a measured picture: which layers are slow, where memory pressure lives, and what changes when you tweak batch size or sequence length.
+## Install
+```bash
+pip install git+https://github.com/manu-j3400/glasstrace.git
+```
+PyPI release coming with v1.0.
+## Usage
+```python
+import glasstrace
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+inputs = tokenizer("Hello, world!", return_tensors="pt")
+with glasstrace.profile(model) as p:
+    out = model.generate(**inputs, max_new_tokens=50)
+print(p.report())
+```
+## Status
+**v0.2.0 — alpha.** Works on Qwen 2.5 0.5B and Llama 3.2 1B with CUDA. Tracks `nn.Linear` and `nn.LayerNorm` modules, splits timings into prefill and decode phases, and samples CUDA memory during the run. Broader model coverage is planned for v0.3; see the roadmap below.
+## Roadmap
+- [x] v0.1 — Per-module CUDA timing, text-table report
+- [x] v0.2 — Prefill vs decode split, memory tracking, HTML report
+- [ ] v0.3 — Multi-model tested coverage, CLI
+- [ ] v0.4 — Comparative analyses across Llama, Qwen, Phi (blog post)
+- [ ] v1.0 — PyPI release, docs, demo video
+## License
+MIT

glasstrace-0.2.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,8 @@
+glasstrace/__init__.py,sha256=eMu0nO65p3mgvtn3j1SKJTkDVxiA70dEnZUSDz-d7Ck,201
+glasstrace/hooks.py,sha256=EpH-sRjpTlLUiV4IK3Vek8RhMHaRJWLMpxqYXn_DsiQ,6883
+glasstrace/profiler.py,sha256=TtyLyGSkyNId1rfwzZGJLlco9twFhZbG11yzahqGo4I,1804
+glasstrace/report.py,sha256=qtSVIdpMBozyCkKjfd-G9ViEpfZynljOaINHvPBDGSg,4410
+glasstrace-0.2.0.dist-info/METADATA,sha256=PnVOhQWIBb6PJTSnfMvJyfZZYBMEP7QeUIy1ddsoBB0,2813
+glasstrace-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+glasstrace-0.2.0.dist-info/licenses/LICENSE,sha256=M0ttOeZIwgeeugfuDScWfFP88rhf_MKucUHTS5V_Rvk,1069
+glasstrace-0.2.0.dist-info/RECORD,,

glasstrace-0.2.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

glasstrace-0.2.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Manu Jawahar
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.