mpiptop 0.1.1-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mpiptop.py CHANGED
@@ -6,6 +6,7 @@ from __future__ import annotations
6
6
  import argparse
7
7
  import colorsys
8
8
  import dataclasses
9
+ import datetime
9
10
  import hashlib
10
11
  import json
11
12
  import os
@@ -52,7 +53,9 @@ class ProgramSelector:
52
53
 
53
54
  @dataclasses.dataclass(frozen=True)
54
55
  class State:
56
+ launcher: str
55
57
  prte_pid: int
58
+ slurm_job_id: Optional[str]
56
59
  rankfile: str
57
60
  ranks: List[RankInfo]
58
61
  selector: ProgramSelector
@@ -79,10 +82,40 @@ class ParsedPySpy:
79
82
  threads: List[ThreadBlock]
80
83
 
81
84
 
85
+ @dataclasses.dataclass(frozen=True)
86
+ class RankSnapshot:
87
+ output: Optional[str]
88
+ error: Optional[str]
89
+ stack_lines: List[str]
90
+ details: List[str]
91
+
92
+
93
+ @dataclasses.dataclass
94
+ class SessionEvent:
95
+ timestamp: float
96
+ ranks: Dict[int, Dict[str, object]]
97
+
98
+
99
+ @dataclasses.dataclass
100
+ class TimelineLevel:
101
+ start: int
102
+ end: int
103
+ selected: int = 0
104
+ buckets: List[Tuple[int, int]] = dataclasses.field(default_factory=list)
105
+
106
+
82
107
  PUNCT_STYLE = "grey62"
83
108
  BORDER_STYLE = "grey62"
84
109
  KEY_STYLE = "#7ad7ff"
85
110
  HEADER_HEIGHT = 3
111
+ SESSION_VERSION = 1
112
+ SESSION_LOG_FILE = "session.jsonl"
113
+ SESSION_METADATA_FILE = "metadata.json"
114
+ SESSION_EVENTS_FILE = "events.jsonl"
115
+ SPARKLINE_CHARS = "▁▂▃▄▅▆▇█"
116
+ HEARTBEAT_INTERVAL = 60
117
+ DIVERGENCE_THRESHOLD = 0.5
118
+ DIVERGENCE_INTERVAL = 60
86
119
  ENV_KEYS = (
87
120
  "PATH",
88
121
  "LD_LIBRARY_PATH",
@@ -101,6 +134,7 @@ import os
101
134
 
102
135
  TARGET = os.environ.get("MPIPTOP_TARGET", "")
103
136
  MODULE = os.environ.get("MPIPTOP_MODULE", "")
137
+ JOB_ID = os.environ.get("MPIPTOP_SLURM_JOB_ID", "")
104
138
  ENV_KEYS = [
105
139
  "PATH",
106
140
  "LD_LIBRARY_PATH",
@@ -174,6 +208,12 @@ def matches(cmd):
174
208
  return True
175
209
 
176
210
 
211
+ def matches_job(env):
212
+ if not JOB_ID:
213
+ return True
214
+ return env.get("SLURM_JOB_ID") == JOB_ID
215
+
216
+
177
217
  results = []
178
218
  for pid in os.listdir("/proc"):
179
219
  if not pid.isdigit():
@@ -183,7 +223,14 @@ for pid in os.listdir("/proc"):
183
223
  if not matches(cmd):
184
224
  continue
185
225
  env = read_env(pid)
186
- rank = env.get("OMPI_COMM_WORLD_RANK") or env.get("PMIX_RANK") or env.get("PMI_RANK")
226
+ if not matches_job(env):
227
+ continue
228
+ rank = (
229
+ env.get("OMPI_COMM_WORLD_RANK")
230
+ or env.get("PMIX_RANK")
231
+ or env.get("PMI_RANK")
232
+ or env.get("SLURM_PROCID")
233
+ )
187
234
  if rank is None:
188
235
  continue
189
236
  results.append(
@@ -203,6 +250,266 @@ print(json.dumps(results))
203
250
  """
204
251
 
205
252
 
253
+ def iso_timestamp(value: Optional[float] = None) -> str:
254
+ ts = time.time() if value is None else value
255
+ return datetime.datetime.fromtimestamp(ts).isoformat(timespec="seconds")
256
+
257
+
258
+ def default_session_path() -> str:
259
+ stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
260
+ return os.path.abspath(f"mpiptop-session-{stamp}.jsonl")
261
+
262
+
263
+ def normalize_session_path(path: str) -> Tuple[str, str]:
264
+ if path.endswith(".jsonl") or (os.path.exists(path) and os.path.isfile(path)):
265
+ base_dir = os.path.dirname(path) or "."
266
+ return base_dir, path
267
+ return path, os.path.join(path, SESSION_LOG_FILE)
268
+
269
+
270
+ def ensure_session_path(path: str) -> Tuple[str, str]:
271
+ base_dir, log_path = normalize_session_path(path)
272
+ if os.path.exists(path):
273
+ if os.path.isdir(path):
274
+ if os.listdir(path):
275
+ if os.path.exists(log_path) or os.path.exists(os.path.join(path, SESSION_METADATA_FILE)):
276
+ return base_dir, log_path
277
+ raise SystemExit(f"record path exists and is not empty: {path}")
278
+ elif os.path.isfile(path):
279
+ return base_dir, log_path
280
+ else:
281
+ raise SystemExit(f"record path exists and is not a file or directory: {path}")
282
+ else:
283
+ if log_path.endswith(".jsonl"):
284
+ os.makedirs(base_dir, exist_ok=True)
285
+ else:
286
+ os.makedirs(base_dir, exist_ok=True)
287
+ return base_dir, log_path
288
+
289
+
290
+ def write_session_metadata(log_path: str, state: State, refresh: int, pythonpath: str) -> None:
291
+ payload = {
292
+ "version": SESSION_VERSION,
293
+ "created_at": iso_timestamp(),
294
+ "refresh": refresh,
295
+ "rankfile": state.rankfile,
296
+ "prte_pid": state.prte_pid,
297
+ "launcher": state.launcher,
298
+ "slurm_job_id": state.slurm_job_id,
299
+ "selector": dataclasses.asdict(state.selector),
300
+ "ranks": [dataclasses.asdict(rank) for rank in state.ranks],
301
+ "pythonpath": pythonpath,
302
+ "record_on_change": True,
303
+ }
304
+ if os.path.exists(log_path) and os.path.getsize(log_path) > 0:
305
+ return
306
+ with open(log_path, "a", encoding="utf-8") as handle:
307
+ handle.write(json.dumps({"type": "metadata", "data": payload}) + "\n")
308
+
309
+
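For reference, the session log written by write_session_metadata and RecordSession is a JSONL file: one {"type": "metadata", ...} envelope followed by {"type": "event", ...} lines keyed by rank. A minimal standalone sketch of that shape (the host, pid, timestamp, and session.jsonl path are illustrative placeholders, not output from a real run):

    import json

    # Hypothetical records mirroring the {"type": ..., "data": ...} envelopes above.
    metadata_line = json.dumps({
        "type": "metadata",
        "data": {"version": 1, "refresh": 10, "launcher": "slurm",
                 "ranks": [{"rank": 0, "host": "node01"}]},
    })
    event_line = json.dumps({
        "type": "event",
        "data": {"t": 1700000000.0,
                 "ranks": {"0": {"host": "node01", "pid": 4242, "py_spy": "..."}}},
    })

    with open("session.jsonl", "w", encoding="utf-8") as handle:
        handle.write(metadata_line + "\n")
        handle.write(event_line + "\n")

    # Reading it back follows the shape load_session_metadata/load_session_events expect.
    with open("session.jsonl", encoding="utf-8") as handle:
        for raw in handle:
            record = json.loads(raw)
            print(record["type"], sorted(record["data"].keys()))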
310
+ def load_session_metadata(path: str) -> Dict[str, object]:
311
+ base_dir, log_path = normalize_session_path(path)
312
+ metadata_path = os.path.join(base_dir, SESSION_METADATA_FILE)
313
+ if os.path.exists(metadata_path):
314
+ with open(metadata_path, "r", encoding="utf-8") as handle:
315
+ return json.load(handle)
316
+ if not os.path.exists(log_path):
317
+ raise SystemExit(f"metadata not found in {path}")
318
+ with open(log_path, "r", encoding="utf-8") as handle:
319
+ for line in handle:
320
+ raw = line.strip()
321
+ if not raw:
322
+ continue
323
+ data = json.loads(raw)
324
+ if isinstance(data, dict) and data.get("type") == "metadata":
325
+ payload = data.get("data")
326
+ if isinstance(payload, dict):
327
+ return payload
328
+ if isinstance(data, dict) and "version" in data and "ranks" in data:
329
+ return data
330
+ raise SystemExit(f"metadata not found in {log_path}")
331
+
332
+
333
+ def read_last_event(path: str) -> Optional[Dict[str, object]]:
334
+ if not os.path.exists(path):
335
+ return None
336
+ with open(path, "rb") as handle:
337
+ handle.seek(0, os.SEEK_END)
338
+ pos = handle.tell()
339
+ if pos == 0:
340
+ return None
341
+ chunk = b""
342
+ while pos > 0:
343
+ step = min(4096, pos)
344
+ pos -= step
345
+ handle.seek(pos)
346
+ chunk = handle.read(step) + chunk
347
+ if b"\n" in chunk:
348
+ break
349
+ lines = [line for line in chunk.splitlines() if line.strip()]
350
+ while lines:
351
+ raw = lines.pop().decode("utf-8", errors="ignore")
352
+ try:
353
+ data = json.loads(raw)
354
+ except json.JSONDecodeError:
355
+ continue
356
+ if isinstance(data, dict) and data.get("type") == "metadata":
357
+ continue
358
+ if isinstance(data, dict) and data.get("type") == "event":
359
+ payload = data.get("data")
360
+ if isinstance(payload, dict):
361
+ return payload
362
+ return data
363
+ return None
364
+
365
+
366
+ def load_session_events(path: str) -> List[SessionEvent]:
367
+ base_dir, log_path = normalize_session_path(path)
368
+ events_path = os.path.join(base_dir, SESSION_EVENTS_FILE)
369
+ if not os.path.exists(events_path) and not os.path.exists(log_path):
370
+ raise SystemExit(f"events not found in {path}")
371
+ path_to_read = events_path if os.path.exists(events_path) else log_path
372
+ events: List[SessionEvent] = []
373
+ with open(path_to_read, "r", encoding="utf-8") as handle:
374
+ for line in handle:
375
+ raw = line.strip()
376
+ if not raw:
377
+ continue
378
+ data = json.loads(raw)
379
+ if isinstance(data, dict) and data.get("type") == "metadata":
380
+ continue
381
+ if isinstance(data, dict) and data.get("type") == "event":
382
+ data = data.get("data", {})
383
+ if not isinstance(data, dict):
384
+ continue
385
+ timestamp = float(data.get("t", 0.0))
386
+ ranks_raw = data.get("ranks", {})
387
+ ranks: Dict[int, Dict[str, object]] = {}
388
+ for key, value in ranks_raw.items():
389
+ try:
390
+ rank_id = int(key)
391
+ except (TypeError, ValueError):
392
+ continue
393
+ ranks[rank_id] = value
394
+ events.append(SessionEvent(timestamp=timestamp, ranks=ranks))
395
+ return events
396
+
397
+
398
+ def signature_from_snapshot(snapshot: Optional[RankSnapshot]) -> str:
399
+ if snapshot is None:
400
+ return "missing"
401
+ if snapshot.error:
402
+ return f"error:{snapshot.error}"
403
+ if snapshot.output is None:
404
+ return "missing"
405
+ digest = hashlib.sha1(snapshot.output.encode("utf-8", errors="ignore")).hexdigest()
406
+ return digest
407
+
408
+
409
+ def snapshot_signature(ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]) -> Dict[int, str]:
410
+ signature: Dict[int, str] = {}
411
+ for info in ranks:
412
+ signature[info.rank] = signature_from_snapshot(snapshots.get(info.rank))
413
+ return signature
414
+
415
+
416
+ def signature_from_event(event: Dict[str, object]) -> Optional[Dict[int, str]]:
417
+ ranks = event.get("ranks", {})
418
+ if not isinstance(ranks, dict):
419
+ return None
420
+ signature: Dict[int, str] = {}
421
+ for key, payload in ranks.items():
422
+ try:
423
+ rank_id = int(key)
424
+ except (TypeError, ValueError):
425
+ continue
426
+ if not isinstance(payload, dict):
427
+ signature[rank_id] = "missing"
428
+ continue
429
+ if payload.get("error"):
430
+ signature[rank_id] = f"error:{payload.get('error')}"
431
+ elif payload.get("py_spy"):
432
+ digest = hashlib.sha1(
433
+ str(payload.get("py_spy")).encode("utf-8", errors="ignore")
434
+ ).hexdigest()
435
+ signature[rank_id] = digest
436
+ else:
437
+ signature[rank_id] = "missing"
438
+ return signature
439
+
440
+
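To make the change detection used by RecordSession below concrete: each rank's py-spy text is reduced to a SHA-1 digest, and a new event is appended only when the per-rank signature map differs from the previous one. A standalone sketch of the same idea (the stack strings are made up):

    import hashlib

    def sig(output):
        # Same reduction as signature_from_snapshot: hash the raw py-spy text.
        return hashlib.sha1(output.encode("utf-8", errors="ignore")).hexdigest()

    previous = {0: sig("main -> solve -> mpi_wait"), 1: sig("main -> solve -> mpi_wait")}
    current = {0: sig("main -> solve -> mpi_wait"), 1: sig("main -> solve -> compute")}

    print(previous == current)  # False: rank 1 moved, so record_if_changed would append an event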
441
+ class RecordSession:
442
+ def __init__(self, path: str, state: State, refresh: int, pythonpath: str):
443
+ self.base_dir, self.log_path = ensure_session_path(path)
444
+ write_session_metadata(self.log_path, state, refresh, pythonpath)
445
+ self.handle = open(self.log_path, "a", encoding="utf-8")
446
+ self.event_count = 0
447
+ self.last_signature: Optional[Dict[int, str]] = None
448
+ last_event = read_last_event(self.log_path)
449
+ if last_event:
450
+ self.last_signature = signature_from_event(last_event)
451
+ self.event_count = self._count_events()
452
+
453
+ def _count_events(self) -> int:
454
+ if not os.path.exists(self.log_path):
455
+ return 0
456
+ count = 0
457
+ with open(self.log_path, "r", encoding="utf-8") as handle:
458
+ for line in handle:
459
+ raw = line.strip()
460
+ if not raw:
461
+ continue
462
+ try:
463
+ data = json.loads(raw)
464
+ except json.JSONDecodeError:
465
+ continue
466
+ if isinstance(data, dict) and data.get("type") == "metadata":
467
+ continue
468
+ count += 1
469
+ return count
470
+
471
+ def record_if_changed(
472
+ self,
473
+ state: State,
474
+ rank_to_proc: Dict[int, RankProcess],
475
+ snapshots: Dict[int, RankSnapshot],
476
+ ) -> bool:
477
+ signature = snapshot_signature(state.ranks, snapshots)
478
+ if self.last_signature is not None and signature == self.last_signature:
479
+ return False
480
+ payload: Dict[str, object] = {"t": time.time(), "ranks": {}}
481
+ ranks_payload: Dict[str, object] = {}
482
+ for info in state.ranks:
483
+ rank = info.rank
484
+ proc = rank_to_proc.get(rank)
485
+ snapshot = snapshots.get(rank)
486
+ entry: Dict[str, object] = {"host": info.host}
487
+ if proc is not None:
488
+ entry["pid"] = proc.pid
489
+ entry["cmdline"] = proc.cmdline
490
+ entry["rss_kb"] = proc.rss_kb
491
+ if snapshot is None:
492
+ entry["error"] = "No data"
493
+ elif snapshot.error:
494
+ entry["error"] = snapshot.error
495
+ elif snapshot.output is not None:
496
+ entry["py_spy"] = snapshot.output
497
+ else:
498
+ entry["error"] = "No data"
499
+ ranks_payload[str(rank)] = entry
500
+ payload["ranks"] = ranks_payload
501
+ self.handle.write(json.dumps({"type": "event", "data": payload}) + "\n")
502
+ self.handle.flush()
503
+ self.last_signature = signature
504
+ self.event_count += 1
505
+ return True
506
+
507
+ def close(self) -> None:
508
+ try:
509
+ self.handle.close()
510
+ except Exception:
511
+ pass
512
+
206
513
  def read_ps() -> List[Proc]:
207
514
  result = subprocess.run(
208
515
  ["ps", "-eo", "pid=,ppid=,args="],
@@ -410,6 +717,7 @@ def matches_python_cmd(cmd: List[str], selector: ProgramSelector) -> bool:
410
717
 
411
718
  def find_rank_pids_local(
412
719
  selector: ProgramSelector,
720
+ slurm_job_id: Optional[str],
413
721
  ) -> List[Tuple[int, int, str, Optional[int], Optional[str], Dict[str, str]]]:
414
722
  results: List[Tuple[int, int, str, Optional[int], Optional[str], Dict[str, str]]] = []
415
723
  for pid in os.listdir("/proc"):
@@ -428,7 +736,14 @@ def find_rank_pids_local(
428
736
  continue
429
737
  key, value = item.split(b"=", 1)
430
738
  env[key.decode(errors="ignore")] = value.decode(errors="ignore")
431
- rank = env.get("OMPI_COMM_WORLD_RANK") or env.get("PMIX_RANK") or env.get("PMI_RANK")
739
+ if slurm_job_id and env.get("SLURM_JOB_ID") != slurm_job_id:
740
+ continue
741
+ rank = (
742
+ env.get("OMPI_COMM_WORLD_RANK")
743
+ or env.get("PMIX_RANK")
744
+ or env.get("PMI_RANK")
745
+ or env.get("SLURM_PROCID")
746
+ )
432
747
  if rank is None:
433
748
  continue
434
749
  rss_kb = read_rss_kb(int(pid))
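The rank lookup now falls through several launcher-specific environment variables, since srun sets SLURM_PROCID rather than the Open MPI/PMIx names. A standalone illustration of the same precedence (the environment dicts are made up):

    def rank_from_env(env):
        # Same precedence as above: Open MPI, then PMIx, then PMI, then Slurm.
        return (
            env.get("OMPI_COMM_WORLD_RANK")
            or env.get("PMIX_RANK")
            or env.get("PMI_RANK")
            or env.get("SLURM_PROCID")
        )

    print(rank_from_env({"SLURM_JOB_ID": "12345", "SLURM_PROCID": "3"}))      # "3"
    print(rank_from_env({"OMPI_COMM_WORLD_RANK": "0", "SLURM_PROCID": "7"}))  # "0" wins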
@@ -476,12 +791,15 @@ def run_ssh(host: str, command: str, timeout: int = 8) -> subprocess.CompletedPr
476
791
 
477
792
 
478
793
  def find_rank_pids_remote(
479
- host: str, selector: ProgramSelector
794
+ host: str,
795
+ selector: ProgramSelector,
796
+ slurm_job_id: Optional[str],
480
797
  ) -> Tuple[List[Tuple[int, int, str, Optional[int], Optional[str], Dict[str, str]]], Optional[str]]:
481
798
  env_prefix = build_env_prefix(
482
799
  {
483
800
  "MPIPTOP_TARGET": selector.script or "",
484
801
  "MPIPTOP_MODULE": selector.module or "",
802
+ "MPIPTOP_SLURM_JOB_ID": slurm_job_id or "",
485
803
  }
486
804
  )
487
805
  remote_cmd = f"{env_prefix}python3 - <<'PY'\n{REMOTE_FINDER_SCRIPT}\nPY"
@@ -1134,6 +1452,9 @@ def build_header(
1134
1452
  program_lines = wrap_program_lines(state.selector, width)
1135
1453
  if not program_lines:
1136
1454
  program_lines = [Text("python")]
1455
+ for line in program_lines:
1456
+ line.no_wrap = True
1457
+ line.overflow = "crop"
1137
1458
 
1138
1459
  controls_plain = "q quit | space refresh | t threads | d details"
1139
1460
  padding = max(0, width - len(controls_plain))
@@ -1155,6 +1476,8 @@ def build_header(
1155
1476
  text.append_text(line)
1156
1477
  text.append("\n")
1157
1478
  text.append_text(line2)
1479
+ text.no_wrap = True
1480
+ text.overflow = "crop"
1158
1481
  return text, len(program_lines) + 1
1159
1482
 
1160
1483
 
@@ -1246,18 +1569,536 @@ def build_details_text(
1246
1569
  return output
1247
1570
 
1248
1571
 
1572
+ def format_elapsed(start: Optional[float]) -> str:
1573
+ if start is None:
1574
+ return "0:00"
1575
+ elapsed = max(0, int(time.time() - start))
1576
+ return format_duration(elapsed)
1577
+
1578
+
1579
+ def format_duration(elapsed: int) -> str:
1580
+ hours = elapsed // 3600
1581
+ minutes = (elapsed % 3600) // 60
1582
+ seconds = elapsed % 60
1583
+ if hours:
1584
+ return f"{hours}:{minutes:02d}:{seconds:02d}"
1585
+ return f"{minutes}:{seconds:02d}"
1586
+
1587
+
1588
+ def build_live_header(
1589
+ state: State,
1590
+ last_update: str,
1591
+ refresh: int,
1592
+ record_line: Optional[str],
1593
+ width: int,
1594
+ ) -> Tuple[Text, int]:
1595
+ program_lines = wrap_program_lines(state.selector, width)
1596
+ if not program_lines:
1597
+ program_lines = [Text("python")]
1598
+ for line in program_lines:
1599
+ line.no_wrap = True
1600
+ line.overflow = "crop"
1601
+
1602
+ record_text = None
1603
+ if record_line:
1604
+ record_text = Text()
1605
+ record_text.append("REC", style="bold red")
1606
+ record_text.append(" recording: ")
1607
+ record_text.append(record_line)
1608
+ record_text.truncate(width)
1609
+ record_text.no_wrap = True
1610
+ record_text.overflow = "crop"
1611
+
1612
+ controls_plain = "q quit | space refresh | t threads | d details | r record"
1613
+ padding = max(0, width - len(controls_plain))
1614
+ controls_line = Text(" " * padding + controls_plain)
1615
+ for token in ["q", "space", "t", "d", "r"]:
1616
+ start = controls_plain.find(token)
1617
+ if start != -1:
1618
+ controls_line.stylize(KEY_STYLE, padding + start, padding + start + len(token))
1619
+ controls_line.truncate(width)
1620
+ controls_line.no_wrap = True
1621
+ controls_line.overflow = "crop"
1622
+
1623
+ text = Text()
1624
+ for idx, line in enumerate(program_lines):
1625
+ if idx:
1626
+ text.append("\n")
1627
+ text.append_text(line)
1628
+ text.append("\n")
1629
+ if record_text is not None:
1630
+ text.append_text(record_text)
1631
+ text.append("\n")
1632
+ text.append_text(controls_line)
1633
+ text.no_wrap = True
1634
+ text.overflow = "crop"
1635
+ extra_lines = 2 if record_text is not None else 1
1636
+ return text, len(program_lines) + extra_lines
1637
+
1638
+
1639
+ def build_review_header(
1640
+ state: State,
1641
+ event_index: int,
1642
+ event_total: int,
1643
+ event_time: str,
1644
+ timeline_lines: List[Text],
1645
+ width: int,
1646
+ ) -> Tuple[Text, int]:
1647
+ program_lines = wrap_program_lines(state.selector, width)
1648
+ if not program_lines:
1649
+ program_lines = [Text("python")]
1650
+ status_line = Text(
1651
+ f"review {event_index + 1}/{event_total} | {event_time}"
1652
+ )
1653
+ status_line.truncate(width)
1654
+
1655
+ controls_plain = "q quit | left/right move | down zoom | up zoom out | t threads | d details"
1656
+ padding = max(0, width - len(controls_plain))
1657
+ controls_line = Text(" " * padding + controls_plain)
1658
+ for token in ["q", "left/right", "down", "up", "t", "d"]:
1659
+ start = controls_plain.find(token)
1660
+ if start != -1:
1661
+ controls_line.stylize(KEY_STYLE, padding + start, padding + start + len(token))
1662
+ controls_line.truncate(width)
1663
+ controls_line.no_wrap = True
1664
+ controls_line.overflow = "crop"
1665
+
1666
+ text = Text()
1667
+ for idx, line in enumerate(program_lines):
1668
+ if idx:
1669
+ text.append("\n")
1670
+ text.append_text(line)
1671
+ text.append("\n")
1672
+ text.append_text(status_line)
1673
+ for line in timeline_lines:
1674
+ text.append("\n")
1675
+ text.append_text(line)
1676
+ text.append("\n")
1677
+ text.append_text(controls_line)
1678
+ text.no_wrap = True
1679
+ text.overflow = "crop"
1680
+ return text, len(program_lines) + 1 + len(timeline_lines) + 1
1681
+
1682
+
1683
+ def build_buckets(start: int, end: int, width: int) -> List[Tuple[int, int]]:
1684
+ count = max(0, end - start)
1685
+ if count == 0:
1686
+ return []
1687
+ bucket_count = max(1, min(width, count))
1688
+ base = count // bucket_count
1689
+ remainder = count % bucket_count
1690
+ buckets: List[Tuple[int, int]] = []
1691
+ current = start
1692
+ for idx in range(bucket_count):
1693
+ size = base + (1 if idx < remainder else 0)
1694
+ buckets.append((current, current + size))
1695
+ current += size
1696
+ return buckets
1697
+
1698
+
1699
+ def divergence_color(ratio: float) -> str:
1700
+ clamped = min(1.0, max(0.0, ratio))
1701
+ intensity = clamped ** 0.7
1702
+ base = (170, 170, 170)
1703
+ hot = (255, 122, 0)
1704
+ r = int(base[0] + (hot[0] - base[0]) * intensity)
1705
+ g = int(base[1] + (hot[1] - base[1]) * intensity)
1706
+ b = int(base[2] + (hot[2] - base[2]) * intensity)
1707
+ return f"#{r:02x}{g:02x}{b:02x}"
1708
+
1709
+
1710
+ def compute_event_metrics(
1711
+ events: List[SessionEvent],
1712
+ ranks: List[RankInfo],
1713
+ show_threads: bool,
1714
+ ) -> Tuple[List[int], List[float], List[int]]:
1715
+ max_stack_lens: List[int] = []
1716
+ divergence_ratios: List[float] = []
1717
+ common_prefixes: List[int] = []
1718
+ for event in events:
1719
+ stacks_by_rank: Dict[int, List[str]] = {}
1720
+ for info in ranks:
1721
+ payload = event.ranks.get(info.rank, {})
1722
+ if payload.get("error"):
1723
+ stacks_by_rank[info.rank] = []
1724
+ continue
1725
+ output = payload.get("py_spy")
1726
+ if not output:
1727
+ stacks_by_rank[info.rank] = []
1728
+ continue
1729
+ lines, _details = render_pyspy_output(str(output), show_threads)
1730
+ stacks_by_rank[info.rank] = extract_stack_lines(lines)
1731
+ max_len = max((len(stack) for stack in stacks_by_rank.values()), default=0)
1732
+ common_len = common_prefix_length(stacks_by_rank)
1733
+ similarity = float(common_len) / float(max_len) if max_len else 0.0
1734
+ ratio = 1.0 - similarity if max_len else 0.0
1735
+ max_stack_lens.append(max_len)
1736
+ divergence_ratios.append(ratio)
1737
+ common_prefixes.append(common_len)
1738
+ return max_stack_lens, divergence_ratios, common_prefixes
1739
+
1740
+
1741
+ def render_timeline_lines(
1742
+ levels: List[TimelineLevel],
1743
+ max_stack_lens: List[int],
1744
+ divergence_ratios: List[float],
1745
+ width: int,
1746
+ ) -> List[Text]:
1747
+ lines: List[Text] = []
1748
+ for level_index, level in enumerate(levels):
1749
+ level.buckets = build_buckets(level.start, level.end, width)
1750
+ if level.buckets:
1751
+ level.selected = max(0, min(level.selected, len(level.buckets) - 1))
1752
+ stats: List[Tuple[int, float]] = []
1753
+ for start, end in level.buckets:
1754
+ bucket_heights = max_stack_lens[start:end]
1755
+ bucket_ratios = divergence_ratios[start:end]
1756
+ height = max(bucket_heights) if bucket_heights else 0
1757
+ ratio = max(bucket_ratios) if bucket_ratios else 0.0
1758
+ stats.append((height, ratio))
1759
+ max_height = max((height for height, _ in stats), default=1)
1760
+ if max_height <= 0:
1761
+ max_height = 1
1762
+ text = Text()
1763
+ for idx, (height, ratio) in enumerate(stats):
1764
+ normalized = float(height) / float(max_height) if max_height else 0.0
1765
+ level_idx = int(round(normalized * (len(SPARKLINE_CHARS) - 1)))
1766
+ level_idx = max(0, min(level_idx, len(SPARKLINE_CHARS) - 1))
1767
+ char = SPARKLINE_CHARS[level_idx]
1768
+ style = divergence_color(ratio)
1769
+ if idx == level.selected:
1770
+ if level_index == len(levels) - 1:
1771
+ style = f"{style} bold underline"
1772
+ else:
1773
+ style = f"{style} underline"
1774
+ text.append(char, style=style)
1775
+ text.no_wrap = True
1776
+ text.overflow = "crop"
1777
+ lines.append(text)
1778
+ return lines
1779
+
1780
+
1781
+ def event_snapshots_from_event(
1782
+ event: SessionEvent,
1783
+ ranks: List[RankInfo],
1784
+ show_threads: bool,
1785
+ ) -> Dict[int, RankSnapshot]:
1786
+ snapshots: Dict[int, RankSnapshot] = {}
1787
+ for info in ranks:
1788
+ payload = event.ranks.get(info.rank)
1789
+ if not payload:
1790
+ snapshots[info.rank] = RankSnapshot(
1791
+ output=None,
1792
+ error="No data",
1793
+ stack_lines=["No data"],
1794
+ details=[],
1795
+ )
1796
+ continue
1797
+ if payload.get("error"):
1798
+ snapshots[info.rank] = RankSnapshot(
1799
+ output=None,
1800
+ error=str(payload.get("error")),
1801
+ stack_lines=[str(payload.get("error"))],
1802
+ details=[],
1803
+ )
1804
+ continue
1805
+ output = payload.get("py_spy")
1806
+ if not output:
1807
+ snapshots[info.rank] = RankSnapshot(
1808
+ output=None,
1809
+ error="No data",
1810
+ stack_lines=["No data"],
1811
+ details=[],
1812
+ )
1813
+ continue
1814
+ lines, details = render_pyspy_output(str(output), show_threads)
1815
+ snapshots[info.rank] = RankSnapshot(
1816
+ output=str(output),
1817
+ error=None,
1818
+ stack_lines=lines,
1819
+ details=details,
1820
+ )
1821
+ return snapshots
1822
+
1823
+
1824
+ def rank_to_proc_from_event(
1825
+ event: SessionEvent,
1826
+ ranks: List[RankInfo],
1827
+ ) -> Dict[int, RankProcess]:
1828
+ rank_to_proc: Dict[int, RankProcess] = {}
1829
+ for info in ranks:
1830
+ payload = event.ranks.get(info.rank)
1831
+ if not payload:
1832
+ continue
1833
+ pid = payload.get("pid")
1834
+ cmdline = payload.get("cmdline")
1835
+ rss_kb = payload.get("rss_kb")
1836
+ if pid is None or cmdline is None:
1837
+ continue
1838
+ try:
1839
+ pid_value = int(pid)
1840
+ except (TypeError, ValueError):
1841
+ continue
1842
+ rss_value = None
1843
+ if rss_kb is not None:
1844
+ try:
1845
+ rss_value = int(rss_kb)
1846
+ except (TypeError, ValueError):
1847
+ rss_value = None
1848
+ rank_to_proc[info.rank] = RankProcess(
1849
+ pid=pid_value,
1850
+ cmdline=str(cmdline),
1851
+ rss_kb=rss_value,
1852
+ python_exe=None,
1853
+ env={},
1854
+ )
1855
+ return rank_to_proc
1856
+
1857
+
1858
+ def compute_divergence_from_snapshots(
1859
+ ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]
1860
+ ) -> Tuple[float, int, int]:
1861
+ stack_lines_by_rank = {
1862
+ info.rank: extract_stack_lines(snapshots.get(info.rank, RankSnapshot(None, "No data", [], [])).stack_lines)
1863
+ for info in ranks
1864
+ }
1865
+ max_len = max((len(stack) for stack in stack_lines_by_rank.values()), default=0)
1866
+ common_len = common_prefix_length(stack_lines_by_rank)
1867
+ similarity = float(common_len) / float(max_len) if max_len else 0.0
1868
+ divergence = 1.0 - similarity if max_len else 0.0
1869
+ return divergence, common_len, max_len
1870
+
1871
+
1872
+ def read_key(timeout: float) -> Optional[str]:
1873
+ if sys.stdin not in select_with_timeout(timeout):
1874
+ return None
1875
+ key = sys.stdin.read(1)
1876
+ if key != "\x1b":
1877
+ return key
1878
+ seq = key
1879
+ for _ in range(2):
1880
+ if sys.stdin in select_with_timeout(0.01):
1881
+ seq += sys.stdin.read(1)
1882
+ if seq == "\x1b[A":
1883
+ return "up"
1884
+ if seq == "\x1b[B":
1885
+ return "down"
1886
+ if seq == "\x1b[C":
1887
+ return "right"
1888
+ if seq == "\x1b[D":
1889
+ return "left"
1890
+ return None
1891
+
1892
+
1893
+ def is_pid_alive(pid: int) -> bool:
1894
+ if pid <= 0:
1895
+ return False
1896
+ try:
1897
+ os.kill(pid, 0)
1898
+ except ProcessLookupError:
1899
+ return False
1900
+ except PermissionError:
1901
+ return True
1902
+ return True
1903
+
1904
+
1905
+ def parse_scontrol_kv(line: str) -> Dict[str, str]:
1906
+ fields: Dict[str, str] = {}
1907
+ for token in line.split():
1908
+ if "=" not in token:
1909
+ continue
1910
+ key, value = token.split("=", 1)
1911
+ fields[key] = value
1912
+ return fields
1913
+
1914
+
1915
+ def run_scontrol_show_job(job_id: str) -> Dict[str, str]:
1916
+ result = subprocess.run(
1917
+ ["scontrol", "show", "job", "-o", str(job_id)],
1918
+ capture_output=True,
1919
+ text=True,
1920
+ )
1921
+ if result.returncode != 0:
1922
+ stderr = (result.stderr or result.stdout or "").strip()
1923
+ raise SystemExit(f"scontrol show job failed for {job_id}: {stderr or 'unknown error'}")
1924
+ line = (result.stdout or "").strip()
1925
+ if not line:
1926
+ raise SystemExit(f"scontrol show job returned empty output for {job_id}")
1927
+ return parse_scontrol_kv(line)
1928
+
1929
+
1930
+ def expand_slurm_nodelist(nodelist: str) -> List[str]:
1931
+ result = subprocess.run(
1932
+ ["scontrol", "show", "hostnames", nodelist],
1933
+ capture_output=True,
1934
+ text=True,
1935
+ )
1936
+ if result.returncode != 0:
1937
+ stderr = (result.stderr or result.stdout or "").strip()
1938
+ raise SystemExit(f"scontrol show hostnames failed: {stderr or 'unknown error'}")
1939
+ hosts = [line.strip() for line in result.stdout.splitlines() if line.strip()]
1940
+ if not hosts:
1941
+ raise SystemExit(f"no hosts parsed from nodelist: {nodelist}")
1942
+ return hosts
1943
+
1944
+
1945
+ def parse_tasks_per_node(raw: str) -> List[int]:
1946
+ if not raw:
1947
+ return []
1948
+ counts: List[int] = []
1949
+ for part in raw.split(","):
1950
+ part = part.strip()
1951
+ if not part:
1952
+ continue
1953
+ match = re.match(r"(\d+)\(x(\d+)\)", part)
1954
+ if match:
1955
+ value = int(match.group(1))
1956
+ repeat = int(match.group(2))
1957
+ counts.extend([value] * repeat)
1958
+ continue
1959
+ if part.isdigit():
1960
+ counts.append(int(part))
1961
+ return counts
1962
+
1963
+
1964
+ def distribute_tasks(num_tasks: int, num_nodes: int) -> List[int]:
1965
+ if num_nodes <= 0:
1966
+ return []
1967
+ base = num_tasks // num_nodes
1968
+ remainder = num_tasks % num_nodes
1969
+ counts = [base] * num_nodes
1970
+ for idx in range(remainder):
1971
+ counts[idx] += 1
1972
+ return counts
1973
+
1974
+
1975
+ def slurm_job_to_ranks(job_id: str) -> List[RankInfo]:
1976
+ info = run_scontrol_show_job(job_id)
1977
+ nodelist = info.get("NodeList") or info.get("Nodes")
1978
+ if not nodelist:
1979
+ raise SystemExit(f"no NodeList found for slurm job {job_id}")
1980
+ hosts = expand_slurm_nodelist(nodelist)
1981
+ tasks_per_node = parse_tasks_per_node(info.get("TasksPerNode", ""))
1982
+ num_tasks = 0
1983
+ try:
1984
+ num_tasks = int(info.get("NumTasks", "0") or 0)
1985
+ except ValueError:
1986
+ num_tasks = 0
1987
+
1988
+ if len(tasks_per_node) != len(hosts):
1989
+ if num_tasks > 0:
1990
+ tasks_per_node = distribute_tasks(num_tasks, len(hosts))
1991
+ else:
1992
+ tasks_per_node = [1] * len(hosts)
1993
+
1994
+ ranks: List[RankInfo] = []
1995
+ rank_id = 0
1996
+ for host, count in zip(hosts, tasks_per_node):
1997
+ for _ in range(max(0, count)):
1998
+ ranks.append(RankInfo(rank=rank_id, host=host))
1999
+ rank_id += 1
2000
+ return ranks
2001
+
2002
+
2003
+ def resolve_slurm_job_id(args: argparse.Namespace) -> Optional[str]:
2004
+ if getattr(args, "slurm_job", None):
2005
+ return str(args.slurm_job)
2006
+ env_job = os.environ.get("SLURM_JOB_ID")
2007
+ if env_job:
2008
+ return env_job
2009
+ user = os.environ.get("USER")
2010
+ if not user:
2011
+ return None
2012
+ result = subprocess.run(
2013
+ ["squeue", "-u", user, "-h", "-t", "R", "-o", "%i"],
2014
+ capture_output=True,
2015
+ text=True,
2016
+ )
2017
+ if result.returncode != 0:
2018
+ return None
2019
+ jobs = [line.strip() for line in result.stdout.splitlines() if line.strip()]
2020
+ if len(jobs) == 1:
2021
+ return jobs[0]
2022
+ return None
2023
+
2024
+
2025
+ def describe_slurm_jobs() -> str:
2026
+ user = os.environ.get("USER")
2027
+ if not user:
2028
+ return ""
2029
+ result = subprocess.run(
2030
+ ["squeue", "-u", user, "-h", "-o", "%i %t %j %R"],
2031
+ capture_output=True,
2032
+ text=True,
2033
+ )
2034
+ if result.returncode != 0:
2035
+ return ""
2036
+ lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
2037
+ if not lines:
2038
+ return ""
2039
+ return "\n".join(lines[:10])
2040
+
2041
+
2042
+ def is_slurm_job_alive(job_id: Optional[str]) -> bool:
2043
+ if not job_id:
2044
+ return False
2045
+ result = subprocess.run(
2046
+ ["squeue", "-j", str(job_id), "-h", "-o", "%t"],
2047
+ capture_output=True,
2048
+ text=True,
2049
+ )
2050
+ if result.returncode != 0:
2051
+ return False
2052
+ state = (result.stdout or "").strip()
2053
+ return bool(state)
2054
+
1249
2055
  def detect_state(args: argparse.Namespace) -> State:
1250
2056
  procs = read_ps()
1251
- prte = find_prterun(procs, args.prterun_pid)
1252
- rankfile = args.rankfile or find_rankfile_path(prte.args)
1253
- if not rankfile:
1254
- raise SystemExit("rankfile not found in prterun/mpirun args")
1255
- ranks = parse_rankfile(rankfile)
1256
- children = build_children_map(procs)
1257
- descendants = find_descendants(children, prte.pid)
1258
- program_proc = select_program(procs, descendants)
1259
- selector = parse_python_selector(program_proc.args if program_proc else "")
1260
- return State(prte_pid=prte.pid, rankfile=rankfile, ranks=ranks, selector=selector)
2057
+ prte_error = None
2058
+ prte = None
2059
+ try:
2060
+ prte = find_prterun(procs, args.prterun_pid)
2061
+ except SystemExit as exc:
2062
+ prte_error = str(exc)
2063
+
2064
+ if prte is not None:
2065
+ rankfile = args.rankfile or find_rankfile_path(prte.args)
2066
+ if not rankfile:
2067
+ raise SystemExit("rankfile not found in prterun/mpirun args")
2068
+ ranks = parse_rankfile(rankfile)
2069
+ children = build_children_map(procs)
2070
+ descendants = find_descendants(children, prte.pid)
2071
+ program_proc = select_program(procs, descendants)
2072
+ selector = parse_python_selector(program_proc.args if program_proc else "")
2073
+ return State(
2074
+ launcher="prte",
2075
+ prte_pid=prte.pid,
2076
+ slurm_job_id=None,
2077
+ rankfile=rankfile,
2078
+ ranks=ranks,
2079
+ selector=selector,
2080
+ )
2081
+
2082
+ slurm_job_id = resolve_slurm_job_id(args)
2083
+ if slurm_job_id:
2084
+ ranks = slurm_job_to_ranks(slurm_job_id)
2085
+ selector = ProgramSelector(module=None, script=None, display="")
2086
+ return State(
2087
+ launcher="slurm",
2088
+ prte_pid=0,
2089
+ slurm_job_id=slurm_job_id,
2090
+ rankfile=f"slurm:{slurm_job_id}",
2091
+ ranks=ranks,
2092
+ selector=selector,
2093
+ )
2094
+
2095
+ hint = describe_slurm_jobs()
2096
+ if hint:
2097
+ hint = "\n" + hint
2098
+ raise SystemExit(
2099
+ prte_error
2100
+ or f"no prterun/mpirun process found and no slurm job detected (try --slurm-job){hint}"
2101
+ )
1261
2102
 
1262
2103
 
1263
2104
  def collect_rank_pids(state: State) -> Tuple[Dict[int, RankProcess], List[str]]:
@@ -1268,10 +2109,10 @@ def collect_rank_pids(state: State) -> Tuple[Dict[int, RankProcess], List[str]]:
1268
2109
 
1269
2110
  for host in hosts:
1270
2111
  if is_local_host(host):
1271
- entries = find_rank_pids_local(state.selector)
2112
+ entries = find_rank_pids_local(state.selector, state.slurm_job_id)
1272
2113
  host_error = None
1273
2114
  else:
1274
- entries, host_error = find_rank_pids_remote(host, state.selector)
2115
+ entries, host_error = find_rank_pids_remote(host, state.selector, state.slurm_job_id)
1275
2116
  if host_error:
1276
2117
  errors.append(host_error)
1277
2118
  for rank, pid, cmd, rss_kb, python_exe, env_subset in entries:
@@ -1297,42 +2138,105 @@ def collect_stacks(
1297
2138
  pythonpath: str,
1298
2139
  show_threads: bool,
1299
2140
  install_attempted: set,
1300
- ) -> Tuple[Dict[int, List[str]], Dict[int, List[str]], List[str]]:
1301
- stacks: Dict[int, List[str]] = {}
1302
- details_by_rank: Dict[int, List[str]] = {}
2141
+ ) -> Tuple[Dict[int, RankSnapshot], List[str]]:
2142
+ snapshots: Dict[int, RankSnapshot] = {}
1303
2143
  errors: List[str] = []
1304
2144
  for entry in state.ranks:
1305
2145
  proc = rank_to_proc.get(entry.rank)
1306
2146
  if proc is None:
1307
- stacks[entry.rank] = ["No process"]
1308
- details_by_rank[entry.rank] = []
2147
+ snapshots[entry.rank] = RankSnapshot(
2148
+ output=None,
2149
+ error="No process",
2150
+ stack_lines=["No process"],
2151
+ details=[],
2152
+ )
1309
2153
  continue
1310
2154
  output, error = run_py_spy(entry.host, proc, pythonpath, install_attempted)
1311
2155
  if error:
1312
2156
  errors.append(error)
1313
- stacks[entry.rank] = [error]
1314
- details_by_rank[entry.rank] = []
2157
+ snapshots[entry.rank] = RankSnapshot(
2158
+ output=None,
2159
+ error=error,
2160
+ stack_lines=[error],
2161
+ details=[],
2162
+ )
1315
2163
  continue
1316
2164
  lines, details = render_pyspy_output(output or "", show_threads)
1317
- stacks[entry.rank] = lines
1318
- details_by_rank[entry.rank] = details
1319
- return stacks, details_by_rank, errors
2165
+ snapshots[entry.rank] = RankSnapshot(
2166
+ output=output,
2167
+ error=None,
2168
+ stack_lines=lines,
2169
+ details=details,
2170
+ )
2171
+ return snapshots, errors
1320
2172
 
1321
2173
 
1322
- def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
2174
+ def parse_live_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
1323
2175
  parser = argparse.ArgumentParser(description="Show MPI Python stacks across hosts.")
1324
2176
  parser.add_argument("--rankfile", help="Override rankfile path")
1325
2177
  parser.add_argument("--prterun-pid", type=int, help="PID of prterun/mpirun")
2178
+ parser.add_argument("--slurm-job", help="Slurm job ID to inspect")
1326
2179
  parser.add_argument("--refresh", type=int, default=10, help="Refresh interval (seconds)")
1327
2180
  parser.add_argument(
1328
2181
  "--pythonpath",
1329
2182
  help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
1330
2183
  )
2184
+ parser.add_argument(
2185
+ "--out",
2186
+ help="Output path for recordings (.jsonl file or directory)",
2187
+ )
1331
2188
  return parser.parse_args(argv)
1332
2189
 
1333
2190
 
1334
- def main(argv: Optional[Sequence[str]] = None) -> int:
1335
- args = parse_args(argv)
2191
+ def parse_review_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
2192
+ parser = argparse.ArgumentParser(description="Review a recorded mpiptop session.")
2193
+ parser.add_argument("path", help="Path to a recorded session (.jsonl file or directory)")
2194
+ return parser.parse_args(argv)
2195
+
2196
+
2197
+ def parse_summarize_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
2198
+ parser = argparse.ArgumentParser(description="Summarize a recorded mpiptop session.")
2199
+ parser.add_argument("path", help="Path to a recorded session (.jsonl file or directory)")
2200
+ parser.add_argument(
2201
+ "--format",
2202
+ choices=["text", "json"],
2203
+ default="text",
2204
+ help="Output format",
2205
+ )
2206
+ parser.add_argument(
2207
+ "--top",
2208
+ type=int,
2209
+ default=5,
2210
+ help="Top signatures to report",
2211
+ )
2212
+ return parser.parse_args(argv)
2213
+
2214
+
2215
+ def parse_record_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
2216
+ parser = argparse.ArgumentParser(description="Record an mpiptop session.")
2217
+ parser.add_argument("--rankfile", help="Override rankfile path")
2218
+ parser.add_argument("--prterun-pid", type=int, help="PID of prterun/mpirun")
2219
+ parser.add_argument("--slurm-job", help="Slurm job ID to inspect")
2220
+ parser.add_argument("--refresh", type=int, default=10, help="Refresh interval (seconds)")
2221
+ parser.add_argument(
2222
+ "--pythonpath",
2223
+ help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
2224
+ )
2225
+ parser.add_argument(
2226
+ "--out",
2227
+ help="Output path for recordings (.jsonl file or directory)",
2228
+ )
2229
+ parser.add_argument(
2230
+ "--quiet",
2231
+ action="store_true",
2232
+ help="Only print start/stop lines",
2233
+ )
2234
+ args = parser.parse_args(argv)
2235
+ args.record = True
2236
+ return args
2237
+
2238
+
2239
+ def run_live(args: argparse.Namespace) -> int:
1336
2240
  pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")
1337
2241
 
1338
2242
  state = detect_state(args)
@@ -1341,6 +2245,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1341
2245
  show_threads = False
1342
2246
  show_details = False
1343
2247
  install_attempted: set = set()
2248
+ record_session: Optional[RecordSession] = None
2249
+ recording_enabled = bool(getattr(args, "record", False))
2250
+ record_started_at: Optional[float] = None
2251
+ record_path = args.out
1344
2252
 
1345
2253
  def handle_sigint(_sig, _frame):
1346
2254
  raise KeyboardInterrupt
@@ -1357,32 +2265,60 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1357
2265
  last_update = "never"
1358
2266
  next_refresh = 0.0
1359
2267
 
2268
+ def start_recording() -> None:
2269
+ nonlocal record_session, recording_enabled, record_started_at, record_path
2270
+ if record_session is None:
2271
+ record_path = record_path or default_session_path()
2272
+ record_session = RecordSession(record_path, state, refresh, pythonpath)
2273
+ recording_enabled = True
2274
+ if record_started_at is None:
2275
+ record_started_at = time.time()
2276
+
2277
+ def stop_recording() -> None:
2278
+ nonlocal recording_enabled, record_started_at
2279
+ recording_enabled = False
2280
+ record_started_at = None
2281
+
2282
+ if recording_enabled:
2283
+ start_recording()
2284
+
1360
2285
  def refresh_view() -> None:
1361
- nonlocal last_update, state
1362
- rank_to_proc, pid_errors = collect_rank_pids(state)
2286
+ nonlocal last_update, state, record_session
2287
+ rank_to_proc, _pid_errors = collect_rank_pids(state)
1363
2288
  candidate = best_selector_from_procs(rank_to_proc.values())
1364
2289
  if candidate and selector_score(candidate) > selector_score(state.selector):
1365
2290
  state = dataclasses.replace(state, selector=candidate)
1366
- stacks, details_by_rank, stack_errors = collect_stacks(
2291
+ snapshots, _stack_errors = collect_stacks(
1367
2292
  state, rank_to_proc, pythonpath, show_threads, install_attempted
1368
2293
  )
2294
+ if recording_enabled and record_session is not None:
2295
+ record_session.record_if_changed(state, rank_to_proc, snapshots)
1369
2296
  stacks_text: Dict[int, Text] = {}
1370
- stack_lines_by_rank = {rank: extract_stack_lines(lines) for rank, lines in stacks.items()}
2297
+ stack_lines_by_rank = {
2298
+ rank: extract_stack_lines(snapshot.stack_lines)
2299
+ for rank, snapshot in snapshots.items()
2300
+ }
1371
2301
  prefix_len = common_prefix_length(stack_lines_by_rank)
1372
2302
  diff_index = None
1373
2303
  if any(stack_lines_by_rank.values()):
1374
- if prefix_len > 0:
1375
- diff_index = prefix_len - 1
1376
- else:
1377
- diff_index = 0
1378
- for rank, lines in stacks.items():
2304
+ diff_index = max(0, prefix_len - 1) if prefix_len > 0 else 0
2305
+ for rank, snapshot in snapshots.items():
2306
+ lines = snapshot.stack_lines
1379
2307
  marked = mark_diff_line(lines, diff_index) if diff_index is not None else lines
1380
2308
  stacks_text[rank] = style_lines(marked)
1381
- errors = pid_errors + stack_errors
2309
+ details_by_rank = {
2310
+ rank: snapshot.details for rank, snapshot in snapshots.items()
2311
+ }
1382
2312
  last_update = time.strftime("%H:%M:%S")
1383
2313
  width, height = shutil.get_terminal_size((120, 40))
1384
2314
  content_width = max(0, width - 4)
1385
- header, header_lines = build_header(state, last_update, errors, refresh, content_width)
2315
+ record_line = None
2316
+ if record_session is not None and recording_enabled:
2317
+ record_line = f"{record_session.log_path} | events {record_session.event_count} | {format_elapsed(record_started_at)}"
2318
+ record_line = shorten(record_line, max(10, content_width - 12))
2319
+ header, header_lines = build_live_header(
2320
+ state, last_update, refresh, record_line, content_width
2321
+ )
1386
2322
  header_height = header_lines + 2
1387
2323
  header_height = max(3, min(header_height, max(3, height - 1)))
1388
2324
  layout["header"].size = header_height
@@ -1412,26 +2348,405 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1412
2348
  refresh_view()
1413
2349
  next_refresh = now + refresh
1414
2350
 
1415
- if sys.stdin in select_with_timeout(0.1):
1416
- key = sys.stdin.read(1)
1417
- if key == "q":
1418
- return 0
1419
- if key == " ":
1420
- next_refresh = 0.0
1421
- if key == "t":
1422
- show_threads = not show_threads
1423
- next_refresh = 0.0
1424
- if key == "d":
1425
- show_details = not show_details
1426
- next_refresh = 0.0
2351
+ key = read_key(0.1)
2352
+ if key is None:
2353
+ continue
2354
+ if key == "q":
2355
+ return 0
2356
+ if key == " ":
2357
+ next_refresh = 0.0
2358
+ if key == "t":
2359
+ show_threads = not show_threads
2360
+ next_refresh = 0.0
2361
+ if key == "d":
2362
+ show_details = not show_details
2363
+ next_refresh = 0.0
2364
+ if key == "r":
2365
+ if recording_enabled:
2366
+ stop_recording()
2367
+ else:
2368
+ start_recording()
2369
+ next_refresh = 0.0
1427
2370
  except KeyboardInterrupt:
1428
2371
  return 0
1429
2372
  finally:
1430
2373
  termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
2374
+ if record_session is not None:
2375
+ record_session.close()
2376
+ if record_session.event_count > 0:
2377
+ print(f"Recording saved to: {record_session.log_path}")
1431
2378
 
1432
2379
  return 0
1433
2380
 
1434
2381
 
2382
+ def run_record_batch(args: argparse.Namespace) -> int:
2383
+ pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")
2384
+ state = detect_state(args)
2385
+ refresh = max(1, args.refresh)
2386
+ record_path = args.out or default_session_path()
2387
+ record_session = RecordSession(record_path, state, refresh, pythonpath)
2388
+ quiet = bool(args.quiet)
2389
+ install_attempted: set = set()
2390
+ start_time = time.time()
2391
+ last_change: Optional[float] = None
2392
+ last_heartbeat = start_time
2393
+ last_divergence_time = 0.0
2394
+ stop_reason = "completed"
2395
+
2396
+ target = state.selector.display or "python"
2397
+ target = shorten(target, 120)
2398
+ print(
2399
+ f"recording start | path={record_session.log_path} | ranks={len(state.ranks)} | "
2400
+ f"refresh={refresh}s | target={target}"
2401
+ )
2402
+
2403
+ try:
2404
+ while True:
2405
+ loop_start = time.time()
2406
+ if state.launcher == "prte":
2407
+ if not is_pid_alive(state.prte_pid):
2408
+ stop_reason = "prterun-exited"
2409
+ break
2410
+ else:
2411
+ if not is_slurm_job_alive(state.slurm_job_id):
2412
+ stop_reason = "slurm-job-exited"
2413
+ break
2414
+ rank_to_proc, _pid_errors = collect_rank_pids(state)
2415
+ snapshots, _stack_errors = collect_stacks(
2416
+ state, rank_to_proc, pythonpath, False, install_attempted
2417
+ )
2418
+ if record_session.record_if_changed(state, rank_to_proc, snapshots):
2419
+ last_change = time.time()
2420
+ divergence, common_len, max_len = compute_divergence_from_snapshots(state.ranks, snapshots)
2421
+ now = time.time()
2422
+ if not quiet and now - last_heartbeat >= HEARTBEAT_INTERVAL:
2423
+ last_change_age = "never"
2424
+ if last_change is not None:
2425
+ last_change_age = format_duration(int(now - last_change))
2426
+ elapsed = format_duration(int(now - start_time))
2427
+ print(
2428
+ f"heartbeat | events={record_session.event_count} | "
2429
+ f"last_change={last_change_age} | elapsed={elapsed}"
2430
+ )
2431
+ last_heartbeat = now
2432
+ if (
2433
+ not quiet
2434
+ and divergence >= DIVERGENCE_THRESHOLD
2435
+ and now - last_divergence_time >= DIVERGENCE_INTERVAL
2436
+ ):
2437
+ print(
2438
+ f"divergence | ratio={divergence:.2f} | common={common_len} | max={max_len}"
2439
+ )
2440
+ last_divergence_time = now
2441
+ elapsed = time.time() - loop_start
2442
+ sleep_for = refresh - elapsed
2443
+ if sleep_for > 0:
2444
+ time.sleep(sleep_for)
2445
+ except KeyboardInterrupt:
2446
+ stop_reason = "interrupted"
2447
+ finally:
2448
+ record_session.close()
2449
+ elapsed = format_duration(int(time.time() - start_time))
2450
+ print(
2451
+ f"recording stop | reason={stop_reason} | events={record_session.event_count} | "
2452
+ f"elapsed={elapsed} | path={record_session.log_path}"
2453
+ )
2454
+
2455
+ return 0
2456
+
2457
+
2458
+ def run_review(args: argparse.Namespace) -> int:
2459
+ metadata = load_session_metadata(args.path)
2460
+ ranks = [
2461
+ RankInfo(rank=int(item["rank"]), host=str(item["host"]))
2462
+ for item in metadata.get("ranks", [])
2463
+ if "rank" in item and "host" in item
2464
+ ]
2465
+ if not ranks:
2466
+ raise SystemExit("no ranks found in metadata")
2467
+ selector_payload = metadata.get("selector", {}) if isinstance(metadata.get("selector"), dict) else {}
2468
+ selector = ProgramSelector(
2469
+ module=selector_payload.get("module"),
2470
+ script=selector_payload.get("script"),
2471
+ display=selector_payload.get("display", ""),
2472
+ )
2473
+ state = State(
2474
+ launcher=str(metadata.get("launcher", "prte")),
2475
+ prte_pid=int(metadata.get("prte_pid", 0) or 0),
2476
+ slurm_job_id=metadata.get("slurm_job_id"),
2477
+ rankfile=str(metadata.get("rankfile", "")),
2478
+ ranks=ranks,
2479
+ selector=selector,
2480
+ )
2481
+ events = load_session_events(args.path)
2482
+ if not events:
2483
+ raise SystemExit("no events recorded")
2484
+
2485
+ console = Console()
2486
+ show_threads = False
2487
+ show_details = False
2488
+ levels = [TimelineLevel(0, len(events), selected=0)]
2489
+ max_stack_lens, divergence_ratios, _ = compute_event_metrics(
2490
+ events, ranks, show_threads
2491
+ )
2492
+
2493
+ def handle_sigint(_sig, _frame):
2494
+ raise KeyboardInterrupt
2495
+
2496
+ signal.signal(signal.SIGINT, handle_sigint)
2497
+
2498
+ fd = sys.stdin.fileno()
2499
+ old_settings = termios.tcgetattr(fd)
2500
+ tty.setcbreak(fd)
2501
+
2502
+ layout = Layout()
2503
+ layout.split_column(Layout(name="header", size=HEADER_HEIGHT), Layout(name="body"))
2504
+
2505
+ def refresh_view() -> None:
2506
+ width, height = shutil.get_terminal_size((120, 40))
2507
+ content_width = max(0, width - 4)
2508
+ timeline_lines = render_timeline_lines(levels, max_stack_lens, divergence_ratios, content_width)
2509
+ active_level = levels[-1]
2510
+ if not active_level.buckets:
2511
+ return
2512
+ current_index = active_level.buckets[active_level.selected][0]
2513
+ current_index = max(0, min(current_index, len(events) - 1))
2514
+ event = events[current_index]
2515
+ snapshots = event_snapshots_from_event(event, ranks, show_threads)
2516
+ rank_to_proc = rank_to_proc_from_event(event, ranks)
2517
+ stack_lines_by_rank = {
2518
+ rank: extract_stack_lines(snapshot.stack_lines)
2519
+ for rank, snapshot in snapshots.items()
2520
+ }
2521
+ prefix_len = common_prefix_length(stack_lines_by_rank)
2522
+ diff_index = None
2523
+ if any(stack_lines_by_rank.values()):
2524
+ diff_index = max(0, prefix_len - 1) if prefix_len > 0 else 0
2525
+ stacks_text: Dict[int, Text] = {}
2526
+ for rank, snapshot in snapshots.items():
2527
+ lines = snapshot.stack_lines
2528
+ marked = mark_diff_line(lines, diff_index) if diff_index is not None else lines
2529
+ stacks_text[rank] = style_lines(marked)
2530
+ details_by_rank = {
2531
+ rank: snapshot.details for rank, snapshot in snapshots.items()
2532
+ }
2533
+ event_time = iso_timestamp(event.timestamp)
2534
+ header, header_lines = build_review_header(
2535
+ state,
2536
+ current_index,
2537
+ len(events),
2538
+ event_time,
2539
+ timeline_lines,
2540
+ content_width,
2541
+ )
2542
+ header_height = header_lines + 2
2543
+ header_height = max(3, min(header_height, max(3, height - 1)))
2544
+ layout["header"].size = header_height
2545
+ body_height = max(1, height - header_height)
2546
+ total_columns = len(ranks) + (1 if show_details else 0)
2547
+ column_width = max(1, content_width // max(1, total_columns))
2548
+ inner_width = max(1, column_width - 4)
2549
+ details_text = (
2550
+ build_details_text(ranks, rank_to_proc, details_by_rank, inner_width)
2551
+ if show_details
2552
+ else None
2553
+ )
2554
+ layout["header"].update(
2555
+ Panel(header, padding=(0, 1), border_style=BORDER_STYLE)
2556
+ )
2557
+ layout["body"].update(
2558
+ render_columns(ranks, stacks_text, details_text, body_height, rank_to_proc)
2559
+ )
2560
+
2561
+ try:
2562
+ refresh_view()
2563
+ with Live(layout, console=console, refresh_per_second=10, screen=True):
2564
+ while True:
2565
+ key = read_key(0.1)
2566
+ if key is None:
2567
+ continue
2568
+ if key == "q":
2569
+ return 0
2570
+ if key == "t":
2571
+ show_threads = not show_threads
2572
+ max_stack_lens, divergence_ratios, _ = compute_event_metrics(
2573
+ events, ranks, show_threads
2574
+ )
2575
+ refresh_view()
2576
+ if key == "d":
2577
+ show_details = not show_details
2578
+ refresh_view()
2579
+ if key == "left":
2580
+ level = levels[-1]
2581
+ level.selected = max(0, level.selected - 1)
2582
+ refresh_view()
2583
+ if key == "right":
2584
+ level = levels[-1]
2585
+ level.selected = min(max(0, len(level.buckets) - 1), level.selected + 1)
2586
+ refresh_view()
2587
+ if key == "down":
2588
+ level = levels[-1]
2589
+ if not level.buckets:
2590
+ continue
2591
+ bucket = level.buckets[level.selected]
2592
+ if bucket[1] - bucket[0] <= 1:
2593
+ continue
2594
+ levels.append(TimelineLevel(bucket[0], bucket[1], selected=0))
2595
+ refresh_view()
2596
+ if key == "up":
2597
+ if len(levels) > 1:
2598
+ levels.pop()
2599
+ refresh_view()
2600
+ except KeyboardInterrupt:
2601
+ return 0
2602
+ finally:
2603
+ termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
2604
+
2605
+ return 0
2606
+
2607
+
2608
+ def run_summarize(args: argparse.Namespace) -> int:
2609
+ metadata = load_session_metadata(args.path)
2610
+ events = load_session_events(args.path)
2611
+ ranks = [
2612
+ RankInfo(rank=int(item["rank"]), host=str(item["host"]))
2613
+ for item in metadata.get("ranks", [])
2614
+ if "rank" in item and "host" in item
2615
+ ]
2616
+ if not ranks:
2617
+ raise SystemExit("no ranks found in metadata")
2618
+ if not events:
2619
+ raise SystemExit("no events recorded")
2620
+
2621
+ rank_order = [info.rank for info in ranks]
2622
+ signature_counts: Dict[Tuple[str, ...], int] = {}
2623
+ signature_examples: Dict[Tuple[str, ...], Dict[int, str]] = {}
2624
+ rank_change_counts: Dict[int, int] = {rank: 0 for rank in rank_order}
2625
+ previous_rank_signature: Dict[int, str] = {rank: "" for rank in rank_order}
2626
+ max_stack_lens, divergence_ratios, common_prefixes = compute_event_metrics(
2627
+ events, ranks, show_threads=False
2628
+ )
2629
+
2630
+ for event in events:
2631
+ per_rank_signature: Dict[int, str] = {}
2632
+ per_rank_top_frame: Dict[int, str] = {}
2633
+ for info in ranks:
2634
+ payload = event.ranks.get(info.rank, {})
2635
+ if payload.get("error"):
2636
+ signature = f"error:{payload.get('error')}"
2637
+ top_frame = signature
2638
+ else:
2639
+ output = payload.get("py_spy")
2640
+ if output:
2641
+ lines, _details = render_pyspy_output(str(output), show_threads=False)
2642
+ stack_lines = extract_stack_lines(lines)
2643
+ signature = hashlib.sha1(
2644
+ "\n".join(stack_lines).encode("utf-8", errors="ignore")
2645
+ ).hexdigest()
2646
+ top_frame = stack_lines[0].strip() if stack_lines else "empty"
2647
+ else:
2648
+ signature = "empty"
2649
+ top_frame = "empty"
2650
+ per_rank_signature[info.rank] = signature
2651
+ per_rank_top_frame[info.rank] = top_frame
2652
+
2653
+ for rank, signature in per_rank_signature.items():
2654
+ if previous_rank_signature.get(rank) != signature:
2655
+ rank_change_counts[rank] = rank_change_counts.get(rank, 0) + 1
2656
+ previous_rank_signature[rank] = signature
2657
+
2658
+ signature_key = tuple(per_rank_signature[rank] for rank in rank_order)
2659
+ signature_counts[signature_key] = signature_counts.get(signature_key, 0) + 1
2660
+ if signature_key not in signature_examples:
2661
+ signature_examples[signature_key] = per_rank_top_frame
2662
+
2663
+ sorted_signatures = sorted(
2664
+ signature_counts.items(), key=lambda item: item[1], reverse=True
2665
+ )
2666
+ top_signatures = sorted_signatures[: max(1, args.top)]
2667
+ total_events = len(events)
2668
+ start_time = iso_timestamp(events[0].timestamp)
2669
+ end_time = iso_timestamp(events[-1].timestamp)
2670
+
2671
+ if args.format == "json":
2672
+ payload = {
2673
+ "metadata": metadata,
2674
+ "event_count": total_events,
2675
+ "time_range": {"start": start_time, "end": end_time},
2676
+ "rank_change_counts": rank_change_counts,
2677
+ "top_signatures": [
2678
+ {
2679
+ "count": count,
2680
+ "ratio": count / float(total_events),
2681
+ "example_top_frames": signature_examples.get(signature_key, {}),
2682
+ }
2683
+ for signature_key, count in top_signatures
2684
+ ],
2685
+ "most_divergent": sorted(
2686
+ [
2687
+ {
2688
+ "index": idx,
2689
+ "timestamp": iso_timestamp(events[idx].timestamp),
2690
+ "divergence_ratio": divergence_ratios[idx],
2691
+ "common_prefix_len": common_prefixes[idx],
2692
+ "max_stack_len": max_stack_lens[idx],
2693
+ }
2694
+ for idx in range(total_events)
2695
+ ],
2696
+ key=lambda item: item["divergence_ratio"],
2697
+ reverse=True,
2698
+ )[:5],
2699
+ }
2700
+ print(json.dumps(payload, indent=2, sort_keys=True))
2701
+ return 0
2702
+
2703
+ print(f"Session: {args.path}")
2704
+ print(f"Events: {total_events} ({start_time} -> {end_time})")
2705
+ print(f"Ranks: {', '.join(str(rank) for rank in rank_order)}")
2706
+ print("")
2707
+ print("Top stack signatures:")
2708
+ for idx, (signature_key, count) in enumerate(top_signatures, start=1):
2709
+ ratio = count / float(total_events)
2710
+ print(f"{idx}. {count} events ({ratio:.1%})")
2711
+ example = signature_examples.get(signature_key, {})
2712
+ for rank in rank_order:
2713
+ frame = example.get(rank, "")
2714
+ frame = shorten(frame, 120)
2715
+ print(f" rank {rank}: {frame}")
2716
+ print("")
2717
+ print("Rank change counts:")
2718
+ for rank in rank_order:
2719
+ print(f" rank {rank}: {rank_change_counts.get(rank, 0)}")
2720
+ print("")
2721
+ print("Most divergent events:")
2722
+ divergent = sorted(
2723
+ range(total_events),
2724
+ key=lambda idx: divergence_ratios[idx],
2725
+ reverse=True,
2726
+ )[:5]
2727
+ for idx in divergent:
2728
+ print(
2729
+ f" #{idx + 1} @ {iso_timestamp(events[idx].timestamp)} | "
2730
+ f"ratio {divergence_ratios[idx]:.2f} | "
2731
+ f"common {common_prefixes[idx]} | "
2732
+ f"max {max_stack_lens[idx]}"
2733
+ )
2734
+ return 0
2735
+
2736
+
2737
+ def main(argv: Optional[Sequence[str]] = None) -> int:
2738
+ argv = list(argv) if argv is not None else sys.argv[1:]
2739
+ if argv and argv[0] in {"review", "summarize", "record"}:
2740
+ command = argv[0]
2741
+ sub_args = argv[1:]
2742
+ if command == "review":
2743
+ return run_review(parse_review_args(sub_args))
2744
+ if command == "record":
2745
+ return run_record_batch(parse_record_args(sub_args))
2746
+ return run_summarize(parse_summarize_args(sub_args))
2747
+ return run_live(parse_live_args(argv))
2748
+
2749
+
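The rewritten entry point dispatches on a leading subcommand (record, review, summarize) and otherwise falls back to the live view. Assuming mpiptop.py is importable as mpiptop, and noting that record and the live view need a running prterun or Slurm job while review and summarize need an existing recording, the argparse definitions above can be exercised roughly as follows; the job id and paths are placeholders:

    import mpiptop

    # Headless recording of a Slurm job, change-triggered events only.
    mpiptop.main(["record", "--slurm-job", "12345", "--out", "run1.jsonl", "--quiet"])

    # Interactive timeline playback of the recording.
    mpiptop.main(["review", "run1.jsonl"])

    # Machine-readable summary of the same session.
    mpiptop.main(["summarize", "run1.jsonl", "--format", "json", "--top", "3"])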
1435
2750
  def select_with_timeout(timeout: float):
1436
2751
  import select
1437
2752