mpiptop 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mpiptop
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: TUI for viewing MPI Python stacks across hosts
5
5
  Author: yieldthought
6
6
  License-Expression: MIT
@@ -48,6 +48,17 @@ mpiptop --rankfile /etc/mpirun/rankfile_01_02
48
48
  mpiptop --prterun-pid 12345
49
49
  mpiptop --refresh 5
50
50
  mpiptop --pythonpath /path/to/your/code
51
+ mpiptop record --out ./mpiptop-session-20260123-120000.jsonl
51
52
  ```
52
53
 
53
- Controls: `q` quit | `space` refresh | `t` threads | `d` details
54
+ Record/review (record is batch mode; use plain `mpiptop` for the TUI):
55
+ ```bash
56
+ mpiptop record
57
+ mpiptop record --quiet
58
+ mpiptop review ./mpiptop-session-20260123-120000.jsonl
59
+ mpiptop summarize ./mpiptop-session-20260123-120000.jsonl --format text
60
+ ```
61
+
62
+ Live controls: `q` quit | `space` refresh | `t` threads | `d` details | `r` record
63
+
64
+ Review controls: `q` quit | `left/right` move | `down` zoom | `up` zoom out | `t` threads | `d` details
@@ -0,0 +1,7 @@
1
+ mpiptop.py,sha256=D4h-jOyhYU4M0FzQ6JMX5gDBE8UVd4detm5JDFdTm4c,86492
2
+ mpiptop-0.2.0.dist-info/licenses/LICENSE,sha256=ChKmQ8qCXxdXRR_HIJECjIA5NLWlUTEJWh7Xkhm2wAA,1069
3
+ mpiptop-0.2.0.dist-info/METADATA,sha256=3vT5lrkqfuh6O2DL6xCm412w7opwQAAzJFdD6IGHs7g,1910
4
+ mpiptop-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
5
+ mpiptop-0.2.0.dist-info/entry_points.txt,sha256=RsGsr8GBLfUNpb432YWS5gz4MWfWdK9xJRr1SmdnLo8,41
6
+ mpiptop-0.2.0.dist-info/top_level.txt,sha256=c2Vdu6tTg0DEPUWD8Odyods7fXsPWMQ2kSvjdKiTClc,8
7
+ mpiptop-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
mpiptop.py CHANGED
@@ -6,6 +6,7 @@ from __future__ import annotations
6
6
  import argparse
7
7
  import colorsys
8
8
  import dataclasses
9
+ import datetime
9
10
  import hashlib
10
11
  import json
11
12
  import os
@@ -79,10 +80,40 @@ class ParsedPySpy:
79
80
  threads: List[ThreadBlock]
80
81
 
81
82
 
83
@dataclasses.dataclass(frozen=True)
class RankSnapshot:
    """Captured state of a single rank at one sampling point."""

    # Raw py-spy output text; None when nothing was captured for the rank.
    output: Optional[str]
    # Failure message for the rank; None when the capture succeeded.
    error: Optional[str]
    # Rendered lines to display (or a single line holding the error message).
    stack_lines: List[str]
    # Extra detail lines returned alongside the rendered stack.
    details: List[str]
89
+
90
+
91
@dataclasses.dataclass
class SessionEvent:
    """One recorded event as loaded back from a session log."""

    # Value of the event's "t" field (the recorder writes time.time() there).
    timestamp: float
    # Per-rank payload dictionaries keyed by integer rank id.
    ranks: Dict[int, Dict[str, object]]
95
+
96
+
97
@dataclasses.dataclass
class TimelineLevel:
    """One row of the review timeline, covering the event slice [start, end)."""

    # Index of the first event covered by this row.
    start: int
    # End index of the covered events; used as an exclusive slice bound.
    end: int
    # Index of the highlighted bucket within ``buckets``.
    selected: int = 0
    # (start, end) event ranges, one per rendered cell; rebuilt on every render.
    buckets: List[Tuple[int, int]] = dataclasses.field(default_factory=list)
103
+
104
+
82
105
  PUNCT_STYLE = "grey62"
83
106
  BORDER_STYLE = "grey62"
84
107
  KEY_STYLE = "#7ad7ff"
85
108
  HEADER_HEIGHT = 3
109
# Schema version stored in the "version" field of the session metadata record.
SESSION_VERSION = 1
# Log file name used when the recording path is a directory rather than a .jsonl file.
SESSION_LOG_FILE = "session.jsonl"
# Stand-alone metadata file looked up next to the log when loading a session.
SESSION_METADATA_FILE = "metadata.json"
# Stand-alone events file; preferred over the log when loading events.
SESSION_EVENTS_FILE = "events.jsonl"
# Glyphs for the timeline sparkline, ordered from the lowest bar to the tallest.
SPARKLINE_CHARS = "▁▂▃▄▅▆▇█"
# NOTE(review): the next three values are not used in this part of the file;
# presumably intervals in seconds and a 0..1 ratio -- confirm at the use sites.
HEARTBEAT_INTERVAL = 60
DIVERGENCE_THRESHOLD = 0.5
DIVERGENCE_INTERVAL = 60
86
117
  ENV_KEYS = (
87
118
  "PATH",
88
119
  "LD_LIBRARY_PATH",
@@ -203,6 +234,264 @@ print(json.dumps(results))
203
234
  """
204
235
 
205
236
 
237
def iso_timestamp(value: Optional[float] = None) -> str:
    """Format an epoch timestamp as a local-time ISO-8601 string.

    Args:
        value: Seconds since the epoch; the current time is used when None.

    Returns:
        The timestamp rendered with second precision and no UTC offset.
    """
    if value is None:
        value = time.time()
    moment = datetime.datetime.fromtimestamp(value)
    return moment.isoformat(timespec="seconds")
240
+
241
+
242
def default_session_path() -> str:
    """Return an absolute, timestamped ``.jsonl`` path in the current directory."""
    now = datetime.datetime.now()
    filename = "mpiptop-session-" + now.strftime("%Y%m%d-%H%M%S") + ".jsonl"
    return os.path.abspath(filename)
245
+
246
+
247
def normalize_session_path(path: str) -> Tuple[str, str]:
    """Split a session path into ``(base_dir, log_path)``.

    A path that ends in ``.jsonl`` or names an existing regular file is treated
    as the log file itself; anything else is treated as a session directory
    whose log is ``SESSION_LOG_FILE`` inside it.
    """
    # os.path.isfile() is already False for paths that do not exist.
    names_log_file = path.endswith(".jsonl") or os.path.isfile(path)
    if not names_log_file:
        return path, os.path.join(path, SESSION_LOG_FILE)
    parent = os.path.dirname(path)
    return (parent if parent else "."), path
252
+
253
+
254
def ensure_session_path(path: str) -> Tuple[str, str]:
    """Validate a recording target and create its directory when needed.

    Args:
        path: A ``.jsonl`` file path or a session directory.

    Returns:
        ``(base_dir, log_path)`` as produced by ``normalize_session_path``.

    Raises:
        SystemExit: If ``path`` is a non-empty directory that holds neither a
            session log nor a metadata file, or if it exists but is neither a
            file nor a directory.
    """
    base_dir, log_path = normalize_session_path(path)

    if not os.path.exists(path):
        # Fresh target: make sure the containing directory is there.
        os.makedirs(base_dir, exist_ok=True)
        return base_dir, log_path

    if os.path.isdir(path):
        if os.listdir(path):
            metadata_path = os.path.join(path, SESSION_METADATA_FILE)
            # A populated directory is only accepted if it already is a session.
            if not (os.path.exists(log_path) or os.path.exists(metadata_path)):
                raise SystemExit(f"record path exists and is not empty: {path}")
        return base_dir, log_path

    if os.path.isfile(path):
        return base_dir, log_path

    raise SystemExit(f"record path exists and is not a file or directory: {path}")
272
+
273
+
274
def write_session_metadata(log_path: str, state: State, refresh: int, pythonpath: str) -> None:
    """Write the metadata record as the first line of a new session log.

    Nothing is written when ``log_path`` already exists with content, so an
    existing session keeps its original metadata record.
    """
    record = {
        "type": "metadata",
        "data": {
            "version": SESSION_VERSION,
            "created_at": iso_timestamp(),
            "refresh": refresh,
            "rankfile": state.rankfile,
            "prte_pid": state.prte_pid,
            "selector": dataclasses.asdict(state.selector),
            "ranks": [dataclasses.asdict(rank) for rank in state.ranks],
            "pythonpath": pythonpath,
            "record_on_change": True,
        },
    }
    log_has_content = os.path.exists(log_path) and os.path.getsize(log_path) > 0
    if not log_has_content:
        with open(log_path, "a", encoding="utf-8") as handle:
            handle.write(json.dumps(record) + "\n")
290
+
291
+
292
def load_session_metadata(path: str) -> Dict[str, object]:
    """Load the metadata of a recorded session.

    A stand-alone ``SESSION_METADATA_FILE`` next to the log takes precedence.
    Otherwise the log is scanned for the first ``{"type": "metadata"}`` record,
    or for an untyped record that carries both ``version`` and ``ranks``.

    Args:
        path: Session ``.jsonl`` file or session directory.

    Returns:
        The metadata dictionary.

    Raises:
        SystemExit: If neither a metadata file nor a metadata record exists.
    """
    base_dir, log_path = normalize_session_path(path)
    metadata_path = os.path.join(base_dir, SESSION_METADATA_FILE)
    if os.path.exists(metadata_path):
        with open(metadata_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    if not os.path.exists(log_path):
        raise SystemExit(f"metadata not found in {path}")
    with open(log_path, "r", encoding="utf-8") as handle:
        for line in handle:
            raw = line.strip()
            if not raw:
                continue
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                # Skip unreadable lines (e.g. a truncated write) the same way the
                # recorder does when it re-reads its own log.
                continue
            if not isinstance(data, dict):
                continue
            if data.get("type") == "metadata":
                payload = data.get("data")
                if isinstance(payload, dict):
                    return payload
            if "version" in data and "ranks" in data:
                return data
    raise SystemExit(f"metadata not found in {log_path}")
313
+
314
+
315
def _event_payload_from_line(raw: bytes) -> Optional[Dict[str, object]]:
    """Decode one log line; return its event payload, or None if it is not an event."""
    text = raw.decode("utf-8", errors="ignore").strip()
    if not text:
        return None
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict) or data.get("type") == "metadata":
        return None
    if data.get("type") == "event":
        payload = data.get("data")
        if isinstance(payload, dict):
            return payload
    # Untyped records are returned as-is.
    return data


def read_last_event(path: str) -> Optional[Dict[str, object]]:
    """Return the payload of the last readable event in a session log.

    The file is scanned backwards in 4096-byte steps. Only complete lines are
    parsed, so an event longer than one step is still found, and a truncated
    trailing line is skipped in favour of the event before it.

    Args:
        path: Path to the JSONL session log.

    Returns:
        The event dictionary, or None when the file is missing, empty, or
        contains no event record.
    """
    if not os.path.exists(path):
        return None
    with open(path, "rb") as handle:
        handle.seek(0, os.SEEK_END)
        pos = handle.tell()
        pending = b""  # bytes read so far that are not yet known to start a line
        while pos > 0:
            step = min(4096, pos)
            pos -= step
            handle.seek(pos)
            pending = handle.read(step) + pending
            if pos > 0:
                # Everything before the first newline may be the tail of a line
                # that starts further back; keep it for the next iteration.
                head, newline, complete = pending.partition(b"\n")
                if not newline:
                    continue
                pending = head
            else:
                complete, pending = pending, b""
            for raw in reversed(complete.split(b"\n")):
                event = _event_payload_from_line(raw)
                if event is not None:
                    return event
    return None
346
+
347
+
348
def load_session_events(path: str) -> List[SessionEvent]:
    """Load every event of a recorded session, in file order.

    A stand-alone ``SESSION_EVENTS_FILE`` next to the log is preferred over the
    log itself. Metadata records, blank lines, lines that are not valid JSON
    (such as a final line truncated by an interrupted write) and records whose
    ``ranks`` field is not a mapping are skipped instead of aborting the load.

    Args:
        path: Session ``.jsonl`` file or session directory.

    Returns:
        The parsed events; rank keys are converted to ``int``.

    Raises:
        SystemExit: If neither an events file nor a log file exists.
    """
    base_dir, log_path = normalize_session_path(path)
    events_path = os.path.join(base_dir, SESSION_EVENTS_FILE)
    if os.path.exists(events_path):
        path_to_read = events_path
    elif os.path.exists(log_path):
        path_to_read = log_path
    else:
        raise SystemExit(f"events not found in {path}")

    events: List[SessionEvent] = []
    with open(path_to_read, "r", encoding="utf-8") as handle:
        for line in handle:
            raw = line.strip()
            if not raw:
                continue
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if not isinstance(data, dict) or data.get("type") == "metadata":
                continue
            if data.get("type") == "event":
                data = data.get("data", {})
                if not isinstance(data, dict):
                    continue
            ranks_raw = data.get("ranks", {})
            if not isinstance(ranks_raw, dict):
                # Not an event shape (untyped metadata stores ranks as a list).
                continue
            try:
                timestamp = float(data.get("t", 0.0))
            except (TypeError, ValueError):
                timestamp = 0.0
            ranks: Dict[int, Dict[str, object]] = {}
            for key, value in ranks_raw.items():
                try:
                    rank_id = int(key)
                except (TypeError, ValueError):
                    continue
                # Consumers call .get() on each payload, so keep mappings only.
                if isinstance(value, dict):
                    ranks[rank_id] = value
            events.append(SessionEvent(timestamp=timestamp, ranks=ranks))
    return events
378
+
379
+
380
def signature_from_snapshot(snapshot: Optional[RankSnapshot]) -> str:
    """Return a short string identifying the state captured in ``snapshot``.

    The result is ``"error:<message>"`` for a failed capture, the SHA-1 hex
    digest of the output for a successful one, and ``"missing"`` when there is
    no snapshot or no output.
    """
    if snapshot is not None:
        if snapshot.error:
            return f"error:{snapshot.error}"
        if snapshot.output is not None:
            encoded = snapshot.output.encode("utf-8", errors="ignore")
            return hashlib.sha1(encoded).hexdigest()
    return "missing"
389
+
390
+
391
def snapshot_signature(ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]) -> Dict[int, str]:
    """Map every rank id in ``ranks`` to the signature of its snapshot.

    Ranks without an entry in ``snapshots`` get the signature of ``None``.
    """
    return {
        info.rank: signature_from_snapshot(snapshots.get(info.rank))
        for info in ranks
    }
396
+
397
+
398
def signature_from_event(event: Dict[str, object]) -> Optional[Dict[int, str]]:
    """Rebuild per-rank signatures from a recorded event payload.

    The result is meant to compare equal to what ``snapshot_signature`` yields
    for the snapshots that produced the event, so a resumed recording does not
    write a duplicate of the last event. For that reason:

    * an ``"error": "No data"`` entry maps to ``"missing"``, because the
      recorder writes that placeholder for ranks whose live signature is
      ``"missing"``;
    * any ``py_spy`` value other than None is hashed, including the empty
      string, mirroring how live output is hashed.

    Args:
        event: Event payload containing a ``ranks`` mapping.

    Returns:
        Signatures keyed by integer rank id, or None when ``ranks`` is not a
        mapping. Keys that are not integers are ignored.
    """
    ranks = event.get("ranks", {})
    if not isinstance(ranks, dict):
        return None
    signature: Dict[int, str] = {}
    for key, payload in ranks.items():
        try:
            rank_id = int(key)
        except (TypeError, ValueError):
            continue
        if not isinstance(payload, dict):
            signature[rank_id] = "missing"
            continue
        error = payload.get("error")
        py_spy = payload.get("py_spy")
        if error == "No data":
            signature[rank_id] = "missing"
        elif error:
            signature[rank_id] = f"error:{error}"
        elif py_spy is not None:
            signature[rank_id] = hashlib.sha1(
                str(py_spy).encode("utf-8", errors="ignore")
            ).hexdigest()
        else:
            signature[rank_id] = "missing"
    return signature
421
+
422
+
423
class RecordSession:
    """Appends rank snapshots to a JSONL session log whenever they change."""

    def __init__(self, path: str, state: State, refresh: int, pythonpath: str):
        """Open (or resume) the session log at ``path``.

        When the log already holds events, the last one seeds the change
        detection and the existing events are counted.
        """
        base_dir, log_path = ensure_session_path(path)
        self.base_dir = base_dir
        self.log_path = log_path
        write_session_metadata(log_path, state, refresh, pythonpath)
        self.handle = open(log_path, "a", encoding="utf-8")
        self.event_count = 0
        self.last_signature: Optional[Dict[int, str]] = None
        previous = read_last_event(log_path)
        if previous:
            self.last_signature = signature_from_event(previous)
            self.event_count = self._count_events()

    def _count_events(self) -> int:
        """Count the parseable, non-metadata records currently in the log."""
        if not os.path.exists(self.log_path):
            return 0
        total = 0
        with open(self.log_path, "r", encoding="utf-8") as handle:
            for text in (line.strip() for line in handle):
                if not text:
                    continue
                try:
                    record = json.loads(text)
                except json.JSONDecodeError:
                    continue
                is_metadata = isinstance(record, dict) and record.get("type") == "metadata"
                if not is_metadata:
                    total += 1
        return total

    def record_if_changed(
        self,
        state: State,
        rank_to_proc: Dict[int, RankProcess],
        snapshots: Dict[int, RankSnapshot],
    ) -> bool:
        """Append an event unless the snapshots match the previous event.

        Returns:
            True when an event was written, False when nothing changed.
        """
        signature = snapshot_signature(state.ranks, snapshots)
        unchanged = self.last_signature is not None and signature == self.last_signature
        if unchanged:
            return False

        captured_at = time.time()
        ranks_payload: Dict[str, object] = {}
        for info in state.ranks:
            entry: Dict[str, object] = {"host": info.host}
            proc = rank_to_proc.get(info.rank)
            if proc is not None:
                entry.update(pid=proc.pid, cmdline=proc.cmdline, rss_kb=proc.rss_kb)
            snapshot = snapshots.get(info.rank)
            if snapshot is not None and snapshot.error:
                entry["error"] = snapshot.error
            elif snapshot is not None and snapshot.output is not None:
                entry["py_spy"] = snapshot.output
            else:
                # No snapshot at all, or a snapshot without output.
                entry["error"] = "No data"
            ranks_payload[str(info.rank)] = entry

        record = {"type": "event", "data": {"t": captured_at, "ranks": ranks_payload}}
        self.handle.write(json.dumps(record) + "\n")
        # Flush per event so the log is readable while recording continues.
        self.handle.flush()
        self.last_signature = signature
        self.event_count += 1
        return True

    def close(self) -> None:
        """Close the log handle, ignoring any error raised while closing."""
        try:
            self.handle.close()
        except Exception:
            pass
494
+
206
495
  def read_ps() -> List[Proc]:
207
496
  result = subprocess.run(
208
497
  ["ps", "-eo", "pid=,ppid=,args="],
@@ -349,6 +638,28 @@ def parse_python_selector(args: str) -> ProgramSelector:
349
638
  return ProgramSelector(module=module, script=script, display=display)
350
639
 
351
640
 
641
def selector_score(selector: ProgramSelector) -> Tuple[int, int, int, int]:
    """Rank how informative a program selector is; larger tuples compare better.

    The tuple is ``(has script, has module, mentions a Python target, length of
    the display text)``. A selector without display text scores all zeros.
    """
    shown = selector.display
    if not shown:
        return (0, 0, 0, 0)
    # Pad with spaces so "-m" is also found at either end of the text.
    mentions_target = ".py" in shown or " -m " in f" {shown} "
    return (
        int(bool(selector.script)),
        int(bool(selector.module)),
        int(mentions_target),
        len(shown),
    )
649
+
650
+
651
def best_selector_from_procs(procs: Iterable[RankProcess]) -> Optional[ProgramSelector]:
    """Pick the highest-scoring selector parsed from the given command lines.

    Returns:
        The best selector, or None when no process scores above an empty
        selector. On a tie the earliest process wins.
    """
    winner: Optional[ProgramSelector] = None
    baseline = ProgramSelector(module=None, script=None, display="")
    top_score = selector_score(baseline)
    for proc in procs:
        contender = parse_python_selector(proc.cmdline)
        contender_score = selector_score(contender)
        if contender_score > top_score:
            winner, top_score = contender, contender_score
    return winner
661
+
662
+
352
663
  def extract_python_exe(cmdline: str) -> Optional[str]:
353
664
  if not cmdline:
354
665
  return None
@@ -1110,11 +1421,11 @@ def build_header(
1110
1421
  state: State, last_update: str, errors: List[str], refresh: int, width: int
1111
1422
  ) -> Tuple[Text, int]:
1112
1423
  program_lines = wrap_program_lines(state.selector, width)
1113
- if program_lines:
1114
- last_line = program_lines[-1]
1115
- last_line.append(f" | ranks: {len(state.ranks)} | rankfile: {state.rankfile}")
1116
- else:
1117
- program_lines = [Text(f"python | ranks: {len(state.ranks)} | rankfile: {state.rankfile}")]
1424
+ if not program_lines:
1425
+ program_lines = [Text("python")]
1426
+ for line in program_lines:
1427
+ line.no_wrap = True
1428
+ line.overflow = "crop"
1118
1429
 
1119
1430
  controls_plain = "q quit | space refresh | t threads | d details"
1120
1431
  padding = max(0, width - len(controls_plain))
@@ -1136,6 +1447,8 @@ def build_header(
1136
1447
  text.append_text(line)
1137
1448
  text.append("\n")
1138
1449
  text.append_text(line2)
1450
+ text.no_wrap = True
1451
+ text.overflow = "crop"
1139
1452
  return text, len(program_lines) + 1
1140
1453
 
1141
1454
 
@@ -1227,6 +1540,338 @@ def build_details_text(
1227
1540
  return output
1228
1541
 
1229
1542
 
1543
def format_elapsed(start: Optional[float]) -> str:
    """Format the time elapsed since ``start`` (epoch seconds).

    Returns ``"0:00"`` when ``start`` is None; a start in the future is
    treated as zero elapsed seconds.
    """
    if start is None:
        return "0:00"
    seconds_since = int(time.time() - start)
    return format_duration(seconds_since if seconds_since > 0 else 0)
1548
+
1549
+
1550
def format_duration(elapsed: int) -> str:
    """Render whole seconds as ``M:SS``, or ``H:MM:SS`` from one hour upwards."""
    total_minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(total_minutes, 60)
    if not hours:
        return f"{minutes}:{seconds:02d}"
    return f"{hours}:{minutes:02d}:{seconds:02d}"
1557
+
1558
+
1559
def build_live_header(
    state: State,
    last_update: str,
    refresh: int,
    record_line: Optional[str],
    width: int,
) -> Tuple[Text, int]:
    """Build the header shown in the live view.

    The header consists of the wrapped program line(s), an optional ``REC``
    line, and a right-aligned controls line.

    Args:
        state: Current state; only ``state.selector`` is read here.
        last_update: Accepted for interface compatibility; not used here.
        refresh: Accepted for interface compatibility; not used here.
        record_line: Text shown after the ``REC`` marker; falsy to omit the line.
        width: Available width in characters.

    Returns:
        ``(text, line_count)`` where ``line_count`` is the number of header lines.
    """
    program_lines = wrap_program_lines(state.selector, width)
    if not program_lines:
        program_lines = [Text("python")]
    for line in program_lines:
        line.no_wrap = True
        line.overflow = "crop"

    record_text = None
    if record_line:
        record_text = Text()
        record_text.append("REC", style="bold red")
        record_text.append(" recording: ")
        record_text.append(record_line)
        record_text.truncate(width)
        record_text.no_wrap = True
        record_text.overflow = "crop"

    separator = " | "
    controls_plain = "q quit | space refresh | t threads | d details | r record"
    padding = max(0, width - len(controls_plain))
    controls_line = Text(" " * padding + controls_plain)
    # Highlight the key at the start of each entry. Searching the whole string
    # for "t", "d" or "r" would hit letters inside "quit", "threads" and
    # "refresh" instead of the key itself.
    offset = padding
    for segment in controls_plain.split(separator):
        key = segment.split(" ", 1)[0]
        controls_line.stylize(KEY_STYLE, offset, offset + len(key))
        offset += len(segment) + len(separator)
    controls_line.truncate(width)
    controls_line.no_wrap = True
    controls_line.overflow = "crop"

    text = Text()
    for idx, line in enumerate(program_lines):
        if idx:
            text.append("\n")
        text.append_text(line)
    text.append("\n")
    if record_text is not None:
        text.append_text(record_text)
        text.append("\n")
    text.append_text(controls_line)
    text.no_wrap = True
    text.overflow = "crop"
    extra_lines = 2 if record_text is not None else 1
    return text, len(program_lines) + extra_lines
1608
+
1609
+
1610
def build_review_header(
    state: State,
    event_index: int,
    event_total: int,
    event_time: str,
    timeline_lines: List[Text],
    width: int,
) -> Tuple[Text, int]:
    """Build the header shown in review mode.

    The header consists of the wrapped program line(s), a status line
    (``review <n>/<total> | <time>``), the timeline rows, and a right-aligned
    controls line.

    Args:
        state: Session state; only ``state.selector`` is read here.
        event_index: Zero-based index of the event being shown.
        event_total: Total number of events.
        event_time: Preformatted time of the current event.
        timeline_lines: Already rendered timeline rows.
        width: Available width in characters.

    Returns:
        ``(text, line_count)`` where ``line_count`` is the number of header lines.
    """
    program_lines = wrap_program_lines(state.selector, width)
    if not program_lines:
        program_lines = [Text("python")]
    status_line = Text(
        f"review {event_index + 1}/{event_total} | {event_time}"
    )
    status_line.truncate(width)

    separator = " | "
    controls_plain = "q quit | left/right move | down zoom | up zoom out | t threads | d details"
    padding = max(0, width - len(controls_plain))
    controls_line = Text(" " * padding + controls_plain)
    # Highlight the key at the start of each entry. Searching the whole string
    # for "t" or "d" would hit letters inside "quit" and "down" instead of the
    # key itself.
    offset = padding
    for segment in controls_plain.split(separator):
        key = segment.split(" ", 1)[0]
        controls_line.stylize(KEY_STYLE, offset, offset + len(key))
        offset += len(segment) + len(separator)
    controls_line.truncate(width)
    controls_line.no_wrap = True
    controls_line.overflow = "crop"

    text = Text()
    for idx, line in enumerate(program_lines):
        if idx:
            text.append("\n")
        text.append_text(line)
    text.append("\n")
    text.append_text(status_line)
    for line in timeline_lines:
        text.append("\n")
        text.append_text(line)
    text.append("\n")
    text.append_text(controls_line)
    text.no_wrap = True
    text.overflow = "crop"
    return text, len(program_lines) + 1 + len(timeline_lines) + 1
1652
+
1653
+
1654
def build_buckets(start: int, end: int, width: int) -> List[Tuple[int, int]]:
    """Split ``[start, end)`` into at most ``width`` contiguous half-open ranges.

    Range sizes differ by at most one, with the larger ranges first. An empty
    or inverted interval yields no buckets; a width below one yields a single
    bucket.
    """
    total = end - start
    if total <= 0:
        return []
    bucket_count = max(1, min(width, total))
    base, extra = divmod(total, bucket_count)
    buckets: List[Tuple[int, int]] = []
    lower = start
    for position in range(bucket_count):
        # The first ``extra`` buckets absorb the remainder, one item each.
        upper = lower + base + (1 if position < extra else 0)
        buckets.append((lower, upper))
        lower = upper
    return buckets
1668
+
1669
+
1670
def divergence_color(ratio: float) -> str:
    """Map a divergence ratio to a hex colour between grey and orange.

    ``ratio`` is clamped to ``[0, 1]``; 0 gives ``#aaaaaa`` and 1 gives
    ``#ff7a00``. The blend follows ``ratio ** 0.7`` rather than a straight line.
    """
    clamped = min(1.0, max(0.0, ratio))
    intensity = clamped ** 0.7
    cool = (170, 170, 170)
    warm = (255, 122, 0)
    channels = [
        int(low + (high - low) * intensity)
        for low, high in zip(cool, warm)
    ]
    return "#" + "".join(f"{channel:02x}" for channel in channels)
1679
+
1680
+
1681
def compute_event_metrics(
    events: List[SessionEvent],
    ranks: List[RankInfo],
    show_threads: bool,
) -> Tuple[List[int], List[float], List[int]]:
    """Compute per-event stack statistics for the timeline.

    Returns:
        Three parallel lists with one entry per event: the deepest stack length
        over all ranks, the divergence ratio (``1 - common prefix / deepest``,
        or 0.0 when every stack is empty), and the common prefix length.
    """
    max_stack_lens: List[int] = []
    divergence_ratios: List[float] = []
    common_prefixes: List[int] = []
    for event in events:
        stacks_by_rank: Dict[int, List[str]] = {}
        for info in ranks:
            payload = event.ranks.get(info.rank, {})
            # A rank that reported an error contributes an empty stack.
            output = None if payload.get("error") else payload.get("py_spy")
            if output:
                lines, _details = render_pyspy_output(str(output), show_threads)
                stacks_by_rank[info.rank] = extract_stack_lines(lines)
            else:
                stacks_by_rank[info.rank] = []
        deepest = max(map(len, stacks_by_rank.values()), default=0)
        shared = common_prefix_length(stacks_by_rank)
        if deepest:
            ratio = 1.0 - float(shared) / float(deepest)
        else:
            ratio = 0.0
        max_stack_lens.append(deepest)
        divergence_ratios.append(ratio)
        common_prefixes.append(shared)
    return max_stack_lens, divergence_ratios, common_prefixes
1710
+
1711
+
1712
def render_timeline_lines(
    levels: List[TimelineLevel],
    max_stack_lens: List[int],
    divergence_ratios: List[float],
    width: int,
) -> List[Text]:
    """Render one sparkline row per timeline level.

    Each level's ``buckets`` are rebuilt for ``width`` and its ``selected``
    index is clamped to them. Bar height encodes the deepest stack in a bucket
    (relative to the tallest bucket of that row), colour encodes the largest
    divergence ratio, and the selected bucket is underlined (bold on the last
    level).
    """
    rendered: List[Text] = []
    for level_index, level in enumerate(levels):
        level.buckets = build_buckets(level.start, level.end, width)
        if level.buckets:
            level.selected = max(0, min(level.selected, len(level.buckets) - 1))

        stats: List[Tuple[int, float]] = [
            (
                max(max_stack_lens[lo:hi], default=0),
                max(divergence_ratios[lo:hi], default=0.0),
            )
            for lo, hi in level.buckets
        ]
        tallest = max((height for height, _ in stats), default=1)
        if tallest <= 0:
            tallest = 1

        emphasis = "bold underline" if level_index == len(levels) - 1 else "underline"
        top_glyph = len(SPARKLINE_CHARS) - 1
        row = Text()
        for idx, (height, ratio) in enumerate(stats):
            normalized = float(height) / float(tallest)
            glyph_index = int(round(normalized * top_glyph))
            glyph_index = max(0, min(glyph_index, top_glyph))
            style = divergence_color(ratio)
            if idx == level.selected:
                style = f"{style} {emphasis}"
            row.append(SPARKLINE_CHARS[glyph_index], style=style)
        row.no_wrap = True
        row.overflow = "crop"
        rendered.append(row)
    return rendered
1750
+
1751
+
1752
def event_snapshots_from_event(
    event: SessionEvent,
    ranks: List[RankInfo],
    show_threads: bool,
) -> Dict[int, RankSnapshot]:
    """Rebuild a snapshot for every rank from a recorded event.

    Ranks with no payload or no ``py_spy`` text become ``"No data"``
    snapshots; ranks with a recorded error become error snapshots carrying
    that message; all others are rendered again from the stored output.
    """

    def failed(message):
        # Error snapshots show the message itself as their only stack line.
        return RankSnapshot(output=None, error=message, stack_lines=[message], details=[])

    snapshots: Dict[int, RankSnapshot] = {}
    for info in ranks:
        payload = event.ranks.get(info.rank)
        if not payload:
            snapshots[info.rank] = failed("No data")
            continue
        error = payload.get("error")
        if error:
            snapshots[info.rank] = failed(str(error))
            continue
        output = payload.get("py_spy")
        if not output:
            snapshots[info.rank] = failed("No data")
            continue
        text = str(output)
        lines, details = render_pyspy_output(text, show_threads)
        snapshots[info.rank] = RankSnapshot(
            output=text,
            error=None,
            stack_lines=lines,
            details=details,
        )
    return snapshots
1793
+
1794
+
1795
def rank_to_proc_from_event(
    event: SessionEvent,
    ranks: List[RankInfo],
) -> Dict[int, RankProcess]:
    """Rebuild process records for the ranks of a recorded event.

    Ranks are left out when their payload is missing or lacks a ``pid`` or
    ``cmdline``, or when the pid is not an integer. An ``rss_kb`` that cannot
    be converted becomes None. ``python_exe`` and ``env`` are not recorded and
    are set to None and ``{}``.
    """
    rank_to_proc: Dict[int, RankProcess] = {}
    for info in ranks:
        payload = event.ranks.get(info.rank)
        if not payload:
            continue
        pid, cmdline = payload.get("pid"), payload.get("cmdline")
        if pid is None or cmdline is None:
            continue
        try:
            pid_value = int(pid)
        except (TypeError, ValueError):
            continue
        rss_kb = payload.get("rss_kb")
        try:
            rss_value = None if rss_kb is None else int(rss_kb)
        except (TypeError, ValueError):
            rss_value = None
        rank_to_proc[info.rank] = RankProcess(
            pid=pid_value,
            cmdline=str(cmdline),
            rss_kb=rss_value,
            python_exe=None,
            env={},
        )
    return rank_to_proc
1827
+
1828
+
1829
def compute_divergence_from_snapshots(
    ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]
) -> Tuple[float, int, int]:
    """Measure how far the ranks' stacks diverge from each other.

    Returns:
        ``(divergence, common_len, max_len)`` where ``divergence`` is
        ``1 - common_len / max_len``, or 0.0 when every stack is empty.
    """
    stack_lines_by_rank = {}
    for info in ranks:
        if info.rank in snapshots:
            rendered = snapshots[info.rank].stack_lines
        else:
            # Ranks without a snapshot count as an empty "No data" stack.
            rendered = RankSnapshot(None, "No data", [], []).stack_lines
        stack_lines_by_rank[info.rank] = extract_stack_lines(rendered)

    max_len = max(map(len, stack_lines_by_rank.values()), default=0)
    common_len = common_prefix_length(stack_lines_by_rank)
    if max_len:
        divergence = 1.0 - float(common_len) / float(max_len)
    else:
        divergence = 0.0
    return divergence, common_len, max_len
1841
+
1842
+
1843
def read_key(timeout: float) -> Optional[str]:
    """Read one key press from stdin, waiting at most ``timeout`` seconds.

    Returns:
        The character read; ``"up"``, ``"down"``, ``"right"`` or ``"left"``
        for the matching arrow-key escape sequence; None when no input
        arrived or the escape sequence is not one of those four.
    """
    if sys.stdin not in select_with_timeout(timeout):
        return None
    first = sys.stdin.read(1)
    if first != "\x1b":
        return first
    # An arrow key sends ESC plus two more characters; collect what is ready.
    sequence = first
    for _ in range(2):
        if sys.stdin in select_with_timeout(0.01):
            sequence += sys.stdin.read(1)
    arrows = {
        "\x1b[A": "up",
        "\x1b[B": "down",
        "\x1b[C": "right",
        "\x1b[D": "left",
    }
    return arrows.get(sequence)
1862
+
1863
+
1864
def is_pid_alive(pid: int) -> bool:
    """Report whether a process with the given pid currently exists.

    Non-positive pids are never alive. A permission error from the probe
    still counts as alive.
    """
    if pid <= 0:
        return False
    try:
        # Signal 0 only checks that the pid can be signalled; nothing is sent.
        os.kill(pid, 0)
    except PermissionError:
        exists = True
    except ProcessLookupError:
        exists = False
    else:
        exists = True
    return exists
1874
+
1230
1875
  def detect_state(args: argparse.Namespace) -> State:
1231
1876
  procs = read_ps()
1232
1877
  prte = find_prterun(procs, args.prterun_pid)
@@ -1278,29 +1923,40 @@ def collect_stacks(
1278
1923
  pythonpath: str,
1279
1924
  show_threads: bool,
1280
1925
  install_attempted: set,
1281
- ) -> Tuple[Dict[int, List[str]], Dict[int, List[str]], List[str]]:
1282
- stacks: Dict[int, List[str]] = {}
1283
- details_by_rank: Dict[int, List[str]] = {}
1926
+ ) -> Tuple[Dict[int, RankSnapshot], List[str]]:
1927
+ snapshots: Dict[int, RankSnapshot] = {}
1284
1928
  errors: List[str] = []
1285
1929
  for entry in state.ranks:
1286
1930
  proc = rank_to_proc.get(entry.rank)
1287
1931
  if proc is None:
1288
- stacks[entry.rank] = ["No process"]
1289
- details_by_rank[entry.rank] = []
1932
+ snapshots[entry.rank] = RankSnapshot(
1933
+ output=None,
1934
+ error="No process",
1935
+ stack_lines=["No process"],
1936
+ details=[],
1937
+ )
1290
1938
  continue
1291
1939
  output, error = run_py_spy(entry.host, proc, pythonpath, install_attempted)
1292
1940
  if error:
1293
1941
  errors.append(error)
1294
- stacks[entry.rank] = [error]
1295
- details_by_rank[entry.rank] = []
1942
+ snapshots[entry.rank] = RankSnapshot(
1943
+ output=None,
1944
+ error=error,
1945
+ stack_lines=[error],
1946
+ details=[],
1947
+ )
1296
1948
  continue
1297
1949
  lines, details = render_pyspy_output(output or "", show_threads)
1298
- stacks[entry.rank] = lines
1299
- details_by_rank[entry.rank] = details
1300
- return stacks, details_by_rank, errors
1950
+ snapshots[entry.rank] = RankSnapshot(
1951
+ output=output,
1952
+ error=None,
1953
+ stack_lines=lines,
1954
+ details=details,
1955
+ )
1956
+ return snapshots, errors
1301
1957
 
1302
1958
 
1303
- def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
1959
+ def parse_live_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
1304
1960
  parser = argparse.ArgumentParser(description="Show MPI Python stacks across hosts.")
1305
1961
  parser.add_argument("--rankfile", help="Override rankfile path")
1306
1962
  parser.add_argument("--prterun-pid", type=int, help="PID of prterun/mpirun")
@@ -1309,11 +1965,61 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
1309
1965
  "--pythonpath",
1310
1966
  help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
1311
1967
  )
1968
+ parser.add_argument(
1969
+ "--out",
1970
+ help="Output path for recordings (.jsonl file or directory)",
1971
+ )
1312
1972
  return parser.parse_args(argv)
1313
1973
 
1314
1974
 
1315
- def main(argv: Optional[Sequence[str]] = None) -> int:
1316
- args = parse_args(argv)
1975
def parse_review_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the arguments of the ``review`` subcommand.

    Args:
        argv: Argument list; ``sys.argv[1:]`` is used when None.
    """
    review_parser = argparse.ArgumentParser(
        description="Review a recorded mpiptop session.",
    )
    review_parser.add_argument(
        "path",
        help="Path to a recorded session (.jsonl file or directory)",
    )
    return review_parser.parse_args(argv)
1979
+
1980
+
1981
def parse_summarize_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the arguments of the ``summarize`` subcommand.

    Args:
        argv: Argument list; ``sys.argv[1:]`` is used when None.
    """
    summarize_parser = argparse.ArgumentParser(
        description="Summarize a recorded mpiptop session.",
    )
    add = summarize_parser.add_argument
    add("path", help="Path to a recorded session (.jsonl file or directory)")
    add("--format", choices=["text", "json"], default="text", help="Output format")
    add("--top", type=int, default=5, help="Top signatures to report")
    return summarize_parser.parse_args(argv)
1997
+
1998
+
1999
def parse_record_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the arguments of the ``record`` subcommand.

    The returned namespace always has ``record`` set to True.

    Args:
        argv: Argument list; ``sys.argv[1:]`` is used when None.
    """
    record_parser = argparse.ArgumentParser(description="Record an mpiptop session.")
    add = record_parser.add_argument
    add("--rankfile", help="Override rankfile path")
    add("--prterun-pid", type=int, help="PID of prterun/mpirun")
    add("--refresh", type=int, default=10, help="Refresh interval (seconds)")
    add(
        "--pythonpath",
        help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
    )
    add("--out", help="Output path for recordings (.jsonl file or directory)")
    add("--quiet", action="store_true", help="Only print start/stop lines")
    namespace = record_parser.parse_args(argv)
    # Not a command-line option: marks the namespace as a recording run.
    namespace.record = True
    return namespace
2020
+
2021
+
2022
+ def run_live(args: argparse.Namespace) -> int:
1317
2023
  pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")
1318
2024
 
1319
2025
  state = detect_state(args)
@@ -1322,6 +2028,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1322
2028
  show_threads = False
1323
2029
  show_details = False
1324
2030
  install_attempted: set = set()
2031
+ record_session: Optional[RecordSession] = None
2032
+ recording_enabled = bool(getattr(args, "record", False))
2033
+ record_started_at: Optional[float] = None
2034
+ record_path = args.out
1325
2035
 
1326
2036
  def handle_sigint(_sig, _frame):
1327
2037
  raise KeyboardInterrupt
@@ -1338,29 +2048,60 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1338
2048
  last_update = "never"
1339
2049
  next_refresh = 0.0
1340
2050
 
2051
+ def start_recording() -> None:
2052
+ nonlocal record_session, recording_enabled, record_started_at, record_path
2053
+ if record_session is None:
2054
+ record_path = record_path or default_session_path()
2055
+ record_session = RecordSession(record_path, state, refresh, pythonpath)
2056
+ recording_enabled = True
2057
+ if record_started_at is None:
2058
+ record_started_at = time.time()
2059
+
2060
+ def stop_recording() -> None:
2061
+ nonlocal recording_enabled, record_started_at
2062
+ recording_enabled = False
2063
+ record_started_at = None
2064
+
2065
+ if recording_enabled:
2066
+ start_recording()
2067
+
1341
2068
  def refresh_view() -> None:
1342
- nonlocal last_update
1343
- rank_to_proc, pid_errors = collect_rank_pids(state)
1344
- stacks, details_by_rank, stack_errors = collect_stacks(
2069
+ nonlocal last_update, state, record_session
2070
+ rank_to_proc, _pid_errors = collect_rank_pids(state)
2071
+ candidate = best_selector_from_procs(rank_to_proc.values())
2072
+ if candidate and selector_score(candidate) > selector_score(state.selector):
2073
+ state = dataclasses.replace(state, selector=candidate)
2074
+ snapshots, _stack_errors = collect_stacks(
1345
2075
  state, rank_to_proc, pythonpath, show_threads, install_attempted
1346
2076
  )
2077
+ if recording_enabled and record_session is not None:
2078
+ record_session.record_if_changed(state, rank_to_proc, snapshots)
1347
2079
  stacks_text: Dict[int, Text] = {}
1348
- stack_lines_by_rank = {rank: extract_stack_lines(lines) for rank, lines in stacks.items()}
2080
+ stack_lines_by_rank = {
2081
+ rank: extract_stack_lines(snapshot.stack_lines)
2082
+ for rank, snapshot in snapshots.items()
2083
+ }
1349
2084
  prefix_len = common_prefix_length(stack_lines_by_rank)
1350
2085
  diff_index = None
1351
2086
  if any(stack_lines_by_rank.values()):
1352
- if prefix_len > 0:
1353
- diff_index = prefix_len - 1
1354
- else:
1355
- diff_index = 0
1356
- for rank, lines in stacks.items():
2087
+ diff_index = max(0, prefix_len - 1) if prefix_len > 0 else 0
2088
+ for rank, snapshot in snapshots.items():
2089
+ lines = snapshot.stack_lines
1357
2090
  marked = mark_diff_line(lines, diff_index) if diff_index is not None else lines
1358
2091
  stacks_text[rank] = style_lines(marked)
1359
- errors = pid_errors + stack_errors
2092
+ details_by_rank = {
2093
+ rank: snapshot.details for rank, snapshot in snapshots.items()
2094
+ }
1360
2095
  last_update = time.strftime("%H:%M:%S")
1361
2096
  width, height = shutil.get_terminal_size((120, 40))
1362
2097
  content_width = max(0, width - 4)
1363
- header, header_lines = build_header(state, last_update, errors, refresh, content_width)
2098
+ record_line = None
2099
+ if record_session is not None and recording_enabled:
2100
+ record_line = f"{record_session.log_path} | events {record_session.event_count} | {format_elapsed(record_started_at)}"
2101
+ record_line = shorten(record_line, max(10, content_width - 12))
2102
+ header, header_lines = build_live_header(
2103
+ state, last_update, refresh, record_line, content_width
2104
+ )
1364
2105
  header_height = header_lines + 2
1365
2106
  header_height = max(3, min(header_height, max(3, height - 1)))
1366
2107
  layout["header"].size = header_height
@@ -1390,26 +2131,398 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
1390
2131
  refresh_view()
1391
2132
  next_refresh = now + refresh
1392
2133
 
1393
- if sys.stdin in select_with_timeout(0.1):
1394
- key = sys.stdin.read(1)
1395
- if key == "q":
1396
- return 0
1397
- if key == " ":
1398
- next_refresh = 0.0
1399
- if key == "t":
1400
- show_threads = not show_threads
1401
- next_refresh = 0.0
1402
- if key == "d":
1403
- show_details = not show_details
1404
- next_refresh = 0.0
2134
+ key = read_key(0.1)
2135
+ if key is None:
2136
+ continue
2137
+ if key == "q":
2138
+ return 0
2139
+ if key == " ":
2140
+ next_refresh = 0.0
2141
+ if key == "t":
2142
+ show_threads = not show_threads
2143
+ next_refresh = 0.0
2144
+ if key == "d":
2145
+ show_details = not show_details
2146
+ next_refresh = 0.0
2147
+ if key == "r":
2148
+ if recording_enabled:
2149
+ stop_recording()
2150
+ else:
2151
+ start_recording()
2152
+ next_refresh = 0.0
1405
2153
  except KeyboardInterrupt:
1406
2154
  return 0
1407
2155
  finally:
1408
2156
  termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
2157
+ if record_session is not None:
2158
+ record_session.close()
2159
+ if record_session.event_count > 0:
2160
+ print(f"Recording saved to: {record_session.log_path}")
1409
2161
 
1410
2162
  return 0
1411
2163
 
1412
2164
 
2165
def run_record_batch(args: argparse.Namespace) -> int:
    """Record stack snapshots without the TUI until the launcher exits.

    Prints a ``recording start`` line, then samples every ``refresh`` seconds
    (never less than one). Each sample is stored only when the session's
    ``record_if_changed`` reports a change. Unless ``args.quiet`` is set,
    heartbeat and divergence lines are printed, each rate-limited by its own
    module-level interval. The loop ends when ``is_pid_alive`` reports that
    ``state.prte_pid`` is gone or when the user interrupts. The session is
    always closed and a ``recording stop`` line is always printed.

    Args:
        args: Namespace providing ``pythonpath``, ``refresh``, ``out`` and
            ``quiet``; it is also passed to ``detect_state``.

    Returns:
        Always 0. Unexpected exceptions propagate after the stop line.
    """
    pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")
    state = detect_state(args)
    refresh = max(1, args.refresh)
    record_path = args.out or default_session_path()
    record_session = RecordSession(record_path, state, refresh, pythonpath)
    quiet = bool(args.quiet)
    install_attempted: set = set()
    start_time = time.time()
    last_change: Optional[float] = None
    last_heartbeat = start_time
    # Starts at 0.0 so the first divergent sample is not held back by the
    # interval check below.
    last_divergence_time = 0.0
    stop_reason = "completed"

    target = state.selector.display or "python"
    target = shorten(target, 120)
    print(
        f"recording start | path={record_session.log_path} | ranks={len(state.ranks)} | "
        f"refresh={refresh}s | target={target}"
    )

    try:
        while True:
            loop_start = time.time()
            if not is_pid_alive(state.prte_pid):
                stop_reason = "prterun-exited"
                break
            rank_to_proc, _pid_errors = collect_rank_pids(state)
            snapshots, _stack_errors = collect_stacks(
                state, rank_to_proc, pythonpath, False, install_attempted
            )
            if record_session.record_if_changed(state, rank_to_proc, snapshots):
                last_change = time.time()
            divergence, common_len, max_len = compute_divergence_from_snapshots(state.ranks, snapshots)
            now = time.time()
            if not quiet and now - last_heartbeat >= HEARTBEAT_INTERVAL:
                last_change_age = "never"
                if last_change is not None:
                    last_change_age = format_duration(int(now - last_change))
                elapsed_text = format_duration(int(now - start_time))
                print(
                    f"heartbeat | events={record_session.event_count} | "
                    f"last_change={last_change_age} | elapsed={elapsed_text}"
                )
                last_heartbeat = now
            if (
                not quiet
                and divergence >= DIVERGENCE_THRESHOLD
                and now - last_divergence_time >= DIVERGENCE_INTERVAL
            ):
                print(
                    f"divergence | ratio={divergence:.2f} | common={common_len} | max={max_len}"
                )
                last_divergence_time = now
            # Sleep only for what is left of the refresh period after sampling.
            sleep_for = refresh - (time.time() - loop_start)
            if sleep_for > 0:
                time.sleep(sleep_for)
    except KeyboardInterrupt:
        stop_reason = "interrupted"
    except Exception:
        # The loop has no normal exit besides the break above, so without this
        # branch a crash would be reported as "completed" in the stop line.
        stop_reason = "error"
        raise
    finally:
        record_session.close()
        total_elapsed = format_duration(int(time.time() - start_time))
        print(
            f"recording stop | reason={stop_reason} | events={record_session.event_count} | "
            f"elapsed={total_elapsed} | path={record_session.log_path}"
        )

    return 0
2234
+
2235
+
2236
def run_review(args: argparse.Namespace) -> int:
    """Replay a recorded session in an interactive full-screen viewer.

    Loads metadata and events from ``args.path``, rebuilds the rank list and a
    ``State`` from the metadata, then shows one recorded event at a time with
    a timeline in the header until the user quits.

    Keys: ``q`` quit, ``left``/``right`` move the selection in the current
    timeline level, ``down`` zoom into the selected bucket, ``up`` zoom back
    out, ``t`` toggle threads, ``d`` toggle the details column.

    Args:
        args: Namespace whose ``path`` points at the recorded session.

    Returns:
        Always 0.

    Raises:
        SystemExit: If the metadata yields no ranks or there are no events.
    """
    metadata = load_session_metadata(args.path)
    ranks = []
    for entry in metadata.get("ranks", []):
        # Entries lacking either field are skipped.
        if "rank" in entry and "host" in entry:
            ranks.append(RankInfo(rank=int(entry["rank"]), host=str(entry["host"])))
    if not ranks:
        raise SystemExit("no ranks found in metadata")

    raw_selector = metadata.get("selector")
    selector_fields = raw_selector if isinstance(raw_selector, dict) else {}
    selector = ProgramSelector(
        module=selector_fields.get("module"),
        script=selector_fields.get("script"),
        display=selector_fields.get("display", ""),
    )
    state = State(
        prte_pid=int(metadata.get("prte_pid", 0) or 0),
        rankfile=str(metadata.get("rankfile", "")),
        ranks=ranks,
        selector=selector,
    )

    events = load_session_events(args.path)
    if not events:
        raise SystemExit("no events recorded")

    console = Console()
    show_threads = False
    show_details = False
    # Stack of zoom levels; the last entry is the one being navigated.
    levels = [TimelineLevel(0, len(events), selected=0)]
    max_stack_lens, divergence_ratios, _ = compute_event_metrics(
        events, ranks, show_threads
    )

    def handle_sigint(_sig, _frame):
        raise KeyboardInterrupt

    signal.signal(signal.SIGINT, handle_sigint)

    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    tty.setcbreak(fd)

    layout = Layout()
    layout.split_column(Layout(name="header", size=HEADER_HEIGHT), Layout(name="body"))

    def refresh_view() -> None:
        """Rebuild the header and body for the currently selected event."""
        term_width, term_height = shutil.get_terminal_size((120, 40))
        usable_width = max(0, term_width - 4)
        timeline_lines = render_timeline_lines(
            levels, max_stack_lens, divergence_ratios, usable_width
        )
        current_level = levels[-1]
        if not current_level.buckets:
            return

        # Show the first event of the selected bucket, clamped to valid indices.
        event_index = current_level.buckets[current_level.selected][0]
        event_index = max(0, min(event_index, len(events) - 1))
        event = events[event_index]
        snapshots = event_snapshots_from_event(event, ranks, show_threads)
        rank_to_proc = rank_to_proc_from_event(event, ranks)

        stack_lines_by_rank = {
            rank: extract_stack_lines(snap.stack_lines)
            for rank, snap in snapshots.items()
        }
        prefix_len = common_prefix_length(stack_lines_by_rank)
        diff_index = None
        if any(stack_lines_by_rank.values()):
            diff_index = max(0, prefix_len - 1)

        stacks_text: Dict[int, Text] = {
            rank: style_lines(
                mark_diff_line(snap.stack_lines, diff_index)
                if diff_index is not None
                else snap.stack_lines
            )
            for rank, snap in snapshots.items()
        }
        details_by_rank = {rank: snap.details for rank, snap in snapshots.items()}

        header, header_lines = build_review_header(
            state,
            event_index,
            len(events),
            iso_timestamp(event.timestamp),
            timeline_lines,
            usable_width,
        )
        # Two extra rows on top of the header text, kept between 3 rows and
        # one less than the terminal height.
        header_height = max(3, min(header_lines + 2, max(3, term_height - 1)))
        layout["header"].size = header_height
        body_height = max(1, term_height - header_height)

        column_count = len(ranks) + (1 if show_details else 0)
        column_width = max(1, usable_width // max(1, column_count))
        inner_width = max(1, column_width - 4)
        details_text = None
        if show_details:
            details_text = build_details_text(
                ranks, rank_to_proc, details_by_rank, inner_width
            )

        layout["header"].update(
            Panel(header, padding=(0, 1), border_style=BORDER_STYLE)
        )
        layout["body"].update(
            render_columns(ranks, stacks_text, details_text, body_height, rank_to_proc)
        )

    try:
        refresh_view()
        with Live(layout, console=console, refresh_per_second=10, screen=True):
            while True:
                key = read_key(0.1)
                if key is None:
                    continue
                if key == "q":
                    return 0
                elif key == "t":
                    show_threads = not show_threads
                    # The metrics take show_threads, so recompute them here.
                    max_stack_lens, divergence_ratios, _ = compute_event_metrics(
                        events, ranks, show_threads
                    )
                    refresh_view()
                elif key == "d":
                    show_details = not show_details
                    refresh_view()
                elif key == "left":
                    level = levels[-1]
                    level.selected = max(0, level.selected - 1)
                    refresh_view()
                elif key == "right":
                    level = levels[-1]
                    last_bucket = max(0, len(level.buckets) - 1)
                    level.selected = min(last_bucket, level.selected + 1)
                    refresh_view()
                elif key == "down":
                    level = levels[-1]
                    if level.buckets:
                        bucket = level.buckets[level.selected]
                        # Zoom only when the bucket spans more than one event.
                        if bucket[1] - bucket[0] > 1:
                            levels.append(
                                TimelineLevel(bucket[0], bucket[1], selected=0)
                            )
                            refresh_view()
                elif key == "up":
                    if len(levels) > 1:
                        levels.pop()
                        refresh_view()
    except KeyboardInterrupt:
        return 0
    finally:
        # Put the terminal back into the mode it had before setcbreak().
        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

    return 0
2382
+
2383
+
2384
def run_summarize(args: argparse.Namespace) -> int:
    """Print an aggregate report for a recorded session.

    For every event, each rank is reduced to a signature: an ``error:`` label,
    ``"empty"``, or a SHA-1 of its extracted stack lines. The report lists the
    most frequent signature combinations across ranks, how often each rank's
    signature differed from its previous one, and the five events with the
    highest divergence ratio.

    Args:
        args: Namespace providing ``path``, ``format`` (``"json"`` selects
            JSON output, anything else the text report) and ``top``.

    Returns:
        Always 0.

    Raises:
        SystemExit: If the metadata yields no ranks or there are no events.
    """
    metadata = load_session_metadata(args.path)
    events = load_session_events(args.path)
    ranks = []
    for entry in metadata.get("ranks", []):
        # Entries lacking either field are skipped.
        if "rank" in entry and "host" in entry:
            ranks.append(RankInfo(rank=int(entry["rank"]), host=str(entry["host"])))
    if not ranks:
        raise SystemExit("no ranks found in metadata")
    if not events:
        raise SystemExit("no events recorded")

    def classify(payload) -> Tuple[str, str]:
        """Return ``(signature, top_frame)`` for one rank's event payload."""
        error = payload.get("error")
        if error:
            label = f"error:{error}"
            return label, label
        output = payload.get("py_spy")
        if not output:
            return "empty", "empty"
        rendered, _details = render_pyspy_output(str(output), show_threads=False)
        stack_lines = extract_stack_lines(rendered)
        digest = hashlib.sha1(
            "\n".join(stack_lines).encode("utf-8", errors="ignore")
        ).hexdigest()
        return digest, (stack_lines[0].strip() if stack_lines else "empty")

    rank_order = [info.rank for info in ranks]
    signature_counts: Dict[Tuple[str, ...], int] = {}
    signature_examples: Dict[Tuple[str, ...], Dict[int, str]] = {}
    rank_change_counts: Dict[int, int] = {rank: 0 for rank in rank_order}
    # Seeded with "", so a rank's first signature already counts as a change.
    previous_rank_signature: Dict[int, str] = {rank: "" for rank in rank_order}
    max_stack_lens, divergence_ratios, common_prefixes = compute_event_metrics(
        events, ranks, show_threads=False
    )

    for event in events:
        per_rank_signature: Dict[int, str] = {}
        per_rank_top_frame: Dict[int, str] = {}
        for info in ranks:
            signature, top_frame = classify(event.ranks.get(info.rank, {}))
            per_rank_signature[info.rank] = signature
            per_rank_top_frame[info.rank] = top_frame

        for rank, signature in per_rank_signature.items():
            if previous_rank_signature.get(rank) != signature:
                rank_change_counts[rank] += 1
                previous_rank_signature[rank] = signature

        signature_key = tuple(per_rank_signature[rank] for rank in rank_order)
        signature_counts[signature_key] = signature_counts.get(signature_key, 0) + 1
        # Keep the top frames of the first event that showed this combination.
        signature_examples.setdefault(signature_key, per_rank_top_frame)

    by_frequency = sorted(
        signature_counts.items(), key=lambda pair: pair[1], reverse=True
    )
    top_signatures = by_frequency[: max(1, args.top)]
    total_events = len(events)
    start_time = iso_timestamp(events[0].timestamp)
    end_time = iso_timestamp(events[-1].timestamp)

    if args.format == "json":
        per_event = [
            {
                "index": idx,
                "timestamp": iso_timestamp(event.timestamp),
                "divergence_ratio": divergence_ratios[idx],
                "common_prefix_len": common_prefixes[idx],
                "max_stack_len": max_stack_lens[idx],
            }
            for idx, event in enumerate(events)
        ]
        per_event.sort(key=lambda row: row["divergence_ratio"], reverse=True)
        payload = {
            "metadata": metadata,
            "event_count": total_events,
            "time_range": {"start": start_time, "end": end_time},
            "rank_change_counts": rank_change_counts,
            "top_signatures": [
                {
                    "count": count,
                    "ratio": count / float(total_events),
                    "example_top_frames": signature_examples.get(signature_key, {}),
                }
                for signature_key, count in top_signatures
            ],
            "most_divergent": per_event[:5],
        }
        print(json.dumps(payload, indent=2, sort_keys=True))
        return 0

    print(f"Session: {args.path}")
    print(f"Events: {total_events} ({start_time} -> {end_time})")
    print(f"Ranks: {', '.join(str(rank) for rank in rank_order)}")
    print()
    print("Top stack signatures:")
    for position, (signature_key, count) in enumerate(top_signatures, start=1):
        share = count / float(total_events)
        print(f"{position}. {count} events ({share:.1%})")
        example = signature_examples.get(signature_key, {})
        for rank in rank_order:
            frame = shorten(example.get(rank, ""), 120)
            print(f" rank {rank}: {frame}")
    print()
    print("Rank change counts:")
    for rank in rank_order:
        print(f" rank {rank}: {rank_change_counts.get(rank, 0)}")
    print()
    print("Most divergent events:")
    most_divergent = sorted(
        range(total_events),
        key=divergence_ratios.__getitem__,
        reverse=True,
    )[:5]
    for idx in most_divergent:
        print(
            f" #{idx + 1} @ {iso_timestamp(events[idx].timestamp)} | "
            f"ratio {divergence_ratios[idx]:.2f} | "
            f"common {common_prefixes[idx]} | "
            f"max {max_stack_lens[idx]}"
        )
    return 0
2511
+
2512
+
2513
def main(argv: Optional[Sequence[str]] = None) -> int:
    """Dispatch to a subcommand, or to the live view by default.

    A first argument of ``review``, ``record`` or ``summarize`` selects that
    subcommand and is removed before the remaining arguments are parsed. Any
    other command line is parsed in full for the live view.

    Args:
        argv: Command-line arguments; ``sys.argv[1:]`` when ``None``.

    Returns:
        The exit status of the selected command.
    """
    cli_args = sys.argv[1:] if argv is None else list(argv)
    command = cli_args[0] if cli_args else None
    remaining = cli_args[1:]

    if command == "review":
        return run_review(parse_review_args(remaining))
    if command == "record":
        return run_record_batch(parse_record_args(remaining))
    if command == "summarize":
        return run_summarize(parse_summarize_args(remaining))
    return run_live(parse_live_args(cli_args))
2524
+
2525
+
1413
2526
  def select_with_timeout(timeout: float):
1414
2527
  import select
1415
2528
 
@@ -1,7 +0,0 @@
1
- mpiptop.py,sha256=nz8YMs0j54vfxDr0bfjwON76QWZ_QAtR2rp0VkuhrFM,45815
2
- mpiptop-0.1.0.dist-info/licenses/LICENSE,sha256=ChKmQ8qCXxdXRR_HIJECjIA5NLWlUTEJWh7Xkhm2wAA,1069
3
- mpiptop-0.1.0.dist-info/METADATA,sha256=VauVpnkAiQokz6X5OXncDhygTbeJPrU_9rOTFya_oMQ,1477
4
- mpiptop-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
5
- mpiptop-0.1.0.dist-info/entry_points.txt,sha256=RsGsr8GBLfUNpb432YWS5gz4MWfWdK9xJRr1SmdnLo8,41
6
- mpiptop-0.1.0.dist-info/top_level.txt,sha256=c2Vdu6tTg0DEPUWD8Odyods7fXsPWMQ2kSvjdKiTClc,8
7
- mpiptop-0.1.0.dist-info/RECORD,,