mpiptop-0.1.0-py3-none-any.whl → mpiptop-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/METADATA +13 -2
- mpiptop-0.2.0.dist-info/RECORD +7 -0
- {mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/WHEEL +1 -1
- mpiptop.py +1154 -41
- mpiptop-0.1.0.dist-info/RECORD +0 -7
- {mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/entry_points.txt +0 -0
- {mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/top_level.txt +0 -0
{mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mpiptop
-Version: 0.1.0
+Version: 0.2.0
 Summary: TUI for viewing MPI Python stacks across hosts
 Author: yieldthought
 License-Expression: MIT

@@ -48,6 +48,17 @@ mpiptop --rankfile /etc/mpirun/rankfile_01_02
 mpiptop --prterun-pid 12345
 mpiptop --refresh 5
 mpiptop --pythonpath /path/to/your/code
+mpiptop record --out ./mpiptop-session-20260123-120000.jsonl
 ```

-
+Record/review (record is batch mode; use plain `mpiptop` for the TUI):
+```bash
+mpiptop record
+mpiptop record --quiet
+mpiptop review ./mpiptop-session-20260123-120000.jsonl
+mpiptop summarize ./mpiptop-session-20260123-120000.jsonl --format text
+```
+
+Live controls: `q` quit | `space` refresh | `t` threads | `d` details | `r` record
+
+Review controls: `q` quit | `left/right` move | `down` zoom | `up` zoom out | `t` threads | `d` details
mpiptop-0.2.0.dist-info/RECORD
ADDED

@@ -0,0 +1,7 @@
+mpiptop.py,sha256=D4h-jOyhYU4M0FzQ6JMX5gDBE8UVd4detm5JDFdTm4c,86492
+mpiptop-0.2.0.dist-info/licenses/LICENSE,sha256=ChKmQ8qCXxdXRR_HIJECjIA5NLWlUTEJWh7Xkhm2wAA,1069
+mpiptop-0.2.0.dist-info/METADATA,sha256=3vT5lrkqfuh6O2DL6xCm412w7opwQAAzJFdD6IGHs7g,1910
+mpiptop-0.2.0.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+mpiptop-0.2.0.dist-info/entry_points.txt,sha256=RsGsr8GBLfUNpb432YWS5gz4MWfWdK9xJRr1SmdnLo8,41
+mpiptop-0.2.0.dist-info/top_level.txt,sha256=c2Vdu6tTg0DEPUWD8Odyods7fXsPWMQ2kSvjdKiTClc,8
+mpiptop-0.2.0.dist-info/RECORD,,
mpiptop.py
CHANGED

@@ -6,6 +6,7 @@ from __future__ import annotations
 import argparse
 import colorsys
 import dataclasses
+import datetime
 import hashlib
 import json
 import os
@@ -79,10 +80,40 @@ class ParsedPySpy:
     threads: List[ThreadBlock]


+@dataclasses.dataclass(frozen=True)
+class RankSnapshot:
+    output: Optional[str]
+    error: Optional[str]
+    stack_lines: List[str]
+    details: List[str]
+
+
+@dataclasses.dataclass
+class SessionEvent:
+    timestamp: float
+    ranks: Dict[int, Dict[str, object]]
+
+
+@dataclasses.dataclass
+class TimelineLevel:
+    start: int
+    end: int
+    selected: int = 0
+    buckets: List[Tuple[int, int]] = dataclasses.field(default_factory=list)
+
+
 PUNCT_STYLE = "grey62"
 BORDER_STYLE = "grey62"
 KEY_STYLE = "#7ad7ff"
 HEADER_HEIGHT = 3
+SESSION_VERSION = 1
+SESSION_LOG_FILE = "session.jsonl"
+SESSION_METADATA_FILE = "metadata.json"
+SESSION_EVENTS_FILE = "events.jsonl"
+SPARKLINE_CHARS = "▁▂▃▄▅▆▇█"
+HEARTBEAT_INTERVAL = 60
+DIVERGENCE_THRESHOLD = 0.5
+DIVERGENCE_INTERVAL = 60
 ENV_KEYS = (
     "PATH",
     "LD_LIBRARY_PATH",
@@ -203,6 +234,264 @@ print(json.dumps(results))
 """


+def iso_timestamp(value: Optional[float] = None) -> str:
+    ts = time.time() if value is None else value
+    return datetime.datetime.fromtimestamp(ts).isoformat(timespec="seconds")
+
+
+def default_session_path() -> str:
+    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    return os.path.abspath(f"mpiptop-session-{stamp}.jsonl")
+
+
+def normalize_session_path(path: str) -> Tuple[str, str]:
+    if path.endswith(".jsonl") or (os.path.exists(path) and os.path.isfile(path)):
+        base_dir = os.path.dirname(path) or "."
+        return base_dir, path
+    return path, os.path.join(path, SESSION_LOG_FILE)
+
+
+def ensure_session_path(path: str) -> Tuple[str, str]:
+    base_dir, log_path = normalize_session_path(path)
+    if os.path.exists(path):
+        if os.path.isdir(path):
+            if os.listdir(path):
+                if os.path.exists(log_path) or os.path.exists(os.path.join(path, SESSION_METADATA_FILE)):
+                    return base_dir, log_path
+                raise SystemExit(f"record path exists and is not empty: {path}")
+        elif os.path.isfile(path):
+            return base_dir, log_path
+        else:
+            raise SystemExit(f"record path exists and is not a file or directory: {path}")
+    else:
+        if log_path.endswith(".jsonl"):
+            os.makedirs(base_dir, exist_ok=True)
+        else:
+            os.makedirs(base_dir, exist_ok=True)
+    return base_dir, log_path
+
+
+def write_session_metadata(log_path: str, state: State, refresh: int, pythonpath: str) -> None:
+    payload = {
+        "version": SESSION_VERSION,
+        "created_at": iso_timestamp(),
+        "refresh": refresh,
+        "rankfile": state.rankfile,
+        "prte_pid": state.prte_pid,
+        "selector": dataclasses.asdict(state.selector),
+        "ranks": [dataclasses.asdict(rank) for rank in state.ranks],
+        "pythonpath": pythonpath,
+        "record_on_change": True,
+    }
+    if os.path.exists(log_path) and os.path.getsize(log_path) > 0:
+        return
+    with open(log_path, "a", encoding="utf-8") as handle:
+        handle.write(json.dumps({"type": "metadata", "data": payload}) + "\n")
+
+
+def load_session_metadata(path: str) -> Dict[str, object]:
+    base_dir, log_path = normalize_session_path(path)
+    metadata_path = os.path.join(base_dir, SESSION_METADATA_FILE)
+    if os.path.exists(metadata_path):
+        with open(metadata_path, "r", encoding="utf-8") as handle:
+            return json.load(handle)
+    if not os.path.exists(log_path):
+        raise SystemExit(f"metadata not found in {path}")
+    with open(log_path, "r", encoding="utf-8") as handle:
+        for line in handle:
+            raw = line.strip()
+            if not raw:
+                continue
+            data = json.loads(raw)
+            if isinstance(data, dict) and data.get("type") == "metadata":
+                payload = data.get("data")
+                if isinstance(payload, dict):
+                    return payload
+            if isinstance(data, dict) and "version" in data and "ranks" in data:
+                return data
+    raise SystemExit(f"metadata not found in {log_path}")
+
+
+def read_last_event(path: str) -> Optional[Dict[str, object]]:
+    if not os.path.exists(path):
+        return None
+    with open(path, "rb") as handle:
+        handle.seek(0, os.SEEK_END)
+        pos = handle.tell()
+        if pos == 0:
+            return None
+        chunk = b""
+        while pos > 0:
+            step = min(4096, pos)
+            pos -= step
+            handle.seek(pos)
+            chunk = handle.read(step) + chunk
+            if b"\n" in chunk:
+                break
+    lines = [line for line in chunk.splitlines() if line.strip()]
+    while lines:
+        raw = lines.pop().decode("utf-8", errors="ignore")
+        try:
+            data = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(data, dict) and data.get("type") == "metadata":
+            continue
+        if isinstance(data, dict) and data.get("type") == "event":
+            payload = data.get("data")
+            if isinstance(payload, dict):
+                return payload
+        return data
+    return None
+
+
+def load_session_events(path: str) -> List[SessionEvent]:
+    base_dir, log_path = normalize_session_path(path)
+    events_path = os.path.join(base_dir, SESSION_EVENTS_FILE)
+    if not os.path.exists(events_path) and not os.path.exists(log_path):
+        raise SystemExit(f"events not found in {path}")
+    path_to_read = events_path if os.path.exists(events_path) else log_path
+    events: List[SessionEvent] = []
+    with open(path_to_read, "r", encoding="utf-8") as handle:
+        for line in handle:
+            raw = line.strip()
+            if not raw:
+                continue
+            data = json.loads(raw)
+            if isinstance(data, dict) and data.get("type") == "metadata":
+                continue
+            if isinstance(data, dict) and data.get("type") == "event":
+                data = data.get("data", {})
+            if not isinstance(data, dict):
+                continue
+            timestamp = float(data.get("t", 0.0))
+            ranks_raw = data.get("ranks", {})
+            ranks: Dict[int, Dict[str, object]] = {}
+            for key, value in ranks_raw.items():
+                try:
+                    rank_id = int(key)
+                except (TypeError, ValueError):
+                    continue
+                ranks[rank_id] = value
+            events.append(SessionEvent(timestamp=timestamp, ranks=ranks))
+    return events
+
+
+def signature_from_snapshot(snapshot: Optional[RankSnapshot]) -> str:
+    if snapshot is None:
+        return "missing"
+    if snapshot.error:
+        return f"error:{snapshot.error}"
+    if snapshot.output is None:
+        return "missing"
+    digest = hashlib.sha1(snapshot.output.encode("utf-8", errors="ignore")).hexdigest()
+    return digest
+
+
+def snapshot_signature(ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]) -> Dict[int, str]:
+    signature: Dict[int, str] = {}
+    for info in ranks:
+        signature[info.rank] = signature_from_snapshot(snapshots.get(info.rank))
+    return signature
+
+
+def signature_from_event(event: Dict[str, object]) -> Optional[Dict[int, str]]:
+    ranks = event.get("ranks", {})
+    if not isinstance(ranks, dict):
+        return None
+    signature: Dict[int, str] = {}
+    for key, payload in ranks.items():
+        try:
+            rank_id = int(key)
+        except (TypeError, ValueError):
+            continue
+        if not isinstance(payload, dict):
+            signature[rank_id] = "missing"
+            continue
+        if payload.get("error"):
+            signature[rank_id] = f"error:{payload.get('error')}"
+        elif payload.get("py_spy"):
+            digest = hashlib.sha1(
+                str(payload.get("py_spy")).encode("utf-8", errors="ignore")
+            ).hexdigest()
+            signature[rank_id] = digest
+        else:
+            signature[rank_id] = "missing"
+    return signature
+
+
+class RecordSession:
+    def __init__(self, path: str, state: State, refresh: int, pythonpath: str):
+        self.base_dir, self.log_path = ensure_session_path(path)
+        write_session_metadata(self.log_path, state, refresh, pythonpath)
+        self.handle = open(self.log_path, "a", encoding="utf-8")
+        self.event_count = 0
+        self.last_signature: Optional[Dict[int, str]] = None
+        last_event = read_last_event(self.log_path)
+        if last_event:
+            self.last_signature = signature_from_event(last_event)
+            self.event_count = self._count_events()
+
+    def _count_events(self) -> int:
+        if not os.path.exists(self.log_path):
+            return 0
+        count = 0
+        with open(self.log_path, "r", encoding="utf-8") as handle:
+            for line in handle:
+                raw = line.strip()
+                if not raw:
+                    continue
+                try:
+                    data = json.loads(raw)
+                except json.JSONDecodeError:
+                    continue
+                if isinstance(data, dict) and data.get("type") == "metadata":
+                    continue
+                count += 1
+        return count
+
+    def record_if_changed(
+        self,
+        state: State,
+        rank_to_proc: Dict[int, RankProcess],
+        snapshots: Dict[int, RankSnapshot],
+    ) -> bool:
+        signature = snapshot_signature(state.ranks, snapshots)
+        if self.last_signature is not None and signature == self.last_signature:
+            return False
+        payload: Dict[str, object] = {"t": time.time(), "ranks": {}}
+        ranks_payload: Dict[str, object] = {}
+        for info in state.ranks:
+            rank = info.rank
+            proc = rank_to_proc.get(rank)
+            snapshot = snapshots.get(rank)
+            entry: Dict[str, object] = {"host": info.host}
+            if proc is not None:
+                entry["pid"] = proc.pid
+                entry["cmdline"] = proc.cmdline
+                entry["rss_kb"] = proc.rss_kb
+            if snapshot is None:
+                entry["error"] = "No data"
+            elif snapshot.error:
+                entry["error"] = snapshot.error
+            elif snapshot.output is not None:
+                entry["py_spy"] = snapshot.output
+            else:
+                entry["error"] = "No data"
+            ranks_payload[str(rank)] = entry
+        payload["ranks"] = ranks_payload
+        self.handle.write(json.dumps({"type": "event", "data": payload}) + "\n")
+        self.handle.flush()
+        self.last_signature = signature
+        self.event_count += 1
+        return True
+
+    def close(self) -> None:
+        try:
+            self.handle.close()
+        except Exception:
+            pass
+
 def read_ps() -> List[Proc]:
     result = subprocess.run(
         ["ps", "-eo", "pid=,ppid=,args="],
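The functions in the hunk above define the on-disk session layout: one `{"type": "metadata", "data": {...}}` line followed by `{"type": "event", "data": {"t": ..., "ranks": {...}}}` lines in a single JSONL file, with each rank entry carrying `host`, optional `pid`/`cmdline`/`rss_kb`, and either `py_spy` output or an `error` string. A minimal standalone reader for that layout, sketched from this diff (the file name in the usage lines is illustrative, not part of the package):

```python
import json
from typing import Dict, List, Tuple

def read_session(path: str) -> Tuple[Dict[str, object], List[Dict[str, object]]]:
    """Split a recorded session .jsonl into its metadata payload and event payloads."""
    metadata: Dict[str, object] = {}
    events: List[Dict[str, object]] = []
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            raw = line.strip()
            if not raw:
                continue
            record = json.loads(raw)
            if record.get("type") == "metadata":
                metadata = record.get("data", {})
            elif record.get("type") == "event":
                events.append(record.get("data", {}))
    return metadata, events

if __name__ == "__main__":
    # Illustrative path; `mpiptop record` prints the real one at startup.
    meta, events = read_session("mpiptop-session-20260123-120000.jsonl")
    print(f"{len(meta.get('ranks', []))} ranks, {len(events)} events")
    if events:
        print(f"span: {events[-1]['t'] - events[0]['t']:.0f}s")
```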
@@ -349,6 +638,28 @@ def parse_python_selector(args: str) -> ProgramSelector:
     return ProgramSelector(module=module, script=script, display=display)


+def selector_score(selector: ProgramSelector) -> Tuple[int, int, int, int]:
+    if not selector.display:
+        return (0, 0, 0, 0)
+    has_script = 1 if selector.script else 0
+    has_module = 1 if selector.module else 0
+    display = f" {selector.display} "
+    has_python_target = 1 if ".py" in selector.display or " -m " in display else 0
+    return (has_script, has_module, has_python_target, len(selector.display))
+
+
+def best_selector_from_procs(procs: Iterable[RankProcess]) -> Optional[ProgramSelector]:
+    best: Optional[ProgramSelector] = None
+    best_score = selector_score(best or ProgramSelector(module=None, script=None, display=""))
+    for proc in procs:
+        candidate = parse_python_selector(proc.cmdline)
+        score = selector_score(candidate)
+        if score > best_score:
+            best = candidate
+            best_score = score
+    return best
+
+
 def extract_python_exe(cmdline: str) -> Optional[str]:
     if not cmdline:
         return None
@@ -1110,11 +1421,11 @@ def build_header(
     state: State, last_update: str, errors: List[str], refresh: int, width: int
 ) -> Tuple[Text, int]:
     program_lines = wrap_program_lines(state.selector, width)
-    if program_lines:
-
-
-
-
+    if not program_lines:
+        program_lines = [Text("python")]
+    for line in program_lines:
+        line.no_wrap = True
+        line.overflow = "crop"

     controls_plain = "q quit | space refresh | t threads | d details"
     padding = max(0, width - len(controls_plain))
@@ -1136,6 +1447,8 @@ def build_header(
         text.append_text(line)
     text.append("\n")
     text.append_text(line2)
+    text.no_wrap = True
+    text.overflow = "crop"
     return text, len(program_lines) + 1

@@ -1227,6 +1540,338 @@ def build_details_text(
     return output


+def format_elapsed(start: Optional[float]) -> str:
+    if start is None:
+        return "0:00"
+    elapsed = max(0, int(time.time() - start))
+    return format_duration(elapsed)
+
+
+def format_duration(elapsed: int) -> str:
+    hours = elapsed // 3600
+    minutes = (elapsed % 3600) // 60
+    seconds = elapsed % 60
+    if hours:
+        return f"{hours}:{minutes:02d}:{seconds:02d}"
+    return f"{minutes}:{seconds:02d}"
+
+
+def build_live_header(
+    state: State,
+    last_update: str,
+    refresh: int,
+    record_line: Optional[str],
+    width: int,
+) -> Tuple[Text, int]:
+    program_lines = wrap_program_lines(state.selector, width)
+    if not program_lines:
+        program_lines = [Text("python")]
+    for line in program_lines:
+        line.no_wrap = True
+        line.overflow = "crop"
+
+    record_text = None
+    if record_line:
+        record_text = Text()
+        record_text.append("REC", style="bold red")
+        record_text.append(" recording: ")
+        record_text.append(record_line)
+        record_text.truncate(width)
+        record_text.no_wrap = True
+        record_text.overflow = "crop"
+
+    controls_plain = "q quit | space refresh | t threads | d details | r record"
+    padding = max(0, width - len(controls_plain))
+    controls_line = Text(" " * padding + controls_plain)
+    for token in ["q", "space", "t", "d", "r"]:
+        start = controls_plain.find(token)
+        if start != -1:
+            controls_line.stylize(KEY_STYLE, padding + start, padding + start + len(token))
+    controls_line.truncate(width)
+    controls_line.no_wrap = True
+    controls_line.overflow = "crop"
+
+    text = Text()
+    for idx, line in enumerate(program_lines):
+        if idx:
+            text.append("\n")
+        text.append_text(line)
+    text.append("\n")
+    if record_text is not None:
+        text.append_text(record_text)
+        text.append("\n")
+    text.append_text(controls_line)
+    text.no_wrap = True
+    text.overflow = "crop"
+    extra_lines = 2 if record_text is not None else 1
+    return text, len(program_lines) + extra_lines
+
+
+def build_review_header(
+    state: State,
+    event_index: int,
+    event_total: int,
+    event_time: str,
+    timeline_lines: List[Text],
+    width: int,
+) -> Tuple[Text, int]:
+    program_lines = wrap_program_lines(state.selector, width)
+    if not program_lines:
+        program_lines = [Text("python")]
+    status_line = Text(
+        f"review {event_index + 1}/{event_total} | {event_time}"
+    )
+    status_line.truncate(width)
+
+    controls_plain = "q quit | left/right move | down zoom | up zoom out | t threads | d details"
+    padding = max(0, width - len(controls_plain))
+    controls_line = Text(" " * padding + controls_plain)
+    for token in ["q", "left/right", "down", "up", "t", "d"]:
+        start = controls_plain.find(token)
+        if start != -1:
+            controls_line.stylize(KEY_STYLE, padding + start, padding + start + len(token))
+    controls_line.truncate(width)
+    controls_line.no_wrap = True
+    controls_line.overflow = "crop"
+
+    text = Text()
+    for idx, line in enumerate(program_lines):
+        if idx:
+            text.append("\n")
+        text.append_text(line)
+    text.append("\n")
+    text.append_text(status_line)
+    for line in timeline_lines:
+        text.append("\n")
+        text.append_text(line)
+    text.append("\n")
+    text.append_text(controls_line)
+    text.no_wrap = True
+    text.overflow = "crop"
+    return text, len(program_lines) + 1 + len(timeline_lines) + 1
+
+
+def build_buckets(start: int, end: int, width: int) -> List[Tuple[int, int]]:
+    count = max(0, end - start)
+    if count == 0:
+        return []
+    bucket_count = max(1, min(width, count))
+    base = count // bucket_count
+    remainder = count % bucket_count
+    buckets: List[Tuple[int, int]] = []
+    current = start
+    for idx in range(bucket_count):
+        size = base + (1 if idx < remainder else 0)
+        buckets.append((current, current + size))
+        current += size
+    return buckets
+
+
+def divergence_color(ratio: float) -> str:
+    clamped = min(1.0, max(0.0, ratio))
+    intensity = clamped ** 0.7
+    base = (170, 170, 170)
+    hot = (255, 122, 0)
+    r = int(base[0] + (hot[0] - base[0]) * intensity)
+    g = int(base[1] + (hot[1] - base[1]) * intensity)
+    b = int(base[2] + (hot[2] - base[2]) * intensity)
+    return f"#{r:02x}{g:02x}{b:02x}"
+
+
+def compute_event_metrics(
+    events: List[SessionEvent],
+    ranks: List[RankInfo],
+    show_threads: bool,
+) -> Tuple[List[int], List[float], List[int]]:
+    max_stack_lens: List[int] = []
+    divergence_ratios: List[float] = []
+    common_prefixes: List[int] = []
+    for event in events:
+        stacks_by_rank: Dict[int, List[str]] = {}
+        for info in ranks:
+            payload = event.ranks.get(info.rank, {})
+            if payload.get("error"):
+                stacks_by_rank[info.rank] = []
+                continue
+            output = payload.get("py_spy")
+            if not output:
+                stacks_by_rank[info.rank] = []
+                continue
+            lines, _details = render_pyspy_output(str(output), show_threads)
+            stacks_by_rank[info.rank] = extract_stack_lines(lines)
+        max_len = max((len(stack) for stack in stacks_by_rank.values()), default=0)
+        common_len = common_prefix_length(stacks_by_rank)
+        similarity = float(common_len) / float(max_len) if max_len else 0.0
+        ratio = 1.0 - similarity if max_len else 0.0
+        max_stack_lens.append(max_len)
+        divergence_ratios.append(ratio)
+        common_prefixes.append(common_len)
+    return max_stack_lens, divergence_ratios, common_prefixes
+
+
+def render_timeline_lines(
+    levels: List[TimelineLevel],
+    max_stack_lens: List[int],
+    divergence_ratios: List[float],
+    width: int,
+) -> List[Text]:
+    lines: List[Text] = []
+    for level_index, level in enumerate(levels):
+        level.buckets = build_buckets(level.start, level.end, width)
+        if level.buckets:
+            level.selected = max(0, min(level.selected, len(level.buckets) - 1))
+        stats: List[Tuple[int, float]] = []
+        for start, end in level.buckets:
+            bucket_heights = max_stack_lens[start:end]
+            bucket_ratios = divergence_ratios[start:end]
+            height = max(bucket_heights) if bucket_heights else 0
+            ratio = max(bucket_ratios) if bucket_ratios else 0.0
+            stats.append((height, ratio))
+        max_height = max((height for height, _ in stats), default=1)
+        if max_height <= 0:
+            max_height = 1
+        text = Text()
+        for idx, (height, ratio) in enumerate(stats):
+            normalized = float(height) / float(max_height) if max_height else 0.0
+            level_idx = int(round(normalized * (len(SPARKLINE_CHARS) - 1)))
+            level_idx = max(0, min(level_idx, len(SPARKLINE_CHARS) - 1))
+            char = SPARKLINE_CHARS[level_idx]
+            style = divergence_color(ratio)
+            if idx == level.selected:
+                if level_index == len(levels) - 1:
+                    style = f"{style} bold underline"
+                else:
+                    style = f"{style} underline"
+            text.append(char, style=style)
+        text.no_wrap = True
+        text.overflow = "crop"
+        lines.append(text)
+    return lines
+
+
+def event_snapshots_from_event(
+    event: SessionEvent,
+    ranks: List[RankInfo],
+    show_threads: bool,
+) -> Dict[int, RankSnapshot]:
+    snapshots: Dict[int, RankSnapshot] = {}
+    for info in ranks:
+        payload = event.ranks.get(info.rank)
+        if not payload:
+            snapshots[info.rank] = RankSnapshot(
+                output=None,
+                error="No data",
+                stack_lines=["No data"],
+                details=[],
+            )
+            continue
+        if payload.get("error"):
+            snapshots[info.rank] = RankSnapshot(
+                output=None,
+                error=str(payload.get("error")),
+                stack_lines=[str(payload.get("error"))],
+                details=[],
+            )
+            continue
+        output = payload.get("py_spy")
+        if not output:
+            snapshots[info.rank] = RankSnapshot(
+                output=None,
+                error="No data",
+                stack_lines=["No data"],
+                details=[],
+            )
+            continue
+        lines, details = render_pyspy_output(str(output), show_threads)
+        snapshots[info.rank] = RankSnapshot(
+            output=str(output),
+            error=None,
+            stack_lines=lines,
+            details=details,
+        )
+    return snapshots
+
+
+def rank_to_proc_from_event(
+    event: SessionEvent,
+    ranks: List[RankInfo],
+) -> Dict[int, RankProcess]:
+    rank_to_proc: Dict[int, RankProcess] = {}
+    for info in ranks:
+        payload = event.ranks.get(info.rank)
+        if not payload:
+            continue
+        pid = payload.get("pid")
+        cmdline = payload.get("cmdline")
+        rss_kb = payload.get("rss_kb")
+        if pid is None or cmdline is None:
+            continue
+        try:
+            pid_value = int(pid)
+        except (TypeError, ValueError):
+            continue
+        rss_value = None
+        if rss_kb is not None:
+            try:
+                rss_value = int(rss_kb)
+            except (TypeError, ValueError):
+                rss_value = None
+        rank_to_proc[info.rank] = RankProcess(
+            pid=pid_value,
+            cmdline=str(cmdline),
+            rss_kb=rss_value,
+            python_exe=None,
+            env={},
+        )
+    return rank_to_proc
+
+
+def compute_divergence_from_snapshots(
+    ranks: List[RankInfo], snapshots: Dict[int, RankSnapshot]
+) -> Tuple[float, int, int]:
+    stack_lines_by_rank = {
+        info.rank: extract_stack_lines(snapshots.get(info.rank, RankSnapshot(None, "No data", [], [])).stack_lines)
+        for info in ranks
+    }
+    max_len = max((len(stack) for stack in stack_lines_by_rank.values()), default=0)
+    common_len = common_prefix_length(stack_lines_by_rank)
+    similarity = float(common_len) / float(max_len) if max_len else 0.0
+    divergence = 1.0 - similarity if max_len else 0.0
+    return divergence, common_len, max_len
+
+
+def read_key(timeout: float) -> Optional[str]:
+    if sys.stdin not in select_with_timeout(timeout):
+        return None
+    key = sys.stdin.read(1)
+    if key != "\x1b":
+        return key
+    seq = key
+    for _ in range(2):
+        if sys.stdin in select_with_timeout(0.01):
+            seq += sys.stdin.read(1)
+    if seq == "\x1b[A":
+        return "up"
+    if seq == "\x1b[B":
+        return "down"
+    if seq == "\x1b[C":
+        return "right"
+    if seq == "\x1b[D":
+        return "left"
+    return None
+
+
+def is_pid_alive(pid: int) -> bool:
+    if pid <= 0:
+        return False
+    try:
+        os.kill(pid, 0)
+    except ProcessLookupError:
+        return False
+    except PermissionError:
+        return True
+    return True
+
 def detect_state(args: argparse.Namespace) -> State:
     procs = read_ps()
     prte = find_prterun(procs, args.prterun_pid)
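Both `compute_event_metrics` and `compute_divergence_from_snapshots` in the hunk above score an event with the same ratio: `1 - common_prefix_len / max_stack_len` over the per-rank stack lines. A self-contained sketch of that metric follows; the `common_prefix` helper only approximates the package's `common_prefix_length`, which is not part of this diff, and the sample frames are invented for illustration:

```python
from typing import Dict, List

def common_prefix(stacks: Dict[int, List[str]]) -> int:
    # Count leading frames shared by every non-empty stack (an assumption
    # about how common_prefix_length behaves; that helper is not shown here).
    populated = [stack for stack in stacks.values() if stack]
    if not populated:
        return 0
    shared = 0
    for frames in zip(*populated):
        if len(set(frames)) != 1:
            break
        shared += 1
    return shared

def divergence(stacks: Dict[int, List[str]]) -> float:
    max_len = max((len(stack) for stack in stacks.values()), default=0)
    if not max_len:
        return 0.0
    return 1.0 - common_prefix(stacks) / max_len

# Two ranks agree on 2 of 4 frames -> ratio 0.5, which meets the
# DIVERGENCE_THRESHOLD used for the batch recorder's console warnings.
stacks = {
    0: ["main", "train_step", "allreduce", "wait"],
    1: ["main", "train_step", "save_checkpoint", "write"],
}
assert divergence(stacks) == 0.5
```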
@@ -1278,29 +1923,40 @@ def collect_stacks(
     pythonpath: str,
     show_threads: bool,
     install_attempted: set,
-) -> Tuple[Dict[int,
-
-    details_by_rank: Dict[int, List[str]] = {}
+) -> Tuple[Dict[int, RankSnapshot], List[str]]:
+    snapshots: Dict[int, RankSnapshot] = {}
     errors: List[str] = []
     for entry in state.ranks:
         proc = rank_to_proc.get(entry.rank)
         if proc is None:
-
-
+            snapshots[entry.rank] = RankSnapshot(
+                output=None,
+                error="No process",
+                stack_lines=["No process"],
+                details=[],
+            )
             continue
         output, error = run_py_spy(entry.host, proc, pythonpath, install_attempted)
         if error:
             errors.append(error)
-
-
+            snapshots[entry.rank] = RankSnapshot(
+                output=None,
+                error=error,
+                stack_lines=[error],
+                details=[],
+            )
             continue
         lines, details = render_pyspy_output(output or "", show_threads)
-
-
-
+        snapshots[entry.rank] = RankSnapshot(
+            output=output,
+            error=None,
+            stack_lines=lines,
+            details=details,
+        )
+    return snapshots, errors


-def
+def parse_live_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Show MPI Python stacks across hosts.")
     parser.add_argument("--rankfile", help="Override rankfile path")
     parser.add_argument("--prterun-pid", type=int, help="PID of prterun/mpirun")
@@ -1309,11 +1965,61 @@ def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
         "--pythonpath",
         help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
     )
+    parser.add_argument(
+        "--out",
+        help="Output path for recordings (.jsonl file or directory)",
+    )
     return parser.parse_args(argv)


-def
-
+def parse_review_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Review a recorded mpiptop session.")
+    parser.add_argument("path", help="Path to a recorded session (.jsonl file or directory)")
+    return parser.parse_args(argv)
+
+
+def parse_summarize_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Summarize a recorded mpiptop session.")
+    parser.add_argument("path", help="Path to a recorded session (.jsonl file or directory)")
+    parser.add_argument(
+        "--format",
+        choices=["text", "json"],
+        default="text",
+        help="Output format",
+    )
+    parser.add_argument(
+        "--top",
+        type=int,
+        default=5,
+        help="Top signatures to report",
+    )
+    return parser.parse_args(argv)
+
+
+def parse_record_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Record an mpiptop session.")
+    parser.add_argument("--rankfile", help="Override rankfile path")
+    parser.add_argument("--prterun-pid", type=int, help="PID of prterun/mpirun")
+    parser.add_argument("--refresh", type=int, default=10, help="Refresh interval (seconds)")
+    parser.add_argument(
+        "--pythonpath",
+        help="PYTHONPATH to export remotely (defaults to local PYTHONPATH)",
+    )
+    parser.add_argument(
+        "--out",
+        help="Output path for recordings (.jsonl file or directory)",
+    )
+    parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="Only print start/stop lines",
+    )
+    args = parser.parse_args(argv)
+    args.record = True
+    return args
+
+
+def run_live(args: argparse.Namespace) -> int:
     pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")

     state = detect_state(args)
@@ -1322,6 +2028,10 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     show_threads = False
     show_details = False
     install_attempted: set = set()
+    record_session: Optional[RecordSession] = None
+    recording_enabled = bool(getattr(args, "record", False))
+    record_started_at: Optional[float] = None
+    record_path = args.out

     def handle_sigint(_sig, _frame):
         raise KeyboardInterrupt
@@ -1338,29 +2048,60 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
     last_update = "never"
     next_refresh = 0.0

+    def start_recording() -> None:
+        nonlocal record_session, recording_enabled, record_started_at, record_path
+        if record_session is None:
+            record_path = record_path or default_session_path()
+            record_session = RecordSession(record_path, state, refresh, pythonpath)
+        recording_enabled = True
+        if record_started_at is None:
+            record_started_at = time.time()
+
+    def stop_recording() -> None:
+        nonlocal recording_enabled, record_started_at
+        recording_enabled = False
+        record_started_at = None
+
+    if recording_enabled:
+        start_recording()
+
     def refresh_view() -> None:
-        nonlocal last_update
-        rank_to_proc,
-
+        nonlocal last_update, state, record_session
+        rank_to_proc, _pid_errors = collect_rank_pids(state)
+        candidate = best_selector_from_procs(rank_to_proc.values())
+        if candidate and selector_score(candidate) > selector_score(state.selector):
+            state = dataclasses.replace(state, selector=candidate)
+        snapshots, _stack_errors = collect_stacks(
             state, rank_to_proc, pythonpath, show_threads, install_attempted
         )
+        if recording_enabled and record_session is not None:
+            record_session.record_if_changed(state, rank_to_proc, snapshots)
         stacks_text: Dict[int, Text] = {}
-        stack_lines_by_rank = {
+        stack_lines_by_rank = {
+            rank: extract_stack_lines(snapshot.stack_lines)
+            for rank, snapshot in snapshots.items()
+        }
         prefix_len = common_prefix_length(stack_lines_by_rank)
         diff_index = None
         if any(stack_lines_by_rank.values()):
-            if prefix_len > 0
-
-
-            diff_index = 0
-        for rank, lines in stacks.items():
+            diff_index = max(0, prefix_len - 1) if prefix_len > 0 else 0
+        for rank, snapshot in snapshots.items():
+            lines = snapshot.stack_lines
             marked = mark_diff_line(lines, diff_index) if diff_index is not None else lines
             stacks_text[rank] = style_lines(marked)
-
+        details_by_rank = {
+            rank: snapshot.details for rank, snapshot in snapshots.items()
+        }
         last_update = time.strftime("%H:%M:%S")
         width, height = shutil.get_terminal_size((120, 40))
         content_width = max(0, width - 4)
-
+        record_line = None
+        if record_session is not None and recording_enabled:
+            record_line = f"{record_session.log_path} | events {record_session.event_count} | {format_elapsed(record_started_at)}"
+            record_line = shorten(record_line, max(10, content_width - 12))
+        header, header_lines = build_live_header(
+            state, last_update, refresh, record_line, content_width
+        )
         header_height = header_lines + 2
         header_height = max(3, min(header_height, max(3, height - 1)))
         layout["header"].size = header_height
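The recording hook added to `refresh_view` above only produces output when something changed: `record_if_changed` builds a per-rank signature dict and skips the write if it equals the previous one. A small sketch of that signature scheme, mirroring `signature_from_snapshot` from the earlier hunk (the sample py-spy text is invented for illustration):

```python
import hashlib
from typing import Dict, Optional

def rank_signature(py_spy_output: Optional[str], error: Optional[str]) -> str:
    # Errors and missing dumps get literal markers; successful dumps are hashed,
    # so two polls with byte-identical stacks yield the same signature.
    if error:
        return f"error:{error}"
    if not py_spy_output:
        return "missing"
    return hashlib.sha1(py_spy_output.encode("utf-8", errors="ignore")).hexdigest()

previous: Dict[int, str] = {0: rank_signature("Thread 0x1 (idle): ...", None)}
current: Dict[int, str] = {0: rank_signature("Thread 0x1 (idle): ...", None)}
# Equal signature dicts mean the poll is skipped instead of being appended.
assert current == previous
```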
@@ -1390,26 +2131,398 @@ def main(argv: Optional[Sequence[str]] = None) -> int:
                     refresh_view()
                     next_refresh = now + refresh

-
-
-
-
-
-
-
-
-
-
-
-
+                key = read_key(0.1)
+                if key is None:
+                    continue
+                if key == "q":
+                    return 0
+                if key == " ":
+                    next_refresh = 0.0
+                if key == "t":
+                    show_threads = not show_threads
+                    next_refresh = 0.0
+                if key == "d":
+                    show_details = not show_details
+                    next_refresh = 0.0
+                if key == "r":
+                    if recording_enabled:
+                        stop_recording()
+                    else:
+                        start_recording()
+                    next_refresh = 0.0
     except KeyboardInterrupt:
         return 0
     finally:
         termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+        if record_session is not None:
+            record_session.close()
+            if record_session.event_count > 0:
+                print(f"Recording saved to: {record_session.log_path}")

     return 0


+def run_record_batch(args: argparse.Namespace) -> int:
+    pythonpath = args.pythonpath if args.pythonpath is not None else os.environ.get("PYTHONPATH", "")
+    state = detect_state(args)
+    refresh = max(1, args.refresh)
+    record_path = args.out or default_session_path()
+    record_session = RecordSession(record_path, state, refresh, pythonpath)
+    quiet = bool(args.quiet)
+    install_attempted: set = set()
+    start_time = time.time()
+    last_change: Optional[float] = None
+    last_heartbeat = start_time
+    last_divergence_time = 0.0
+    stop_reason = "completed"
+
+    target = state.selector.display or "python"
+    target = shorten(target, 120)
+    print(
+        f"recording start | path={record_session.log_path} | ranks={len(state.ranks)} | "
+        f"refresh={refresh}s | target={target}"
+    )
+
+    try:
+        while True:
+            loop_start = time.time()
+            if not is_pid_alive(state.prte_pid):
+                stop_reason = "prterun-exited"
+                break
+            rank_to_proc, _pid_errors = collect_rank_pids(state)
+            snapshots, _stack_errors = collect_stacks(
+                state, rank_to_proc, pythonpath, False, install_attempted
+            )
+            if record_session.record_if_changed(state, rank_to_proc, snapshots):
+                last_change = time.time()
+            divergence, common_len, max_len = compute_divergence_from_snapshots(state.ranks, snapshots)
+            now = time.time()
+            if not quiet and now - last_heartbeat >= HEARTBEAT_INTERVAL:
+                last_change_age = "never"
+                if last_change is not None:
+                    last_change_age = format_duration(int(now - last_change))
+                elapsed = format_duration(int(now - start_time))
+                print(
+                    f"heartbeat | events={record_session.event_count} | "
+                    f"last_change={last_change_age} | elapsed={elapsed}"
+                )
+                last_heartbeat = now
+            if (
+                not quiet
+                and divergence >= DIVERGENCE_THRESHOLD
+                and now - last_divergence_time >= DIVERGENCE_INTERVAL
+            ):
+                print(
+                    f"divergence | ratio={divergence:.2f} | common={common_len} | max={max_len}"
+                )
+                last_divergence_time = now
+            elapsed = time.time() - loop_start
+            sleep_for = refresh - elapsed
+            if sleep_for > 0:
+                time.sleep(sleep_for)
+    except KeyboardInterrupt:
+        stop_reason = "interrupted"
+    finally:
+        record_session.close()
+        elapsed = format_duration(int(time.time() - start_time))
+        print(
+            f"recording stop | reason={stop_reason} | events={record_session.event_count} | "
+            f"elapsed={elapsed} | path={record_session.log_path}"
+        )
+
+    return 0
+
+
+def run_review(args: argparse.Namespace) -> int:
+    metadata = load_session_metadata(args.path)
+    ranks = [
+        RankInfo(rank=int(item["rank"]), host=str(item["host"]))
+        for item in metadata.get("ranks", [])
+        if "rank" in item and "host" in item
+    ]
+    if not ranks:
+        raise SystemExit("no ranks found in metadata")
+    selector_payload = metadata.get("selector", {}) if isinstance(metadata.get("selector"), dict) else {}
+    selector = ProgramSelector(
+        module=selector_payload.get("module"),
+        script=selector_payload.get("script"),
+        display=selector_payload.get("display", ""),
+    )
+    state = State(
+        prte_pid=int(metadata.get("prte_pid", 0) or 0),
+        rankfile=str(metadata.get("rankfile", "")),
+        ranks=ranks,
+        selector=selector,
+    )
+    events = load_session_events(args.path)
+    if not events:
+        raise SystemExit("no events recorded")
+
+    console = Console()
+    show_threads = False
+    show_details = False
+    levels = [TimelineLevel(0, len(events), selected=0)]
+    max_stack_lens, divergence_ratios, _ = compute_event_metrics(
+        events, ranks, show_threads
+    )
+
+    def handle_sigint(_sig, _frame):
+        raise KeyboardInterrupt
+
+    signal.signal(signal.SIGINT, handle_sigint)
+
+    fd = sys.stdin.fileno()
+    old_settings = termios.tcgetattr(fd)
+    tty.setcbreak(fd)
+
+    layout = Layout()
+    layout.split_column(Layout(name="header", size=HEADER_HEIGHT), Layout(name="body"))
+
+    def refresh_view() -> None:
+        width, height = shutil.get_terminal_size((120, 40))
+        content_width = max(0, width - 4)
+        timeline_lines = render_timeline_lines(levels, max_stack_lens, divergence_ratios, content_width)
+        active_level = levels[-1]
+        if not active_level.buckets:
+            return
+        current_index = active_level.buckets[active_level.selected][0]
+        current_index = max(0, min(current_index, len(events) - 1))
+        event = events[current_index]
+        snapshots = event_snapshots_from_event(event, ranks, show_threads)
+        rank_to_proc = rank_to_proc_from_event(event, ranks)
+        stack_lines_by_rank = {
+            rank: extract_stack_lines(snapshot.stack_lines)
+            for rank, snapshot in snapshots.items()
+        }
+        prefix_len = common_prefix_length(stack_lines_by_rank)
+        diff_index = None
+        if any(stack_lines_by_rank.values()):
+            diff_index = max(0, prefix_len - 1) if prefix_len > 0 else 0
+        stacks_text: Dict[int, Text] = {}
+        for rank, snapshot in snapshots.items():
+            lines = snapshot.stack_lines
+            marked = mark_diff_line(lines, diff_index) if diff_index is not None else lines
+            stacks_text[rank] = style_lines(marked)
+        details_by_rank = {
+            rank: snapshot.details for rank, snapshot in snapshots.items()
+        }
+        event_time = iso_timestamp(event.timestamp)
+        header, header_lines = build_review_header(
+            state,
+            current_index,
+            len(events),
+            event_time,
+            timeline_lines,
+            content_width,
+        )
+        header_height = header_lines + 2
+        header_height = max(3, min(header_height, max(3, height - 1)))
+        layout["header"].size = header_height
+        body_height = max(1, height - header_height)
+        total_columns = len(ranks) + (1 if show_details else 0)
+        column_width = max(1, content_width // max(1, total_columns))
+        inner_width = max(1, column_width - 4)
+        details_text = (
+            build_details_text(ranks, rank_to_proc, details_by_rank, inner_width)
+            if show_details
+            else None
+        )
+        layout["header"].update(
+            Panel(header, padding=(0, 1), border_style=BORDER_STYLE)
+        )
+        layout["body"].update(
+            render_columns(ranks, stacks_text, details_text, body_height, rank_to_proc)
+        )
+
+    try:
+        refresh_view()
+        with Live(layout, console=console, refresh_per_second=10, screen=True):
+            while True:
+                key = read_key(0.1)
+                if key is None:
+                    continue
+                if key == "q":
+                    return 0
+                if key == "t":
+                    show_threads = not show_threads
+                    max_stack_lens, divergence_ratios, _ = compute_event_metrics(
+                        events, ranks, show_threads
+                    )
+                    refresh_view()
+                if key == "d":
+                    show_details = not show_details
+                    refresh_view()
+                if key == "left":
+                    level = levels[-1]
+                    level.selected = max(0, level.selected - 1)
+                    refresh_view()
+                if key == "right":
+                    level = levels[-1]
+                    level.selected = min(max(0, len(level.buckets) - 1), level.selected + 1)
+                    refresh_view()
+                if key == "down":
+                    level = levels[-1]
+                    if not level.buckets:
+                        continue
+                    bucket = level.buckets[level.selected]
+                    if bucket[1] - bucket[0] <= 1:
+                        continue
+                    levels.append(TimelineLevel(bucket[0], bucket[1], selected=0))
+                    refresh_view()
+                if key == "up":
+                    if len(levels) > 1:
+                        levels.pop()
+                        refresh_view()
+    except KeyboardInterrupt:
+        return 0
+    finally:
+        termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+
+    return 0
+
+
+def run_summarize(args: argparse.Namespace) -> int:
+    metadata = load_session_metadata(args.path)
+    events = load_session_events(args.path)
+    ranks = [
+        RankInfo(rank=int(item["rank"]), host=str(item["host"]))
+        for item in metadata.get("ranks", [])
+        if "rank" in item and "host" in item
+    ]
+    if not ranks:
+        raise SystemExit("no ranks found in metadata")
+    if not events:
+        raise SystemExit("no events recorded")
+
+    rank_order = [info.rank for info in ranks]
+    signature_counts: Dict[Tuple[str, ...], int] = {}
+    signature_examples: Dict[Tuple[str, ...], Dict[int, str]] = {}
+    rank_change_counts: Dict[int, int] = {rank: 0 for rank in rank_order}
+    previous_rank_signature: Dict[int, str] = {rank: "" for rank in rank_order}
+    max_stack_lens, divergence_ratios, common_prefixes = compute_event_metrics(
+        events, ranks, show_threads=False
+    )
+
+    for event in events:
+        per_rank_signature: Dict[int, str] = {}
+        per_rank_top_frame: Dict[int, str] = {}
+        for info in ranks:
+            payload = event.ranks.get(info.rank, {})
+            if payload.get("error"):
+                signature = f"error:{payload.get('error')}"
+                top_frame = signature
+            else:
+                output = payload.get("py_spy")
+                if output:
+                    lines, _details = render_pyspy_output(str(output), show_threads=False)
+                    stack_lines = extract_stack_lines(lines)
+                    signature = hashlib.sha1(
+                        "\n".join(stack_lines).encode("utf-8", errors="ignore")
+                    ).hexdigest()
+                    top_frame = stack_lines[0].strip() if stack_lines else "empty"
+                else:
+                    signature = "empty"
+                    top_frame = "empty"
+            per_rank_signature[info.rank] = signature
+            per_rank_top_frame[info.rank] = top_frame
+
+        for rank, signature in per_rank_signature.items():
+            if previous_rank_signature.get(rank) != signature:
+                rank_change_counts[rank] = rank_change_counts.get(rank, 0) + 1
+            previous_rank_signature[rank] = signature
+
+        signature_key = tuple(per_rank_signature[rank] for rank in rank_order)
+        signature_counts[signature_key] = signature_counts.get(signature_key, 0) + 1
+        if signature_key not in signature_examples:
+            signature_examples[signature_key] = per_rank_top_frame
+
+    sorted_signatures = sorted(
+        signature_counts.items(), key=lambda item: item[1], reverse=True
+    )
+    top_signatures = sorted_signatures[: max(1, args.top)]
+    total_events = len(events)
+    start_time = iso_timestamp(events[0].timestamp)
+    end_time = iso_timestamp(events[-1].timestamp)
+
+    if args.format == "json":
+        payload = {
+            "metadata": metadata,
+            "event_count": total_events,
+            "time_range": {"start": start_time, "end": end_time},
+            "rank_change_counts": rank_change_counts,
+            "top_signatures": [
+                {
+                    "count": count,
+                    "ratio": count / float(total_events),
+                    "example_top_frames": signature_examples.get(signature_key, {}),
+                }
+                for signature_key, count in top_signatures
+            ],
+            "most_divergent": sorted(
+                [
+                    {
+                        "index": idx,
+                        "timestamp": iso_timestamp(events[idx].timestamp),
+                        "divergence_ratio": divergence_ratios[idx],
+                        "common_prefix_len": common_prefixes[idx],
+                        "max_stack_len": max_stack_lens[idx],
+                    }
+                    for idx in range(total_events)
+                ],
+                key=lambda item: item["divergence_ratio"],
+                reverse=True,
+            )[:5],
+        }
+        print(json.dumps(payload, indent=2, sort_keys=True))
+        return 0
+
+    print(f"Session: {args.path}")
+    print(f"Events: {total_events} ({start_time} -> {end_time})")
+    print(f"Ranks: {', '.join(str(rank) for rank in rank_order)}")
+    print("")
+    print("Top stack signatures:")
+    for idx, (signature_key, count) in enumerate(top_signatures, start=1):
+        ratio = count / float(total_events)
+        print(f"{idx}. {count} events ({ratio:.1%})")
+        example = signature_examples.get(signature_key, {})
+        for rank in rank_order:
+            frame = example.get(rank, "")
+            frame = shorten(frame, 120)
+            print(f"  rank {rank}: {frame}")
+    print("")
+    print("Rank change counts:")
+    for rank in rank_order:
+        print(f"  rank {rank}: {rank_change_counts.get(rank, 0)}")
+    print("")
+    print("Most divergent events:")
+    divergent = sorted(
+        range(total_events),
+        key=lambda idx: divergence_ratios[idx],
+        reverse=True,
+    )[:5]
+    for idx in divergent:
+        print(
+            f"  #{idx + 1} @ {iso_timestamp(events[idx].timestamp)} | "
+            f"ratio {divergence_ratios[idx]:.2f} | "
+            f"common {common_prefixes[idx]} | "
+            f"max {max_stack_lens[idx]}"
+        )
+    return 0
+
+
+def main(argv: Optional[Sequence[str]] = None) -> int:
+    argv = list(argv) if argv is not None else sys.argv[1:]
+    if argv and argv[0] in {"review", "summarize", "record"}:
+        command = argv[0]
+        sub_args = argv[1:]
+        if command == "review":
+            return run_review(parse_review_args(sub_args))
+        if command == "record":
+            return run_record_batch(parse_record_args(sub_args))
+        return run_summarize(parse_summarize_args(sub_args))
+    return run_live(parse_live_args(argv))
+
+
 def select_with_timeout(timeout: float):
     import select

mpiptop-0.1.0.dist-info/RECORD
DELETED

@@ -1,7 +0,0 @@
-mpiptop.py,sha256=nz8YMs0j54vfxDr0bfjwON76QWZ_QAtR2rp0VkuhrFM,45815
-mpiptop-0.1.0.dist-info/licenses/LICENSE,sha256=ChKmQ8qCXxdXRR_HIJECjIA5NLWlUTEJWh7Xkhm2wAA,1069
-mpiptop-0.1.0.dist-info/METADATA,sha256=VauVpnkAiQokz6X5OXncDhygTbeJPrU_9rOTFya_oMQ,1477
-mpiptop-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mpiptop-0.1.0.dist-info/entry_points.txt,sha256=RsGsr8GBLfUNpb432YWS5gz4MWfWdK9xJRr1SmdnLo8,41
-mpiptop-0.1.0.dist-info/top_level.txt,sha256=c2Vdu6tTg0DEPUWD8Odyods7fXsPWMQ2kSvjdKiTClc,8
-mpiptop-0.1.0.dist-info/RECORD,,

{mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/entry_points.txt
File without changes

{mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/licenses/LICENSE
File without changes

{mpiptop-0.1.0.dist-info → mpiptop-0.2.0.dist-info}/top_level.txt
File without changes