wafer-cli 0.2.29-py3-none-any.whl → 0.2.31-py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
wafer/baseline.py ADDED
@@ -0,0 +1,661 @@
+ """Baseline CLI commands.
+
+ Discover what kernel PyTorch dispatches to for a given operation.
+ Helps understand the baseline performance you need to beat.
+ """
+
+ import asyncio
+
+ import typer
+
+ from wafer_core.tools.dispatch_baseline.client import (
+     lookup_baseline,
+     store_baseline,
+ )
+ from wafer_core.tools.dispatch_baseline.codegen import (
+     parse_op_string,
+     update_dtypes,
+     update_shapes,
+ )
+ from wafer_core.tools.dispatch_baseline.dtypes import KernelTraceConfig
+ from wafer_core.tools.dispatch_baseline.executor import trace_kernel_local
+ from wafer_core.tools.dispatch_baseline.roofline import HARDWARE_SPECS, get_hardware_spec
+
+ baseline_app = typer.Typer(
+     help="""Discover what kernel PyTorch dispatches to for a given operation.
+
+ This helps you understand the baseline performance you need to beat when writing
+ custom kernels. Run a PyTorch op, profile it, and see:
+ - What kernel PyTorch uses (cuBLAS, cuDNN, Triton, etc.)
+ - How fast it runs
+ - What % of peak hardware performance it achieves
+
+ Results are stored in a shared database - once traced, everyone benefits.
+
+ Examples:
+     # Run baseline trace
+     wafer baseline run "torch.matmul(A, B)" -s A=4096,4096 -s B=4096,4096 --target b200-dev
+
+     # Show supported hardware
+     wafer baseline hardware"""
+ )
+
+
+ def _parse_shape(shape_str: str) -> tuple[str, tuple[int, ...]]:
+     """Parse shape string like 'A=4096,4096' into (name, shape)."""
+     if "=" not in shape_str:
+         raise typer.BadParameter(f"Invalid shape format: {shape_str}. Expected: name=dim1,dim2,...")
+
+     name, dims_str = shape_str.split("=", 1)
+     try:
+         dims = tuple(int(d.strip()) for d in dims_str.split(","))
+     except ValueError:
+         raise typer.BadParameter(f"Invalid dimensions in shape: {dims_str}")
+
+     return name.strip(), dims
+
+
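+ # Illustrative behavior of _parse_shape (it splits on the first '=' and parses
+ # comma-separated ints; derived from the implementation above):
+ #
+ #   _parse_shape("A=4096,4096")  -> ("A", (4096, 4096))
+ #   _parse_shape("bias=128")     -> ("bias", (128,))
+
+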
+ def _complete_target_name(incomplete: str) -> list[str]:
+     """Autocomplete target names from ~/.wafer/targets/*.toml"""
+     from pathlib import Path
+
+     targets_dir = Path.home() / ".wafer" / "targets"
+     if not targets_dir.exists():
+         return []
+     return [f.stem for f in targets_dir.glob("*.toml") if f.stem.startswith(incomplete)]
+
+
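+ # For example (hypothetical files): with b200-dev.toml and b200-prod.toml in
+ # ~/.wafer/targets, _complete_target_name("b2") -> ["b200-dev", "b200-prod"].
+
+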
+ @baseline_app.command("run")
+ def baseline_run_cmd(
+     op: str = typer.Argument(
+         ...,
+         help='PyTorch operation to trace, e.g., "torch.matmul(A, B)"',
+     ),
+     shape: list[str] = typer.Option(
+         [],
+         "--shape",
+         "-s",
+         help="Tensor shape as name=dim1,dim2,... (can specify multiple)",
+     ),
+     dtype: str = typer.Option(
+         "float16",
+         "--dtype",
+         "-d",
+         help="Data type for tensors (float16, float32, bfloat16, etc.)",
+     ),
+     hardware: str | None = typer.Option(
+         None,
+         "--hardware",
+         help="Hardware name for roofline analysis (auto-detected from target if not specified)",
+     ),
+     target: str | None = typer.Option(
+         None,
+         "--target",
+         "-t",
+         help="GPU target name (see 'wafer config targets list')",
+         autocompletion=_complete_target_name,
+     ),
+     workspace: str | None = typer.Option(
+         None,
+         "--workspace",
+         "-w",
+         help="Workspace name (see 'wafer workspaces list')",
+     ),
+     num_warmup: int = typer.Option(
+         10,
+         "--warmup",
+         help="Number of warmup iterations",
+     ),
+     num_runs: int = typer.Option(
+         100,
+         "--runs",
+         help="Number of profiling runs",
+     ),
+     no_cache: bool = typer.Option(
+         False,
+         "--no-cache",
+         help="Skip cache and always run fresh trace",
+     ),
+     json_output: bool = typer.Option(
+         False,
+         "--json",
+         help="Output as JSON for programmatic use",
+     ),
+     verbose: bool = typer.Option(
+         False,
+         "--verbose",
+         "-v",
+         help="Show verbose output including raw profiler data",
+     ),
+     timeout: int = typer.Option(
+         120,
+         "--timeout",
+         help="Timeout in seconds for profiling (default: 120)",
+     ),
+ ) -> None:
+     """Discover what kernel PyTorch dispatches to for a given operation.
+
+     This runs the operation on your GPU with profiling and reports:
+     - Which kernel(s) PyTorch dispatches to
+     - Duration of each kernel
+     - Library that provides the kernel (cuBLAS, cuDNN, etc.)
+     - Roofline analysis (% of peak compute/memory bandwidth)
+
+     Examples:
+         # Run on a target
+         wafer baseline run "torch.matmul(A, B)" -s A=4096,4096 -s B=4096,4096 --target b200-dev
+
+         # Run on a workspace
+         wafer baseline run "torch.matmul(A, B)" -s A=4096,4096 -s B=4096,4096 --workspace cutlass-b200-eval
+
+         # Run locally (requires local GPU)
+         wafer baseline run "torch.matmul(A, B)" -s A=4096,4096 -s B=4096,4096
+
+         # With specific hardware for roofline
+         wafer baseline run "torch.matmul(A, B)" -s A=4096,4096 -s B=4096,4096 --target b200-dev --hardware B200
+     """
+     # Validate mutually exclusive options
+     if target and workspace:
+         typer.echo("Error: Cannot specify both --target and --workspace", err=True)
+         raise typer.Exit(1)
+
+     # Dispatch to appropriate execution mode
+     if target:
+         asyncio.run(_run_on_target(
+             op, shape, dtype, hardware, target, num_warmup, num_runs, no_cache, json_output, verbose, timeout
+         ))
+     elif workspace:
+         asyncio.run(_run_on_workspace(
+             op, shape, dtype, hardware, workspace, num_warmup, num_runs, no_cache, json_output, verbose, timeout
+         ))
+     else:
+         _run_locally(op, shape, dtype, hardware, num_warmup, num_runs, no_cache, json_output, verbose, timeout)
+
+
+ def _run_locally(
+     op: str,
+     shape: list[str],
+     dtype: str,
+     hardware: str | None,
+     num_warmup: int,
+     num_runs: int,
+     no_cache: bool,
+     json_output: bool,
+     verbose: bool,
+     timeout: int,
+ ) -> None:
+     """Run baseline trace on local GPU."""
+     import torch
+
+     # Check CUDA availability
+     if not torch.cuda.is_available():
+         typer.echo("Error: CUDA not available on this machine", err=True)
+         typer.echo("Use --target or --workspace to run on a remote GPU.", err=True)
+         raise typer.Exit(1)
+
+     # Auto-detect hardware if not specified
+     if hardware is None:
+         hardware = _detect_local_hardware()
+         if hardware:
+             if not json_output:
+                 typer.echo(f"Auto-detected hardware: {hardware}")
+         else:
+             gpu_name = torch.cuda.get_device_name(0)
+             if not json_output:
+                 typer.echo(f"Warning: No roofline specs for '{gpu_name}'", err=True)
+                 typer.echo(f"Supported hardware: {', '.join(HARDWARE_SPECS.keys())}", err=True)
+                 typer.echo("Roofline analysis will be skipped.", err=True)
+                 typer.echo("")
+
+     # Parse operation
+     try:
+         op_spec = parse_op_string(op)
+     except ValueError as e:
+         typer.echo(f"Error parsing operation: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Parse shapes
+     shapes: dict[str, tuple[int, ...]] = {}
+     for shape_str in shape:
+         try:
+             name, dims = _parse_shape(shape_str)
+             shapes[name] = dims
+         except typer.BadParameter as e:
+             typer.echo(f"Error: {e}", err=True)
+             raise typer.Exit(1)
+
+     # Update op_spec with shapes and dtype
+     if shapes:
+         op_spec = update_shapes(op_spec, shapes)
+     op_spec = update_dtypes(op_spec, dtype)
+
+     # Validate hardware (auto-detection failures were already warned about above)
+     if hardware is not None and get_hardware_spec(hardware) is None:
+         typer.echo(f"Warning: Unknown hardware '{hardware}', roofline analysis will be skipped", err=True)
+         typer.echo(f"Supported hardware: {', '.join(HARDWARE_SPECS.keys())}", err=True)
+
+     # Get current environment for cache lookup
+     pytorch_version = torch.__version__
+     props = torch.cuda.get_device_properties(0)
+
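+     # Illustrative values for what the block below produces: on CUDA,
+     # runtime_version might be "12.8" with gpu_arch "sm_100" (B200); on ROCm,
+     # "6.2" with gpu_arch "gfx942" (MI300X).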
+     # Detect runtime version and architecture (CUDA vs ROCm)
+     if hasattr(torch.version, 'hip') and torch.version.hip:
+         runtime_version = torch.version.hip
+         gpu_arch = getattr(props, 'gcnArchName', f"gfx{props.major}{props.minor}")
+     else:
+         runtime_version = torch.version.cuda or "unknown"
+         gpu_arch = f"sm_{props.major}{props.minor}"
+
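+     # Cache entries are keyed on the op spec plus this environment fingerprint
+     # (hardware, PyTorch version, runtime version, GPU arch), so changing any
+     # of these triggers a fresh trace.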
+     # Check cache first (unless --no-cache)
+     from_cache = False
+     if not no_cache:
+         cached = lookup_baseline(op_spec, hardware, pytorch_version, runtime_version, gpu_arch)
+         if cached is not None:
+             from_cache = True
+             # Re-compute roofline with current hardware specs (in case they've been updated)
+             config = KernelTraceConfig(op_spec=op_spec, hardware=hardware, num_warmup=0, num_runs=0)
+             from wafer_core.tools.dispatch_baseline.executor import _add_roofline_analysis
+             result = _add_roofline_analysis(cached, config)
+             if not json_output:
+                 typer.echo(f"Using cached result (key: {pytorch_version}/{runtime_version}/{gpu_arch})")
+                 typer.echo("")
+
+     if not from_cache:
+         # Create config
+         config = KernelTraceConfig(
+             op_spec=op_spec,
+             hardware=hardware,
+             num_warmup=num_warmup,
+             num_runs=num_runs,
+             timeout_seconds=timeout,
+         )
+
+         # Run trace
+         if not json_output:
+             typer.echo(f"Profiling: {op_spec}")
+             typer.echo(f"Hardware: {hardware}")
+             typer.echo("")
+
+         exec_result = trace_kernel_local(config)
+         result = exec_result.result
+
+         # Cache the result
+         if not result.error:
+             store_baseline(
+                 result,
+                 exec_result.pytorch_version,
+                 exec_result.runtime_version,
+                 exec_result.gpu_arch,
+             )
+
+     # Output results
+     _output_result(result, json_output, verbose, from_cache)
+
+
+ def _detect_local_hardware() -> str | None:
+     """Detect GPU hardware name from local CUDA device.
+
+     Only returns hardware names that we have specs for (B200, MI300X).
+     Returns None for unsupported hardware.
+     """
+     import torch
+
+     if not torch.cuda.is_available():
+         return None
+
+     gpu_name = torch.cuda.get_device_name(0).upper()
+
+     # Only return hardware we have roofline specs for
+     if "B200" in gpu_name:
+         return "B200"
+     elif "MI300X" in gpu_name:
+         return "MI300X"
+     else:
+         return None  # Unsupported hardware
+
+
+ def _detect_hardware_from_target(target_config) -> str | None:
+     """Detect hardware from target configuration.
+
+     Only returns hardware names that we have specs for (B200, MI300X).
+     """
+     gpu_type = getattr(target_config, "gpu_type", None)
+     if gpu_type:
+         gpu_upper = gpu_type.upper()
+         if gpu_upper in HARDWARE_SPECS:
+             return gpu_upper
+     return None
+
+
+ async def _run_on_target(
+     op: str,
+     shape: list[str],
+     dtype: str,
+     hardware: str | None,
+     target_name: str,
+     num_warmup: int,
+     num_runs: int,
+     no_cache: bool,
+     json_output: bool,
+     verbose: bool,
+     timeout: int,
+ ) -> None:
+     """Run baseline trace on a configured target via SSH."""
+     from wafer_core.ssh import SSHClient
+     from wafer_core.tools.dispatch_baseline.executor import trace_kernel_remote
+
+     from .targets import load_target
+     from .targets_ops import TargetExecError, get_target_ssh_info
+
+     # Load target config
+     try:
+         target_config = load_target(target_name)
+     except FileNotFoundError:
+         typer.echo(f"Error: Target '{target_name}' not found", err=True)
+         typer.echo("Run 'wafer config targets list' to see available targets", err=True)
+         raise typer.Exit(1)
+
+     # Auto-detect hardware from target if not specified
+     if hardware is None:
+         hardware = _detect_hardware_from_target(target_config)
+         if hardware:
+             if not json_output:
+                 typer.echo(f"Auto-detected hardware from target: {hardware}")
+         else:
+             if not json_output:
+                 typer.echo("Warning: No roofline specs for target's GPU", err=True)
+                 typer.echo(f"Supported hardware: {', '.join(HARDWARE_SPECS.keys())}", err=True)
+                 typer.echo("Roofline analysis will be skipped.", err=True)
+                 typer.echo("")
+
+     # Get SSH info
+     try:
+         ssh_info = await get_target_ssh_info(target_config)
+     except TargetExecError as e:
+         typer.echo(f"Error: {e}", err=True)
+         raise typer.Exit(1)
+
+     # Parse operation and create config
+     try:
+         op_spec = parse_op_string(op)
+     except ValueError as e:
+         typer.echo(f"Error parsing operation: {e}", err=True)
+         raise typer.Exit(1)
+
+     shapes: dict[str, tuple[int, ...]] = {}
+     for shape_str in shape:
+         try:
+             name, dims = _parse_shape(shape_str)
+             shapes[name] = dims
+         except typer.BadParameter as e:
+             typer.echo(f"Error: {e}", err=True)
+             raise typer.Exit(1)
+
+     if shapes:
+         op_spec = update_shapes(op_spec, shapes)
+     op_spec = update_dtypes(op_spec, dtype)
+
+     config = KernelTraceConfig(
+         op_spec=op_spec,
+         hardware=hardware,
+         num_warmup=num_warmup,
+         num_runs=num_runs,
+         timeout_seconds=timeout,
+     )
+
+     if not json_output:
+         typer.echo(f"Profiling: {op_spec}")
+         typer.echo(f"Target: {target_name}")
+         typer.echo(f"Hardware: {hardware}")
+         typer.echo("")
+
+     # Create SSH client and run trace
+     ssh_client = SSHClient(
+         host=ssh_info.host,
+         port=ssh_info.port,
+         username=ssh_info.user,
+         key_path=str(ssh_info.key_path),
+     )
+
+     try:
+         ssh_client.connect()
+         exec_result = trace_kernel_remote(config, ssh_client)
+         result = exec_result.result
+
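+         # Note: unlike the local path, remote runs never consult the cache
+         # first; the shared cache is only written back here.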
+         # Cache the result
+         if not result.error and not no_cache:
+             store_baseline(
+                 result,
+                 exec_result.pytorch_version,
+                 exec_result.runtime_version,
+                 exec_result.gpu_arch,
+             )
+     finally:
+         ssh_client.close()
+
+     _output_result(result, json_output, verbose, from_cache=False)
+
+
+ async def _run_on_workspace(
+     op: str,
+     shape: list[str],
+     dtype: str,
+     hardware: str | None,
+     workspace_name: str,
+     num_warmup: int,
+     num_runs: int,
+     no_cache: bool,
+     json_output: bool,
+     verbose: bool,
+     timeout: int,
+ ) -> None:
+     """Run baseline trace on a workspace."""
+     import subprocess
+     import tempfile
+     from pathlib import Path
+
+     from wafer_core.tools.dispatch_baseline.analyzer import parse_trace_output
+     from wafer_core.tools.dispatch_baseline.codegen import generate_trace_script
+     from wafer_core.tools.dispatch_baseline.executor import _add_roofline_analysis
+
+     # Parse operation and create config
+     try:
+         op_spec = parse_op_string(op)
+     except ValueError as e:
+         typer.echo(f"Error parsing operation: {e}", err=True)
+         raise typer.Exit(1)
+
+     shapes: dict[str, tuple[int, ...]] = {}
+     for shape_str in shape:
+         try:
+             name, dims = _parse_shape(shape_str)
+             shapes[name] = dims
+         except typer.BadParameter as e:
+             typer.echo(f"Error: {e}", err=True)
+             raise typer.Exit(1)
+
+     if shapes:
+         op_spec = update_shapes(op_spec, shapes)
+     op_spec = update_dtypes(op_spec, dtype)
+
+     # Default hardware for workspaces (can be overridden)
+     if hardware is None:
+         # Try to detect from workspace name (only supported hardware)
+         ws_lower = workspace_name.lower()
+         if "b200" in ws_lower:
+             hardware = "B200"
+         elif "mi300" in ws_lower:
+             hardware = "MI300X"
+
+         if hardware:
+             if not json_output:
+                 typer.echo(f"Auto-detected hardware from workspace name: {hardware}")
+         else:
+             if not json_output:
+                 typer.echo(f"Warning: Could not detect hardware from workspace name '{workspace_name}'", err=True)
+                 typer.echo(f"Supported hardware: {', '.join(HARDWARE_SPECS.keys())}", err=True)
+                 typer.echo("Roofline analysis will be skipped.", err=True)
+                 typer.echo("")
+
+     config = KernelTraceConfig(
+         op_spec=op_spec,
+         hardware=hardware,
+         num_warmup=num_warmup,
+         num_runs=num_runs,
+     )
+
+     if not json_output:
+         typer.echo(f"Profiling: {op_spec}")
+         typer.echo(f"Workspace: {workspace_name}")
+         typer.echo(f"Hardware: {hardware}")
+         typer.echo("")
+
+     # Generate script
+     script = generate_trace_script(config)
+
+     # Write to temp file and sync to workspace
+     with tempfile.TemporaryDirectory() as tmpdir:
+         script_path = Path(tmpdir) / "baseline_trace.py"
+         script_path.write_text(script)
+
+         # Sync to workspace using wafer CLI
+         sync_result = subprocess.run(
+             ["wafer", "workspaces", "sync", workspace_name, str(tmpdir)],
+             capture_output=True,
+             text=True,
+         )
+         if sync_result.returncode != 0:
+             typer.echo(f"Error syncing to workspace: {sync_result.stderr}", err=True)
+             raise typer.Exit(1)
+
+         # Execute on workspace
+         exec_result = subprocess.run(
+             ["wafer", "workspaces", "exec", "--timeout", str(timeout), workspace_name,
+              "python /workspace/baseline_trace.py"],
+             capture_output=True,
+             text=True,
+         )
+
+     output = exec_result.stdout + exec_result.stderr
+
+     # Parse result
+     parsed = parse_trace_output(output, op_spec, hardware)
+     result = _add_roofline_analysis(parsed.result, config)
+
+     # Cache the result
+     if not result.error and not no_cache:
+         store_baseline(
+             result,
+             parsed.pytorch_version,
+             parsed.runtime_version,
+             parsed.gpu_arch,
+         )
+
+     _output_result(result, json_output, verbose, from_cache=False)
+
+
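+ # Shape of the --json payload emitted below (illustrative values):
+ # {
+ #   "op": "torch.matmul(A, B)",
+ #   "hardware": "B200",
+ #   "total_duration_us": 123.4,
+ #   "from_cache": false,
+ #   "kernels": [{"name": "...", "duration_us": 123.4}],
+ #   "primary_kernel": {"name": "...", "duration_us": 123.4},
+ #   "roofline": {"achieved_tflops": 900.0, "achieved_memory_bw_tbps": 1.2,
+ #                "compute_pct_of_peak": 50.0, "memory_bw_pct_of_peak": 15.0,
+ #                "bottleneck": "compute"},
+ #   "error": null
+ # }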
+ def _output_result(result, json_output: bool, verbose: bool, from_cache: bool = False) -> None:
+     """Output trace result in the requested format."""
+     if json_output:
+         import json
+
+         output = {
+             "op": str(result.op_spec),
+             "hardware": result.hardware,
+             "total_duration_us": result.total_duration_us,
+             "from_cache": from_cache,
+             "kernels": [
+                 {
+                     "name": k.name,
+                     "duration_us": k.duration_us,
+                 }
+                 for k in result.kernels
+             ],
+             "primary_kernel": {
+                 "name": result.primary_kernel.name,
+                 "duration_us": result.primary_kernel.duration_us,
+             }
+             if result.primary_kernel
+             else None,
+             "roofline": {
+                 "achieved_tflops": result.roofline.achieved_tflops,
+                 "achieved_memory_bw_tbps": result.roofline.achieved_memory_bw_tbps,
+                 "compute_pct_of_peak": result.roofline.compute_pct_of_peak,
+                 "memory_bw_pct_of_peak": result.roofline.memory_bw_pct_of_peak,
+                 "bottleneck": result.roofline.bottleneck,
+             }
+             if result.roofline
+             else None,
+             "error": result.error,
+         }
+         typer.echo(json.dumps(output, indent=2))
+     else:
+         if result.error:
+             typer.echo(f"Error: {result.error}", err=True)
+             if verbose and result.raw_output:
+                 typer.echo("\nRaw output:")
+                 typer.echo(result.raw_output)
+             raise typer.Exit(1)
+
+         if from_cache:
+             typer.echo("(from cache)")
+             typer.echo("")
+
+         typer.echo(result.summary())
+
+         if verbose and result.raw_output:
+             typer.echo("\n--- Raw Profiler Output ---")
+             typer.echo(result.raw_output)
+
+
+ @baseline_app.command("hardware")
+ def hardware_cmd(
+     json_output: bool = typer.Option(
+         False,
+         "--json",
+         help="Output as JSON",
+     ),
+ ) -> None:
+     """List supported hardware and their specifications.
+
+     Shows peak FLOPS and memory bandwidth for each supported GPU,
+     used for roofline analysis calculations.
+
+     Examples:
+         wafer baseline hardware
+         wafer baseline hardware --json
+     """
+     if json_output:
+         import json
+
+         output = {
+             name: {
+                 "peak_fp16_tflops": spec.peak_fp16_tflops,
+                 "peak_fp32_tflops": spec.peak_fp32_tflops,
+                 "peak_memory_bw_tbps": spec.peak_memory_bw_tbps,
+                 "peak_fp8_tflops": spec.peak_fp8_tflops,
+                 "peak_int8_tops": spec.peak_int8_tops,
+             }
+             for name, spec in HARDWARE_SPECS.items()
+         }
+         typer.echo(json.dumps(output, indent=2))
+     else:
+         typer.echo("Supported Hardware for Roofline Analysis")
+         typer.echo("=" * 60)
+         typer.echo("")
+         typer.echo(f"{'Name':<12} {'FP16 TFLOPS':<14} {'FP32 TFLOPS':<14} {'Mem BW (TB/s)':<14}")
+         typer.echo("-" * 60)
+
+         for name, spec in sorted(HARDWARE_SPECS.items()):
+             typer.echo(
+                 f"{name:<12} {spec.peak_fp16_tflops:<14.1f} {spec.peak_fp32_tflops:<14.1f} {spec.peak_memory_bw_tbps:<14.2f}"
+             )
+
+         typer.echo("")
+         typer.echo("Note: FP16 TFLOPS shown without sparsity for most GPUs.")
+         typer.echo("Use --json for complete specifications.")
+