wafer-core 0.1.29-py3-none-any.whl → 0.1.30-py3-none-any.whl

wafer_core/lib/trace_compare/analyzer.py

@@ -145,7 +145,17 @@ def analyze_traces_from_loaded(
     trace2_total = trace2_agg["total_us"] / 1000
     trace1_count = int(trace1_agg["count"])
     trace2_count = int(trace2_agg["count"])
-    ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
+    # Speedup: ratio of total times (not per-call averages)
+    # Shows how many times faster/slower trace1 is compared to trace2
+    # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
+    # Using total time instead of avg time per call because operations may have
+    # vastly different call counts (e.g., fused vs unfused operations)
+    if trace2_total > 0:
+        ratio = trace1_total / trace2_total
+    elif trace1_total > 0:
+        ratio = float("inf")  # trace2 has no time, trace1 is infinitely slower
+    else:
+        ratio = 1.0  # Both are zero
     gap_ms = trace1_total - trace2_total
 
     trace1_pattern = list(
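
The comment block above is the heart of this change: a per-call average ratio rewards whichever trace splits its work into more launches, while a total-time ratio compares actual wall-clock cost. A minimal standalone sketch of the difference, using made-up numbers in the same total_us/count shape as the trace1_agg/trace2_agg aggregates:

    # Hypothetical aggregates: trace1 runs one fused kernel,
    # trace2 runs the unfused equivalent as 4 separate launches.
    trace1_agg = {"total_us": 900.0, "count": 1}   # 900 us in 1 call
    trace2_agg = {"total_us": 1200.0, "count": 4}  # 1200 us across 4 calls

    trace1_avg = trace1_agg["total_us"] / trace1_agg["count"]  # 900 us per call
    trace2_avg = trace2_agg["total_us"] / trace2_agg["count"]  # 300 us per call

    print(trace1_avg / trace2_avg)  # 3.0 -- old metric: trace1 looks 3x slower
    print(trace1_agg["total_us"] / trace2_agg["total_us"])  # 0.75 -- trace1 is faster

On the new metric the fused trace correctly comes out ahead (0.75 < 1.0), where the old per-call metric reported it as 3x slower.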
@@ -446,6 +456,11 @@ def analyze_traces_aligned(
     )
     same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
 
+    # Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
+    # nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
+    # The variable names are misleading but trace1_* should use amd_kernels,
+    # and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
+
     return {
         "metadata": {
             "amd_gpu": amd_trace.gpu_name,
@@ -462,10 +477,10 @@ def analyze_traces_aligned(
             "trace2_platform": trace2.platform,
             "trace2_gpu": trace2.gpu_name,
             "trace2_device": trace2.device_props,
-            "trace1_kernels": len(amd_trace.kernel_events),
-            "trace2_kernels": len(nvidia_trace.kernel_events),
-            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_trace.kernel_events) / 1000,
-            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_trace.kernel_events) / 1000,
+            "trace1_kernels": len(amd_kernels),
+            "trace2_kernels": len(nvidia_kernels),
+            "trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
+            "trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
             "phase": phase_filter,
             "trace1_layers": alignment.num_layers,
             "trace2_layers": alignment.num_layers,
@@ -579,7 +594,17 @@ def analyze_traces_aligned(
     trace2_total = trace2_agg["total_us"] / 1000
     trace1_count = int(trace1_agg["count"])
     trace2_count = int(trace2_agg["count"])
-    ratio = trace1_avg / trace2_avg if trace2_avg > 0 else 1
+    # Speedup: ratio of total times (not per-call averages)
+    # Shows how many times faster/slower trace1 is compared to trace2
+    # > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
+    # Using total time instead of avg time per call because operations may have
+    # vastly different call counts (e.g., fused vs unfused operations)
+    if trace2_total > 0:
+        ratio = trace1_total / trace2_total
+    elif trace1_total > 0:
+        ratio = float("inf")  # trace2 has no time, trace1 is infinitely slower
+    else:
+        ratio = 1.0  # Both are zero
     gap_ms = trace1_total - trace2_total
 
     trace1_pattern = list(
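
The same guard appears here in analyze_traces_aligned. Pulled out in isolation (as a hypothetical helper, not a function in the package), the three branches behave like this:

    def total_time_ratio(trace1_total, trace2_total):
        """Mirror of the guarded ratio above (both totals in ms)."""
        if trace2_total > 0:
            return trace1_total / trace2_total
        elif trace1_total > 0:
            return float("inf")  # trace2 recorded no time at all
        return 1.0  # both traces are empty

    print(total_time_ratio(5.0, 2.0))  # 2.5  -- trace1 is 2.5x slower
    print(total_time_ratio(1.0, 4.0))  # 0.25 -- trace1 is 4x faster
    print(total_time_ratio(5.0, 0.0))  # inf
    print(total_time_ratio(0.0, 0.0))  # 1.0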
wafer_core-0.1.29.dist-info/METADATA → wafer_core-0.1.30.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: wafer-core
-Version: 0.1.29
+Version: 0.1.30
 Summary: Core utilities and environments for Wafer GPU kernel optimization
 Requires-Python: >=3.10
 Requires-Dist: aiohttp>=3.9.0
wafer_core-0.1.29.dist-info/RECORD → wafer_core-0.1.30.dist-info/RECORD

@@ -321,7 +321,7 @@ wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h
 wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
 wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
 wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
-wafer_core/lib/trace_compare/analyzer.py,sha256=YkuOPA3HFX_7mNUEhE9CMOtEMGLQd12lvUkvqqeQF14,29698
+wafer_core/lib/trace_compare/analyzer.py,sha256=Ou_gooG027YVuYVF5oddAkMsObXrrPQLBPHUzSMA4Vg,31078
 wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
 wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
 wafer_core/lib/trace_compare/classifier.py,sha256=cYAmDW8S75N6cE3mJNZM-UKCJSX7rFP-8klVrukBvNQ,17504
@@ -697,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
 wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
 wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
 wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
-wafer_core-0.1.29.dist-info/METADATA,sha256=Qjyx92KhI1joutpM8lF0G1zgPou-d8CdzWI80QQqKYg,1477
-wafer_core-0.1.29.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-wafer_core-0.1.29.dist-info/RECORD,,
+wafer_core-0.1.30.dist-info/METADATA,sha256=YuF3VyyP3tvmv2S-7E8epi1J2_1e2yXJfapS1uGQ0Zs,1477
+wafer_core-0.1.30.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+wafer_core-0.1.30.dist-info/RECORD,,