wafer-core 0.1.29__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -145,7 +145,17 @@ def analyze_traces_from_loaded(
|
|
|
145
145
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
146
146
|
trace1_count = int(trace1_agg["count"])
|
|
147
147
|
trace2_count = int(trace2_agg["count"])
|
|
148
|
-
|
|
148
|
+
# Speedup ratio: trace1 total time / trace2 total time (totals, not per-call averages)
|
|
149
|
+
# Shows how many times faster/slower trace1 is compared to trace2
|
|
150
|
+
# > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
|
|
151
|
+
# Using total time instead of avg time per call because operations may have
|
|
152
|
+
# vastly different call counts (e.g., fused vs unfused operations)
|
|
153
|
+
if trace2_total > 0:
|
|
154
|
+
ratio = trace1_total / trace2_total
|
|
155
|
+
elif trace1_total > 0:
|
|
156
|
+
ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
|
|
157
|
+
else:
|
|
158
|
+
ratio = 1.0 # Both are zero
|
|
149
159
|
gap_ms = trace1_total - trace2_total
|
|
150
160
|
|
|
151
161
|
trace1_pattern = list(
|
|
@@ -446,6 +456,11 @@ def analyze_traces_aligned(
|
|
|
446
456
|
)
|
|
447
457
|
same_kernel_result = analyze_same_kernels_from_alignment(alignment.layer_alignments)
|
|
448
458
|
|
|
459
|
+
# Note: amd_kernels = trace1's kernels (filtered if phase_filter != "all")
|
|
460
|
+
# nvidia_kernels = trace2's kernels (filtered if phase_filter != "all")
|
|
461
|
+
# The variable names are misleading but trace1_* should use amd_kernels,
|
|
462
|
+
# and trace2_* should use nvidia_kernels to match the filtered kernel counts/totals.
|
|
463
|
+
|
|
449
464
|
return {
|
|
450
465
|
"metadata": {
|
|
451
466
|
"amd_gpu": amd_trace.gpu_name,
|
|
@@ -462,10 +477,10 @@ def analyze_traces_aligned(
|
|
|
462
477
|
"trace2_platform": trace2.platform,
|
|
463
478
|
"trace2_gpu": trace2.gpu_name,
|
|
464
479
|
"trace2_device": trace2.device_props,
|
|
465
|
-
"trace1_kernels": len(
|
|
466
|
-
"trace2_kernels": len(
|
|
467
|
-
"trace1_total_ms": sum(k.get("dur", 0) for k in
|
|
468
|
-
"trace2_total_ms": sum(k.get("dur", 0) for k in
|
|
480
|
+
"trace1_kernels": len(amd_kernels),
|
|
481
|
+
"trace2_kernels": len(nvidia_kernels),
|
|
482
|
+
"trace1_total_ms": sum(k.get("dur", 0) for k in amd_kernels) / 1000,
|
|
483
|
+
"trace2_total_ms": sum(k.get("dur", 0) for k in nvidia_kernels) / 1000,
|
|
469
484
|
"phase": phase_filter,
|
|
470
485
|
"trace1_layers": alignment.num_layers,
|
|
471
486
|
"trace2_layers": alignment.num_layers,
|
|
@@ -579,7 +594,17 @@ def analyze_traces_aligned(
|
|
|
579
594
|
trace2_total = trace2_agg["total_us"] / 1000
|
|
580
595
|
trace1_count = int(trace1_agg["count"])
|
|
581
596
|
trace2_count = int(trace2_agg["count"])
|
|
582
|
-
|
|
597
|
+
# Speedup ratio: trace1 total time / trace2 total time (totals, not per-call averages)
|
|
598
|
+
# Shows how many times faster/slower trace1 is compared to trace2
|
|
599
|
+
# > 1.0 means trace1 is slower, < 1.0 means trace1 is faster
|
|
600
|
+
# Using total time instead of avg time per call because operations may have
|
|
601
|
+
# vastly different call counts (e.g., fused vs unfused operations)
|
|
602
|
+
if trace2_total > 0:
|
|
603
|
+
ratio = trace1_total / trace2_total
|
|
604
|
+
elif trace1_total > 0:
|
|
605
|
+
ratio = float("inf") # trace2 has no time, trace1 is infinitely slower
|
|
606
|
+
else:
|
|
607
|
+
ratio = 1.0 # Both are zero
|
|
583
608
|
gap_ms = trace1_total - trace2_total
|
|
584
609
|
|
|
585
610
|
trace1_pattern = list(
|
|
@@ -321,7 +321,7 @@ wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h
|
|
|
321
321
|
wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
|
|
322
322
|
wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
|
|
323
323
|
wafer_core/lib/trace_compare/aligner.py,sha256=1S8Ob3RaEsIjN0HdqEx0yGsW5uf_lMrJVSH_MnZhKok,13788
|
|
324
|
-
wafer_core/lib/trace_compare/analyzer.py,sha256=
|
|
324
|
+
wafer_core/lib/trace_compare/analyzer.py,sha256=Ou_gooG027YVuYVF5oddAkMsObXrrPQLBPHUzSMA4Vg,31078
|
|
325
325
|
wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
|
|
326
326
|
wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
|
|
327
327
|
wafer_core/lib/trace_compare/classifier.py,sha256=cYAmDW8S75N6cE3mJNZM-UKCJSX7rFP-8klVrukBvNQ,17504
|
|
@@ -697,6 +697,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
|
|
|
697
697
|
wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
|
|
698
698
|
wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
|
|
699
699
|
wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
|
|
700
|
-
wafer_core-0.1.
|
|
701
|
-
wafer_core-0.1.
|
|
702
|
-
wafer_core-0.1.
|
|
700
|
+
wafer_core-0.1.30.dist-info/METADATA,sha256=YuF3VyyP3tvmv2S-7E8epi1J2_1e2yXJfapS1uGQ0Zs,1477
|
|
701
|
+
wafer_core-0.1.30.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
702
|
+
wafer_core-0.1.30.dist-info/RECORD,,
|
|
File without changes
|