wafer-core 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wafer_core/lib/trace_compare/PERFORMANCE.md +148 -0
- wafer_core/lib/trace_compare/__init__.py +22 -9
- wafer_core/lib/trace_compare/aligner.py +369 -0
- wafer_core/lib/trace_compare/analyzer.py +549 -159
- wafer_core/lib/trace_compare/api.py +225 -0
- wafer_core/lib/trace_compare/architecture.py +77 -0
- wafer_core/lib/trace_compare/classifier.py +307 -13
- wafer_core/lib/trace_compare/fusion_analyzer.py +311 -845
- wafer_core/lib/trace_compare/kernel_registry.yaml +349 -0
- wafer_core/lib/trace_compare/layer_segmentation.py +114 -0
- wafer_core/lib/trace_compare/loader.py +526 -227
- wafer_core/lib/trace_compare/same_kernel_analyzer.py +119 -0
- wafer_core/lib/trace_compare/warnings.py +99 -0
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/METADATA +3 -1
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/RECORD +16 -8
- {wafer_core-0.1.26.dist-info → wafer_core-0.1.27.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Same kernel analysis - comparing identical kernel names across platforms.
|
|
2
|
+
|
|
3
|
+
Identifies kernels where AMD and NVIDIA use the same kernel name/pattern
|
|
4
|
+
and compares their performance directly.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from collections import defaultdict
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .aligner import KernelPair, LayerAlignment
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class SameKernelComparison:
    """One kernel that appears under the same name on both AMD and NVIDIA.

    Each instance holds the per-platform timing figures for a single
    kernel within a single layer, so the two platforms can be compared
    side by side.
    """

    # Index of the layer this kernel was observed in.
    layer: int
    # Name of the kernel shared by both platforms.
    kernel_name: str
    # Operation label attached to the kernel pair.
    operation: str
    # Average duration on AMD (presumably microseconds, per the `_us` suffix).
    amd_avg_us: float
    # Average duration on NVIDIA (presumably microseconds, per the `_us` suffix).
    nvidia_avg_us: float
    # Relative timing of the two platforms; the summary code in this module
    # counts values below 1.0 as "AMD faster" and above 1.0 as "NVIDIA faster".
    ratio: float
    # Timing gap between the platforms (sign convention is set by the
    # aligner - confirm there before relying on it).
    gap_us: float
    # Occurrence counts of the kernel on each platform.
    amd_count: int
    nvidia_count: int
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class SameKernelAnalysis:
    """Result container for a same-kernel analysis run.

    Both fields default to fresh empty containers, so an instance can be
    created with no arguments.
    """

    # One entry per kernel pair that matched by name on both platforms.
    kernels: list[SameKernelComparison] = field(default_factory=list)
    # Aggregate statistics keyed by name (e.g. totals and average ratio).
    summary: dict[str, Any] = field(default_factory=dict)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def analyze_same_kernels(
    layer_alignments: list[LayerAlignment],
) -> SameKernelAnalysis:
    """Find and compare kernels with identical names across platforms.

    Walks every kernel pair of every aligned layer and keeps the pairs that
    are flagged as the same kernel and have a non-empty kernel name on both
    the AMD and the NVIDIA side.

    Args:
        layer_alignments: List of aligned layers. Each item must expose
            ``layer`` and ``kernel_pairs``.

    Returns:
        SameKernelAnalysis whose ``kernels`` lists the matching pairs in
        input order and whose ``summary`` contains:

        * ``total_same_kernels``: number of matching pairs.
        * ``avg_ratio``: mean of the finite ratios, or ``1.0`` when there
          are none.
        * ``kernels_where_amd_faster``: pairs with ``ratio < 1.0``.
        * ``kernels_where_nvidia_faster``: pairs with ``ratio > 1.0``.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    import math

    same_kernels: list[SameKernelComparison] = []

    for layer_alignment in layer_alignments:
        for pair in layer_alignment.kernel_pairs:
            # Skip pairs that are not the same kernel or lack a name on
            # either platform.
            if not (pair.is_same_kernel and pair.amd_kernel and pair.nvidia_kernel):
                continue
            same_kernels.append(
                SameKernelComparison(
                    layer=layer_alignment.layer,
                    kernel_name=pair.amd_kernel,
                    operation=pair.operation,
                    amd_avg_us=pair.amd_avg_us,
                    nvidia_avg_us=pair.nvidia_avg_us,
                    ratio=pair.ratio,
                    gap_us=pair.gap_us,
                    amd_count=pair.amd_count,
                    nvidia_count=pair.nvidia_count,
                )
            )

    # Only finite ratios take part in the mean: a single inf, -inf or NaN
    # would otherwise turn the whole average into a meaningless value.
    finite_ratios = [k.ratio for k in same_kernels if math.isfinite(k.ratio)]
    avg_ratio = sum(finite_ratios) / len(finite_ratios) if finite_ratios else 1.0

    # The faster/slower tallies still see every pair; an infinite ratio
    # counts as "NVIDIA faster", and a NaN ratio counts for neither side.
    amd_faster = sum(1 for k in same_kernels if k.ratio < 1.0)
    nvidia_faster = sum(1 for k in same_kernels if k.ratio > 1.0)

    return SameKernelAnalysis(
        kernels=same_kernels,
        summary={
            "total_same_kernels": len(same_kernels),
            "avg_ratio": avg_ratio,
            "kernels_where_amd_faster": amd_faster,
            "kernels_where_nvidia_faster": nvidia_faster,
        },
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def analyze_same_kernels_from_alignment(
    layer_alignments: list[LayerAlignment],
) -> dict[str, Any]:
    """Run the same-kernel analysis and return it as plain dictionaries.

    Thin adapter around ``analyze_same_kernels`` (kept for API
    compatibility): each comparison is flattened into a dict of its fields
    and the summary is passed through unchanged.

    Args:
        layer_alignments: List of aligned layers

    Returns:
        Dictionary with two keys: ``"kernels"`` (list of per-kernel dicts)
        and ``"summary"`` (the summary dict of the analysis).
    """
    # Keys of each per-kernel dict, in output order; each key is also the
    # attribute name read from the comparison object.
    exported_fields = (
        "layer",
        "kernel_name",
        "operation",
        "amd_avg_us",
        "nvidia_avg_us",
        "ratio",
        "gap_us",
        "amd_count",
        "nvidia_count",
    )

    result = analyze_same_kernels(layer_alignments)

    rows = []
    for comparison in result.kernels:
        rows.append({key: getattr(comparison, key) for key in exported_fields})

    return {
        "kernels": rows,
        "summary": result.summary,
    }
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Warning detection and reporting for trace analysis.
|
|
2
|
+
|
|
3
|
+
Detects issues with trace data quality and provides actionable suggestions.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class TraceWarning:
    """Immutable record describing a trace-quality problem or analysis limit."""

    # Machine-readable identifier, e.g. "NO_PHASE_ANNOTATIONS" or
    # "NO_PYTHON_STACKS".
    code: str
    # How serious the finding is; restricted to these three levels.
    severity: Literal["info", "warning", "error"]
    # Human-readable description of what was detected.
    message: str
    # Actionable hint telling the user how to address the finding.
    suggestion: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def detect_warnings(
    events: list[dict],
    kernel_names: list[str],
    phases: list[dict] | None = None,
    layers_detected: int = 0,
    total_kernels: int = 0,
) -> list[TraceWarning]:
    """Detect warnings from trace data.

    Runs four independent checks and returns one ``TraceWarning`` per check
    that fires, in this order:

    1. ``NO_PHASE_ANNOTATIONS``: no event has ``cat == "user_annotation"``
       and a name starting with ``"execute_context"``.
    2. ``NO_PYTHON_STACKS``: no event has ``cat == "python_function"``.
    3. ``HIGH_UNKNOWN_KERNELS``: more than 20% of ``total_kernels`` have a
       name containing "unknown" (case-insensitive) or equal to "Other".
       Skipped when ``total_kernels`` is 0.
    4. ``LAYER_DETECTION_FAILED``: ``layers_detected`` is 0 while
       ``total_kernels`` exceeds 100.

    Args:
        events: All trace events
        kernel_names: List of all kernel names
        phases: Optional list of phase events. Accepted for interface
            compatibility; this function does not currently read it.
        layers_detected: Number of layers detected
        total_kernels: Total number of kernels; used as the denominator of
            the unknown-kernel percentage.

    Returns:
        List of warnings (empty when no check fires)
    """
    # Named `found` rather than `warnings` so the local does not shadow the
    # standard-library module of the same name.
    found: list[TraceWarning] = []

    def _is_phase_marker(ev: dict) -> bool:
        # dict.get's default only applies when the key is missing; an event
        # carrying "name": None (or any non-string) must not crash the scan.
        name = ev.get("name")
        return (
            ev.get("cat") == "user_annotation"
            and isinstance(name, str)
            and name.startswith("execute_context")
        )

    # Check for phase annotations
    if not any(_is_phase_marker(ev) for ev in events):
        found.append(
            TraceWarning(
                code="NO_PHASE_ANNOTATIONS",
                severity="warning",
                message="No vLLM phase annotations found. Phase analysis (prefill/decode) will be unavailable.",
                suggestion="Ensure you're using vLLM v1.0+ with profiling enabled. Re-profile with torch.profiler.profile() to capture phase markers.",
            )
        )

    # Check for Python stack traces
    if not any(ev.get("cat") == "python_function" for ev in events):
        found.append(
            TraceWarning(
                code="NO_PYTHON_STACKS",
                severity="info",
                message="No Python stack traces available. CPU→kernel mapping will be limited.",
                suggestion="Re-profile with with_stack=True: torch.profiler.profile(with_stack=True) for better CPU operator identification.",
            )
        )

    # Check for high percentage of unknown kernels; the guard also prevents
    # a division by zero when no kernel total was supplied.
    if total_kernels > 0:
        unknown_count = sum(
            1 for name in kernel_names if "unknown" in name.lower() or name == "Other"
        )
        unknown_percentage = (unknown_count / total_kernels) * 100

        if unknown_percentage > 20:
            found.append(
                TraceWarning(
                    code="HIGH_UNKNOWN_KERNELS",
                    severity="warning",
                    message=f"{unknown_percentage:.1f}% of kernels are classified as 'Unknown'. Kernel registry may be outdated.",
                    suggestion="Update kernel pattern registry or report unrecognized kernel patterns for support.",
                )
            )

    # Check for layer detection failure; small traces (<= 100 kernels) are
    # exempt from this warning.
    if layers_detected == 0 and total_kernels > 100:
        found.append(
            TraceWarning(
                code="LAYER_DETECTION_FAILED",
                severity="warning",
                message="No transformer layers detected. Layer-wise analysis unavailable.",
                suggestion="This may indicate a non-transformer model (e.g., SSM/Mamba) or insufficient correlation data. Check model architecture.",
            )
        )

    return found
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: wafer-core
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.27
|
|
4
4
|
Summary: Core utilities and environments for Wafer GPU kernel optimization
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Requires-Dist: aiohttp>=3.9.0
|
|
@@ -15,12 +15,14 @@ Requires-Dist: dash-svg>=0.0.11
|
|
|
15
15
|
Requires-Dist: dash>=3.0.0
|
|
16
16
|
Requires-Dist: dnspython>=2.8.0
|
|
17
17
|
Requires-Dist: httpx>=0.25.0
|
|
18
|
+
Requires-Dist: ijson>=3.2.0
|
|
18
19
|
Requires-Dist: kaleido==0.2.1
|
|
19
20
|
Requires-Dist: markdownify>=0.11.0
|
|
20
21
|
Requires-Dist: matplotlib>=3.0.0
|
|
21
22
|
Requires-Dist: modal>=0.64.0
|
|
22
23
|
Requires-Dist: numpy>=1.17.5
|
|
23
24
|
Requires-Dist: openai>=1.0.0
|
|
25
|
+
Requires-Dist: orjson>=3.9.0
|
|
24
26
|
Requires-Dist: pandas~=3.0.0
|
|
25
27
|
Requires-Dist: paramiko>=3.0.0
|
|
26
28
|
Requires-Dist: paramiko>=3.4.0
|
|
@@ -318,12 +318,20 @@ wafer_core/lib/rocprofiler/systems/run/analyzer.py,sha256=Qg3M8-kCKdV82ehn6Ta20N
|
|
|
318
318
|
wafer_core/lib/rocprofiler/systems/run/profiler.py,sha256=aiQLsDnfQHSeCM5zLnO4VlbTmREYnAtiuT50Eq6uWfg,8387
|
|
319
319
|
wafer_core/lib/rocprofiler/systems/sample/__init__.py,sha256=31rNmLPQ7OVhvlOEEOwPKgk8_qrCidj6AmzDXexQJ_o,288
|
|
320
320
|
wafer_core/lib/rocprofiler/systems/sample/profiler.py,sha256=CYZPTzNXd48LoCfmY6h_5RSYEdWYccuv3-t4YncHJLE,7384
|
|
321
|
-
wafer_core/lib/trace_compare/
|
|
322
|
-
wafer_core/lib/trace_compare/
|
|
323
|
-
wafer_core/lib/trace_compare/
|
|
321
|
+
wafer_core/lib/trace_compare/PERFORMANCE.md,sha256=jkJh7ApZi8H7NKTcz8v0LNtwSFtIUqY88e3QbL749ww,3823
|
|
322
|
+
wafer_core/lib/trace_compare/__init__.py,sha256=CyUPbPQDYhVLCFFA7S_jNSilG3OgqYjmHSKfR5X11go,1377
|
|
323
|
+
wafer_core/lib/trace_compare/aligner.py,sha256=6HplOHCUIb0cMXA-Lu-91T-hKVTMK4bk8Ei-v7HE1G4,13471
|
|
324
|
+
wafer_core/lib/trace_compare/analyzer.py,sha256=m-waAiU5S72M9J4kUwIy9fPWUecg_oOUczri8Na6xUY,29360
|
|
325
|
+
wafer_core/lib/trace_compare/api.py,sha256=JSRTcd7eZK1Z8l18TFEiA5A8ENJS1TMz7oIiw1KBbAs,8796
|
|
326
|
+
wafer_core/lib/trace_compare/architecture.py,sha256=8bqlAJQeJLBHblyXvFV-w55PIKiVQDPjDQZ8Jx4tuGg,2110
|
|
327
|
+
wafer_core/lib/trace_compare/classifier.py,sha256=CDGzY9TY-I5wRuEGsu4mTCdljqVTOnLWyFLyNgmkGXI,16864
|
|
324
328
|
wafer_core/lib/trace_compare/formatter.py,sha256=GNrCZ45ueBN05CEXjOtTuKvTI8z-g-ZZFil-ni3sWVY,37962
|
|
325
|
-
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=
|
|
326
|
-
wafer_core/lib/trace_compare/
|
|
329
|
+
wafer_core/lib/trace_compare/fusion_analyzer.py,sha256=bD_CJ3JoVg_N6vxJJULd6G8l_-O5qnLuXKDEDItcQtg,15489
|
|
330
|
+
wafer_core/lib/trace_compare/kernel_registry.yaml,sha256=0-knXwsF3pR1x1JdIz-aWaH-5xDgTylh53E47Kf6nHo,9808
|
|
331
|
+
wafer_core/lib/trace_compare/layer_segmentation.py,sha256=kI_Y1e9nrKZfdwfcrGo4h7gpMxqXI_xkgXk46zuFen4,4642
|
|
332
|
+
wafer_core/lib/trace_compare/loader.py,sha256=zBHI0r7CX_wJ2mz0_-s0lm9KGSdaVaq7OKyxUL6KIlw,23997
|
|
333
|
+
wafer_core/lib/trace_compare/same_kernel_analyzer.py,sha256=sp81NJGVJeYdAfRQRgMbB5HcGTOneF1Rau3rbLPfpv4,3489
|
|
334
|
+
wafer_core/lib/trace_compare/warnings.py,sha256=B1HxFt-v1mDqLT2aD5bSm1Yn88bfPYnM-wui0WBF3xM,3548
|
|
327
335
|
wafer_core/lib/tracelens/__init__.py,sha256=AkHdmOnKlBO4RpsAqVVGe7MOfv6E6uhEaC_iKrYeMPI,2002
|
|
328
336
|
wafer_core/lib/tracelens/comparator.py,sha256=71YEPfjBi7_24u1oQuPerNtFsN0sDQ5CT_uBi0XLllw,3460
|
|
329
337
|
wafer_core/lib/tracelens/finder.py,sha256=HpbN8TuRNbbBytPYOmkBkfsFVBReQqVgsvFX-mBrln4,2459
|
|
@@ -679,6 +687,6 @@ wafer_core/utils/modal_execution/modal_app.py,sha256=VfS2cX8gHtnlPXemmMcEwDPeQdh
|
|
|
679
687
|
wafer_core/utils/modal_execution/modal_config.py,sha256=7cGX9TGqilQ3qxI3OFGXV5orjtyRU-PEDOJ4vP2oxno,4421
|
|
680
688
|
wafer_core/utils/modal_execution/modal_execution.py,sha256=gChjnV6jqA3A7IRP3DfvV5cSfm_MN0X4f7JZufXgdZE,24594
|
|
681
689
|
wafer_core/utils/modal_execution/test_modal.py,sha256=_jqou_hrLs1Daf1590Pnb0a_lXMMa2rczAPpW9HpoNQ,8153
|
|
682
|
-
wafer_core-0.1.
|
|
683
|
-
wafer_core-0.1.
|
|
684
|
-
wafer_core-0.1.
|
|
690
|
+
wafer_core-0.1.27.dist-info/METADATA,sha256=NYiI9hCaVd9RCCAfd8Ys0UwTMju6wiyaT7nsk4gsB8A,1477
|
|
691
|
+
wafer_core-0.1.27.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
692
|
+
wafer_core-0.1.27.dist-info/RECORD,,
|
|
File without changes
|