spark-advisor-parser 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spark_advisor_parser-0.1.14/.gitignore +42 -0
- spark_advisor_parser-0.1.14/PKG-INFO +25 -0
- spark_advisor_parser-0.1.14/pyproject.toml +79 -0
- spark_advisor_parser-0.1.14/src/spark_advisor_parser/__init__.py +3 -0
- spark_advisor_parser-0.1.14/src/spark_advisor_parser/parser.py +274 -0
- spark_advisor_parser-0.1.14/src/spark_advisor_parser/py.typed +0 -0
- spark_advisor_parser-0.1.14/tests/test_parser.py +266 -0
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
**/CLAUDE.md
|
|
2
|
+
.claude/
|
|
3
|
+
|
|
4
|
+
# Python
|
|
5
|
+
__pycache__/
|
|
6
|
+
*.py[cod]
|
|
7
|
+
*.pyo
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
|
|
12
|
+
# Virtual environment
|
|
13
|
+
.venv/
|
|
14
|
+
.envrc
|
|
15
|
+
|
|
16
|
+
# Testing
|
|
17
|
+
.coverage
|
|
18
|
+
.pytest_cache/
|
|
19
|
+
htmlcov/
|
|
20
|
+
|
|
21
|
+
# Type checking
|
|
22
|
+
.mypy_cache/
|
|
23
|
+
|
|
24
|
+
# Ruff
|
|
25
|
+
.ruff_cache/
|
|
26
|
+
|
|
27
|
+
# IDE
|
|
28
|
+
.idea/
|
|
29
|
+
.vscode/
|
|
30
|
+
*.swp
|
|
31
|
+
*.swo
|
|
32
|
+
*~
|
|
33
|
+
|
|
34
|
+
# OS
|
|
35
|
+
.DS_Store
|
|
36
|
+
Thumbs.db
|
|
37
|
+
/.claude/
|
|
38
|
+
tasks
|
|
39
|
+
|
|
40
|
+
# Frontend
|
|
41
|
+
node_modules/
|
|
42
|
+
packages/spark-advisor-frontend/dist/
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spark-advisor-parser
|
|
3
|
+
Version: 0.1.14
|
|
4
|
+
Summary: Spark event log parser with compression support (.gz, .lz4, .snappy, .zstd)
|
|
5
|
+
Project-URL: Homepage, https://github.com/pstysz/spark-advisor
|
|
6
|
+
Project-URL: Repository, https://github.com/pstysz/spark-advisor
|
|
7
|
+
Author: Pawel Stysz
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Keywords: apache-spark,event-log,parser,spark
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Requires-Dist: lz4>=4.0
|
|
19
|
+
Requires-Dist: orjson>=3.10
|
|
20
|
+
Requires-Dist: python-snappy>=0.7
|
|
21
|
+
Requires-Dist: spark-advisor-models==0.1.14
|
|
22
|
+
Requires-Dist: zstandard>=0.23
|
|
23
|
+
Description-Content-Type: text/plain
|
|
24
|
+
|
|
25
|
+
Spark event log parser with compression support
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "spark-advisor-parser"
|
|
3
|
+
version = "0.1.14" # x-release-please-version
|
|
4
|
+
description = "Spark event log parser with compression support (.gz, .lz4, .snappy, .zstd)"
|
|
5
|
+
readme = { text = "Spark event log parser with compression support", content-type = "text/plain" }
|
|
6
|
+
license = "Apache-2.0"
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Pawel Stysz" },
|
|
10
|
+
]
|
|
11
|
+
keywords = ["spark", "apache-spark", "event-log", "parser"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 4 - Beta",
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"License :: OSI Approved :: Apache Software License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Typing :: Typed",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"spark-advisor-models==0.1.14", # x-release-please-version
|
|
23
|
+
"orjson>=3.10",
|
|
24
|
+
"lz4>=4.0",
|
|
25
|
+
"python-snappy>=0.7",
|
|
26
|
+
"zstandard>=0.23",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[tool.uv.sources]
|
|
30
|
+
spark-advisor-models = { workspace = true }
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/pstysz/spark-advisor"
|
|
34
|
+
Repository = "https://github.com/pstysz/spark-advisor"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["hatchling"]
|
|
38
|
+
build-backend = "hatchling.build"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.wheel]
|
|
41
|
+
packages = ["src/spark_advisor_parser"]
|
|
42
|
+
|
|
43
|
+
[dependency-groups]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=8.3",
|
|
46
|
+
"pytest-cov>=6.1",
|
|
47
|
+
"mypy>=1.15",
|
|
48
|
+
"ruff>=0.11",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|
|
53
|
+
pythonpath = ["src"]
|
|
54
|
+
addopts = [
|
|
55
|
+
"-v",
|
|
56
|
+
"--strict-markers",
|
|
57
|
+
"--tb=short",
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
[tool.ruff]
|
|
61
|
+
target-version = "py312"
|
|
62
|
+
line-length = 120
|
|
63
|
+
src = ["src", "tests"]
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint]
|
|
66
|
+
select = ["E", "W", "F", "I", "UP", "B", "SIM", "TCH", "RUF"]
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint.flake8-type-checking]
|
|
69
|
+
runtime-evaluated-base-classes = ["pydantic.BaseModel", "pydantic_settings.BaseSettings"]
|
|
70
|
+
|
|
71
|
+
[tool.ruff.lint.isort]
|
|
72
|
+
known-first-party = ["spark_advisor_parser", "spark_advisor_models"]
|
|
73
|
+
|
|
74
|
+
[tool.mypy]
|
|
75
|
+
python_version = "3.12"
|
|
76
|
+
strict = true
|
|
77
|
+
warn_return_any = true
|
|
78
|
+
warn_unused_configs = true
|
|
79
|
+
plugins = ["pydantic.mypy"]
|
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import io
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
from contextlib import contextmanager
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import IO, Any
|
|
9
|
+
|
|
10
|
+
import lz4.frame # type: ignore[import-untyped]
|
|
11
|
+
import orjson
|
|
12
|
+
import snappy # type: ignore[import-untyped]
|
|
13
|
+
import zstandard
|
|
14
|
+
|
|
15
|
+
from spark_advisor_models.model import (
|
|
16
|
+
ExecutorMetrics,
|
|
17
|
+
JobAnalysis,
|
|
18
|
+
Quantiles,
|
|
19
|
+
SparkConfig,
|
|
20
|
+
StageMetrics,
|
|
21
|
+
TaskMetrics,
|
|
22
|
+
TaskMetricsDistributions,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_event_log(path: Path) -> JobAnalysis:
    """Parse a Spark event log file into a JobAnalysis summary.

    The file may be plain text or compressed (see _open_event_log).
    Blank lines and lines that fail JSON parsing are skipped silently,
    so truncated or lightly corrupted logs still produce a result.
    """
    parser_state = _ParserState()

    with _open_event_log(path) as stream:
        for raw_line in stream:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                parsed = orjson.loads(stripped)
            except orjson.JSONDecodeError:
                # Malformed line (e.g. truncated write) — ignore and move on.
                continue
            _process_event(parsed, parser_state)

    return parser_state.build()
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@contextmanager
|
|
44
|
+
def _open_event_log(path: Path) -> Iterator[IO[str]]:
|
|
45
|
+
suffix = path.suffix.lower()
|
|
46
|
+
suffixes = [s.lower() for s in path.suffixes]
|
|
47
|
+
|
|
48
|
+
if suffix == ".gz" or ".gz" in suffixes:
|
|
49
|
+
with gzip.open(path, "rt", encoding="utf-8") as f:
|
|
50
|
+
yield f
|
|
51
|
+
elif suffix == ".lz4":
|
|
52
|
+
with lz4.frame.open(path, "rt", encoding="utf-8") as f:
|
|
53
|
+
yield f
|
|
54
|
+
elif suffix == ".snappy":
|
|
55
|
+
raw = path.read_bytes()
|
|
56
|
+
decompressed = snappy.decompress(raw)
|
|
57
|
+
yield io.StringIO(decompressed.decode("utf-8"))
|
|
58
|
+
elif suffix == ".zstd" or suffix == ".zst":
|
|
59
|
+
dctx = zstandard.ZstdDecompressor()
|
|
60
|
+
with open(path, "rb") as raw_f, dctx.stream_reader(raw_f) as reader:
|
|
61
|
+
yield io.TextIOWrapper(reader, encoding="utf-8")
|
|
62
|
+
else:
|
|
63
|
+
with open(path, encoding="utf-8") as f:
|
|
64
|
+
yield f
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _quantiles_from_list(values: list[int]) -> Quantiles:
    """Compute min / p25 / median / p75 / max from raw integer samples.

    Uses linear interpolation between the two samples surrounding the
    fractional rank. An empty input yields a default Quantiles().
    """
    if not values:
        return Quantiles()

    ordered = sorted(values)
    count = len(ordered)

    def interpolate(fraction: float) -> int:
        # Fractional rank into the sorted samples, interpolated linearly
        # and truncated back to int (matches the stored metric type).
        rank = fraction * (count - 1)
        lower = int(rank)
        upper = min(lower + 1, count - 1)
        weight = rank - lower
        return int(ordered[lower] + weight * (ordered[upper] - ordered[lower]))

    return Quantiles(
        min=ordered[0],
        p25=interpolate(0.25),
        median=interpolate(0.5),
        p75=interpolate(0.75),
        max=ordered[-1],
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class _StageAccumulator:
    """Per-stage running totals built from SparkListenerTaskEnd events.

    One accumulator exists per stage ID (created lazily via a defaultdict in
    _ParserState). ``add_task`` folds in one task's metrics; ``build_stage_metrics``
    freezes the totals into a StageMetrics model.
    """

    # Raw per-task samples, kept so quantile distributions can be computed later.
    durations: list[int] = field(default_factory=list)
    executor_run_times: list[int] = field(default_factory=list)
    gc_times: list[int] = field(default_factory=list)
    # Scalar running totals across all tasks seen for this stage.
    task_count: int = 0
    total_gc_time_ms: int = 0
    total_executor_run_time_ms: int = 0
    shuffle_read_bytes: int = 0
    shuffle_write_bytes: int = 0
    spill_disk_bytes: int = 0
    spill_memory_bytes: int = 0
    failed_count: int = 0
    killed_count: int = 0
    input_records: int = 0
    output_records: int = 0
    shuffle_read_records: int = 0
    shuffle_write_records: int = 0

    def add_task(self, task_info: dict[str, Any], task_metrics: dict[str, Any], event: dict[str, Any]) -> None:
        """Fold one task's metrics into the running totals.

        Args:
            task_info: the event's "Task Info" payload.
            task_metrics: the event's "Task Metrics" payload.
            event: the full SparkListenerTaskEnd event (needed for "Task End Reason").

        Note: failed and killed tasks are still counted in task_count and
        included in the duration/GC distributions below.
        """
        if task_info.get("Failed", False):
            self.failed_count += 1

        task_end_reason = event.get("Task End Reason", {})
        if task_end_reason.get("Reason") == "TaskKilled":
            self.killed_count += 1

        # Missing timestamps default to 0, so duration may be inaccurate
        # (or negative) for incomplete task records.
        duration = task_info.get("Finish Time", 0) - task_info.get("Launch Time", 0)
        self.durations.append(duration)
        self.task_count += 1

        gc_time = task_metrics.get("JVM GC Time", 0)
        self.gc_times.append(gc_time)
        self.total_gc_time_ms += gc_time

        executor_run_time = task_metrics.get("Executor Run Time", 0)
        self.executor_run_times.append(executor_run_time)
        self.total_executor_run_time_ms += executor_run_time

        # Shuffle read combines remote and local bytes into one total.
        shuffle_read = task_metrics.get("Shuffle Read Metrics", {})
        self.shuffle_read_bytes += shuffle_read.get("Remote Bytes Read", 0)
        self.shuffle_read_bytes += shuffle_read.get("Local Bytes Read", 0)
        self.shuffle_read_records += shuffle_read.get("Total Records Read", 0)

        shuffle_write = task_metrics.get("Shuffle Write Metrics", {})
        self.shuffle_write_bytes += shuffle_write.get("Shuffle Bytes Written", 0)
        self.shuffle_write_records += shuffle_write.get("Shuffle Records Written", 0)

        input_m = task_metrics.get("Input Metrics", {})
        self.input_records += input_m.get("Records Read", 0)

        output_m = task_metrics.get("Output Metrics", {})
        self.output_records += output_m.get("Records Written", 0)

        self.spill_disk_bytes += task_metrics.get("Disk Bytes Spilled", 0)
        self.spill_memory_bytes += task_metrics.get("Memory Bytes Spilled", 0)

    def build_stage_metrics(self, stage_id: int, stage_info: dict[str, Any]) -> StageMetrics:
        """Freeze the accumulated totals into a StageMetrics model.

        Args:
            stage_id: the Spark stage ID this accumulator belongs to.
            stage_info: the parser's per-stage info dict (provides name and
                stage-level input/output byte counts).
        """
        distributions = TaskMetricsDistributions(
            duration=_quantiles_from_list(self.durations),
            executor_run_time=_quantiles_from_list(self.executor_run_times),
            jvm_gc_time=_quantiles_from_list(self.gc_times),
        )
        return StageMetrics(
            stage_id=stage_id,
            stage_name=stage_info.get("name", f"Stage {stage_id}"),
            sum_executor_run_time_ms=self.total_executor_run_time_ms,
            total_gc_time_ms=self.total_gc_time_ms,
            total_shuffle_read_bytes=self.shuffle_read_bytes,
            total_shuffle_write_bytes=self.shuffle_write_bytes,
            spill_to_disk_bytes=self.spill_disk_bytes,
            spill_to_memory_bytes=self.spill_memory_bytes,
            failed_task_count=self.failed_count,
            killed_task_count=self.killed_count,
            input_bytes=stage_info.get("input_bytes", 0),
            input_records=self.input_records,
            output_bytes=stage_info.get("output_bytes", 0),
            output_records=self.output_records,
            shuffle_read_records=self.shuffle_read_records,
            shuffle_write_records=self.shuffle_write_records,
            tasks=TaskMetrics(
                task_count=self.task_count,
                distributions=distributions,
            ),
        )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
class _ParserState:
    """Mutable state accumulated while scanning a Spark event log.

    Populated incrementally by _process_event; ``build`` turns the collected
    state into the final JobAnalysis model.
    """

    def __init__(self) -> None:
        self.app_id: str = ""
        self.app_name: str = ""
        self.spark_version: str = ""
        self.start_time: int = 0
        self.end_time: int = 0
        self.config: dict[str, str] = {}
        # stage_id -> {"name", "submission_time", "completion_time", ...}
        self.stage_info: dict[int, dict[str, Any]] = {}
        # stage_id -> per-stage task accumulator, created on first access.
        self.stage_tasks: dict[int, _StageAccumulator] = defaultdict(_StageAccumulator)
        self.executor_count: int = 0

    def _resolve_end_time(self) -> int:
        """Best-effort application end time.

        Falls back to the latest stage completion time when the log is
        truncated (no SparkListenerApplicationEnd). The result is clamped to
        start_time so a missing/zero completion timestamp can never yield an
        end time earlier than the start (which would make duration negative).
        """
        if self.end_time > self.start_time:
            return self.end_time
        if self.stage_info:
            latest_completion = max(
                (info.get("completion_time", 0) for info in self.stage_info.values()),
                default=self.start_time,
            )
            # Bug fix: stages may carry a missing/zero completion time; the
            # previous code could return 0 here, producing duration_ms < 0.
            return max(latest_completion, self.start_time)
        return self.start_time

    def build(self) -> JobAnalysis:
        """Assemble the final JobAnalysis from the accumulated state."""
        end_time = self._resolve_end_time()

        stages = []
        for stage_id, info in sorted(self.stage_info.items()):
            acc = self.stage_tasks[stage_id]
            stages.append(acc.build_stage_metrics(stage_id, info))

        total_task_time = sum(acc.total_executor_run_time_ms for acc in self.stage_tasks.values())

        return JobAnalysis(
            app_id=self.app_id,
            app_name=self.app_name,
            spark_version=self.spark_version,
            duration_ms=end_time - self.start_time,
            config=SparkConfig(raw=self.config),
            stages=stages,
            executors=ExecutorMetrics(
                executor_count=self.executor_count,
                # Peak/allocated memory are not present in the events this
                # parser consumes, so they are reported as 0.
                peak_memory_bytes_sum=0,
                allocated_memory_bytes_sum=0,
                total_task_time_ms=total_task_time,
            ),
        )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _process_event(event: dict[str, Any], state: _ParserState) -> None:
|
|
225
|
+
event_type = event.get("Event", "")
|
|
226
|
+
|
|
227
|
+
match event_type:
|
|
228
|
+
case "SparkListenerApplicationStart":
|
|
229
|
+
state.app_id = event.get("App ID", "")
|
|
230
|
+
state.app_name = event.get("App Name", "")
|
|
231
|
+
state.start_time = event.get("Timestamp", 0)
|
|
232
|
+
|
|
233
|
+
case "SparkListenerApplicationEnd":
|
|
234
|
+
state.end_time = event.get("Timestamp", 0)
|
|
235
|
+
|
|
236
|
+
case "SparkListenerLogStart":
|
|
237
|
+
state.spark_version = event.get("Spark Version", "")
|
|
238
|
+
|
|
239
|
+
case "SparkListenerEnvironmentUpdate":
|
|
240
|
+
for key, value in event.get("Spark Properties", {}).items():
|
|
241
|
+
state.config[key] = value
|
|
242
|
+
|
|
243
|
+
case "SparkListenerStageCompleted":
|
|
244
|
+
stage_info = event.get("Stage Info", {})
|
|
245
|
+
stage_id = stage_info.get("Stage ID", -1)
|
|
246
|
+
state.stage_info[stage_id] = {
|
|
247
|
+
"name": stage_info.get("Stage Name", ""),
|
|
248
|
+
"submission_time": stage_info.get("Submission Time", 0),
|
|
249
|
+
"completion_time": stage_info.get("Completion Time", 0),
|
|
250
|
+
"input_bytes": _extract_accumulator(stage_info, "internal.metrics.input.bytesRead"),
|
|
251
|
+
"output_bytes": _extract_accumulator(stage_info, "internal.metrics.output.bytesWritten")
|
|
252
|
+
if stage_info.get("Accumulables")
|
|
253
|
+
else 0,
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
case "SparkListenerTaskEnd":
|
|
257
|
+
stage_id = event.get("Stage ID", -1)
|
|
258
|
+
task_info = event.get("Task Info", {})
|
|
259
|
+
task_metrics = event.get("Task Metrics", {})
|
|
260
|
+
|
|
261
|
+
if not task_info or not task_metrics:
|
|
262
|
+
return
|
|
263
|
+
|
|
264
|
+
state.stage_tasks[stage_id].add_task(task_info, task_metrics, event)
|
|
265
|
+
|
|
266
|
+
case "SparkListenerExecutorAdded":
|
|
267
|
+
state.executor_count += 1
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _extract_accumulator(stage_info: dict[str, Any], name: str) -> int:
|
|
271
|
+
for acc in stage_info.get("Accumulables", []):
|
|
272
|
+
if acc.get("Name") == name:
|
|
273
|
+
return int(acc.get("Value", 0))
|
|
274
|
+
return 0
|
|
File without changes
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import gzip
|
|
2
|
+
import json
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import lz4.frame
|
|
8
|
+
import snappy
|
|
9
|
+
import zstandard
|
|
10
|
+
|
|
11
|
+
from spark_advisor_parser import parse_event_log
|
|
12
|
+
|
|
13
|
+
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent
|
|
14
|
+
SAMPLE_LOG = _REPO_ROOT / "sample_event_logs" / "sample_etl_job.json"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _write_events(events: list[dict[str, Any]]) -> Path:
|
|
18
|
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
|
|
19
|
+
for e in events:
|
|
20
|
+
f.write(json.dumps(e) + "\n")
|
|
21
|
+
return Path(f.name)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _events_as_bytes(events: list[dict[str, Any]]) -> bytes:
|
|
25
|
+
return "\n".join(json.dumps(e) for e in events).encode("utf-8")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _base_events() -> list[dict[str, Any]]:
|
|
29
|
+
return [
|
|
30
|
+
{"Event": "SparkListenerLogStart", "Spark Version": "3.5.0"},
|
|
31
|
+
{
|
|
32
|
+
"Event": "SparkListenerApplicationStart",
|
|
33
|
+
"App ID": "app-1",
|
|
34
|
+
"App Name": "Test",
|
|
35
|
+
"Timestamp": 1000,
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"Event": "SparkListenerEnvironmentUpdate",
|
|
39
|
+
"Spark Properties": {"spark.executor.memory": "4g"},
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"Event": "SparkListenerStageCompleted",
|
|
43
|
+
"Stage Info": {"Stage ID": 0, "Stage Name": "read", "Accumulables": []},
|
|
44
|
+
},
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _task_event(stage_id: int = 0, **metric_overrides: Any) -> dict[str, Any]:
|
|
49
|
+
metrics: dict[str, Any] = {
|
|
50
|
+
"Executor Run Time": 900,
|
|
51
|
+
"JVM GC Time": 10,
|
|
52
|
+
"Shuffle Read Metrics": {"Remote Bytes Read": 0},
|
|
53
|
+
"Shuffle Write Metrics": {"Shuffle Bytes Written": 0},
|
|
54
|
+
"Disk Bytes Spilled": 0,
|
|
55
|
+
"Memory Bytes Spilled": 0,
|
|
56
|
+
}
|
|
57
|
+
metrics.update(metric_overrides)
|
|
58
|
+
event: dict[str, Any] = {
|
|
59
|
+
"Event": "SparkListenerTaskEnd",
|
|
60
|
+
"Stage ID": stage_id,
|
|
61
|
+
"Task Info": {"Launch Time": 1000, "Finish Time": 2000, "Failed": False},
|
|
62
|
+
"Task Metrics": metrics,
|
|
63
|
+
}
|
|
64
|
+
return event
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
_APP_END: dict[str, Any] = {"Event": "SparkListenerApplicationEnd", "Timestamp": 5000}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class TestParseEventLog:
    """End-to-end assertions against the bundled sample ETL event log."""

    def test_parses_app_metadata(self) -> None:
        """App ID and name come from SparkListenerApplicationStart."""
        job = parse_event_log(SAMPLE_LOG)
        assert job.app_id == "application_1234567890_0001"
        assert job.app_name == "SampleETLJob"

    def test_parses_config(self) -> None:
        """Spark properties are exposed through the typed SparkConfig view."""
        job = parse_event_log(SAMPLE_LOG)
        assert job.config.executor_memory == "4g"
        assert job.config.shuffle_partitions == 200
        assert job.config.aqe_enabled is False

    def test_parses_stages(self) -> None:
        job = parse_event_log(SAMPLE_LOG)
        assert len(job.stages) == 2

    def test_aggregates_task_metrics(self) -> None:
        """Task-level counters roll up into per-stage totals."""
        job = parse_event_log(SAMPLE_LOG)
        stage1 = next(s for s in job.stages if s.stage_id == 1)
        assert stage1.tasks.task_count == 3
        assert stage1.spill_to_disk_bytes > 0

    def test_detects_skew_in_sample(self) -> None:
        """The sample log deliberately contains a heavily skewed stage."""
        job = parse_event_log(SAMPLE_LOG)
        stage1 = next(s for s in job.stages if s.stage_id == 1)
        assert stage1.tasks.duration_skew_ratio > 10

    def test_calculates_duration(self) -> None:
        job = parse_event_log(SAMPLE_LOG)
        assert job.duration_ms > 0
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class TestEventParserRecords:
    """Record/byte counters aggregated from synthetic TaskEnd events."""

    def test_collects_input_and_output_records(self) -> None:
        """Input/Output Metrics record counts roll up into the stage."""
        task = _task_event(
            **{
                "Input Metrics": {"Bytes Read": 1024, "Records Read": 500},
                "Output Metrics": {"Bytes Written": 512, "Records Written": 250},
            },
        )
        events = [*_base_events(), task, _APP_END]
        job = parse_event_log(_write_events(events))
        stage = job.stages[0]
        assert stage.input_records == 500
        assert stage.output_records == 250

    def test_collects_shuffle_records(self) -> None:
        """Shuffle read/write record counters are summed per stage."""
        task = _task_event(
            **{
                "Shuffle Read Metrics": {
                    "Remote Bytes Read": 100,
                    "Local Bytes Read": 200,
                    "Total Records Read": 1000,
                },
                "Shuffle Write Metrics": {
                    "Shuffle Bytes Written": 300,
                    "Shuffle Records Written": 800,
                },
            },
        )
        events = [*_base_events(), task, _APP_END]
        job = parse_event_log(_write_events(events))
        stage = job.stages[0]
        # Read bytes = remote (100) + local (200).
        assert stage.total_shuffle_read_bytes == 300
        assert stage.shuffle_read_records == 1000
        assert stage.shuffle_write_records == 800

    def test_counts_killed_tasks(self) -> None:
        """A TaskKilled end reason increments killed_task_count."""
        task = _task_event()
        task["Task End Reason"] = {"Reason": "TaskKilled"}
        events = [*_base_events(), task, _APP_END]
        job = parse_event_log(_write_events(events))
        stage = job.stages[0]
        assert stage.killed_task_count == 1

    def test_local_shuffle_read_included(self) -> None:
        """Local shuffle bytes count toward the read total, not just remote."""
        task = _task_event(
            **{
                "Shuffle Read Metrics": {
                    "Remote Bytes Read": 500,
                    "Local Bytes Read": 300,
                },
            },
        )
        events = [*_base_events(), task, _APP_END]
        job = parse_event_log(_write_events(events))
        stage = job.stages[0]
        assert stage.total_shuffle_read_bytes == 800
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
class TestMalformedLines:
    """Parser resilience: junk lines must be skipped, not crash the parse."""

    def test_skips_malformed_json(self) -> None:
        """A non-JSON line between valid events is silently ignored."""
        path = _write_events(_base_events())
        with open(path, "a") as f:
            f.write("this is not json\n")
            f.write(json.dumps(_APP_END) + "\n")
        job = parse_event_log(path)
        assert job.app_id == "app-1"

    def test_skips_empty_lines(self) -> None:
        """Blank lines between events are silently ignored."""
        path = _write_events(_base_events())
        with open(path, "a") as f:
            f.write("\n\n\n")
            f.write(json.dumps(_APP_END) + "\n")
        job = parse_event_log(path)
        assert job.app_id == "app-1"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class TestIncompleteLogs:
    """Behavior when the log is truncated before SparkListenerApplicationEnd."""

    def test_missing_app_end_uses_stage_completion_time(self) -> None:
        """Duration falls back to the latest stage Completion Time."""
        events = [
            {"Event": "SparkListenerApplicationStart", "App ID": "app-1", "App Name": "Test", "Timestamp": 1000},
            {
                "Event": "SparkListenerStageCompleted",
                "Stage Info": {
                    "Stage ID": 0,
                    "Stage Name": "read",
                    "Completion Time": 4000,
                    "Accumulables": [],
                },
            },
            _task_event(),
        ]
        job = parse_event_log(_write_events(events))
        # 4000 (stage completion) - 1000 (app start).
        assert job.duration_ms == 3000

    def test_missing_app_end_no_stages(self) -> None:
        """With no end event and no stages, duration degrades to zero."""
        events = [
            {"Event": "SparkListenerApplicationStart", "App ID": "app-1", "App Name": "Test", "Timestamp": 1000},
        ]
        job = parse_event_log(_write_events(events))
        assert job.duration_ms == 0
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class TestGzipSupport:
    """Round-trip through a .json.gz compressed event log."""

    def test_parses_gzipped_event_log(self) -> None:
        events = [*_base_events(), _task_event(), _APP_END]
        with tempfile.NamedTemporaryFile(suffix=".json.gz", delete=False) as f:
            # Write the gzip stream through the temp file's path.
            with gzip.open(f.name, "wt", encoding="utf-8") as gz:
                for e in events:
                    gz.write(json.dumps(e) + "\n")
            path = Path(f.name)
        job = parse_event_log(path)
        assert job.app_id == "app-1"
        assert len(job.stages) == 1
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class TestLz4Support:
    """Round-trip through an .lz4 compressed event log."""

    def test_parses_lz4_event_log(self) -> None:
        events = [*_base_events(), _task_event(), _APP_END]
        data = _events_as_bytes(events)
        with tempfile.NamedTemporaryFile(suffix=".lz4", delete=False) as f:
            with lz4.frame.open(f.name, "wb") as lz4f:
                lz4f.write(data)
            path = Path(f.name)
        job = parse_event_log(path)
        assert job.app_id == "app-1"
        assert len(job.stages) == 1
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class TestSnappySupport:
    """Round-trip through a .snappy compressed event log (raw block format)."""

    def test_parses_snappy_event_log(self) -> None:
        events = [*_base_events(), _task_event(), _APP_END]
        data = _events_as_bytes(events)
        compressed = snappy.compress(data)
        with tempfile.NamedTemporaryFile(suffix=".snappy", delete=False) as f:
            f.write(compressed)
            path = Path(f.name)
        job = parse_event_log(path)
        assert job.app_id == "app-1"
        assert len(job.stages) == 1
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class TestZstdSupport:
    """Round-trip through zstd-compressed logs for both extension spellings."""

    def test_parses_zstd_event_log(self) -> None:
        events = [*_base_events(), _task_event(), _APP_END]
        data = _events_as_bytes(events)
        cctx = zstandard.ZstdCompressor()
        compressed = cctx.compress(data)
        with tempfile.NamedTemporaryFile(suffix=".zstd", delete=False) as f:
            f.write(compressed)
            path = Path(f.name)
        job = parse_event_log(path)
        assert job.app_id == "app-1"
        assert len(job.stages) == 1

    def test_parses_zst_extension(self) -> None:
        """The short ".zst" extension is accepted as well as ".zstd"."""
        events = [*_base_events(), _task_event(), _APP_END]
        data = _events_as_bytes(events)
        cctx = zstandard.ZstdCompressor()
        compressed = cctx.compress(data)
        with tempfile.NamedTemporaryFile(suffix=".zst", delete=False) as f:
            f.write(compressed)
            path = Path(f.name)
        job = parse_event_log(path)
        assert job.app_id == "app-1"
        assert len(job.stages) == 1
|