spark-advisor-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ CLAUDE.md
2
+ .claude/
3
+
4
+ # Python
5
+ __pycache__/
6
+ *.py[cod]
7
+ *.pyo
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+
12
+ # Virtual environment
13
+ .venv/
14
+ .envrc
15
+
16
+ # uv
17
+ uv.lock
18
+
19
+ # Testing
20
+ .coverage
21
+ .pytest_cache/
22
+ htmlcov/
23
+
24
+ # Type checking
25
+ .mypy_cache/
26
+
27
+ # Ruff
28
+ .ruff_cache/
29
+
30
+ # IDE
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+ /.claude/
41
+ tasks
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.4
2
+ Name: spark-advisor-cli
3
+ Version: 0.1.0
4
+ Summary: AI-powered Apache Spark job analyzer and configuration advisor
5
+ Project-URL: Homepage, https://github.com/pstysz/spark-advisor
6
+ Project-URL: Repository, https://github.com/pstysz/spark-advisor
7
+ Project-URL: Issues, https://github.com/pstysz/spark-advisor/issues
8
+ Project-URL: Documentation, https://github.com/pstysz/spark-advisor/blob/main/docs/architecture.md
9
+ Author: Pawel Stysz
10
+ License-Expression: Apache-2.0
11
+ Keywords: ai,apache-spark,claude,mcp,optimization,performance,spark
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: System Administrators
16
+ Classifier: License :: OSI Approved :: Apache Software License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Quality Assurance
21
+ Classifier: Topic :: System :: Monitoring
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.12
24
+ Requires-Dist: anthropic>=0.52
25
+ Requires-Dist: httpx>=0.28
26
+ Requires-Dist: orjson>=3.10
27
+ Requires-Dist: rich>=14
28
+ Requires-Dist: spark-advisor-analyzer
29
+ Requires-Dist: spark-advisor-hs-connector
30
+ Requires-Dist: spark-advisor-models
31
+ Requires-Dist: spark-advisor-rules
32
+ Requires-Dist: typer>=0.15
33
+ Description-Content-Type: text/markdown
34
+
35
+ # spark-advisor
36
+
37
+ AI-powered Apache Spark job analyzer and configuration advisor.
38
+
39
+ **Stop guessing Spark configs. Let data and AI tell you what's wrong.**
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ pip install spark-advisor-cli
45
+ ```
46
+
47
+ ## Quick Start
48
+
49
+ ```bash
50
+ # Analyze from event log file (rules-only, free)
51
+ spark-advisor analyze /path/to/event-log.json.gz --no-ai
52
+
53
+ # Analyze with AI recommendations
54
+ export ANTHROPIC_API_KEY=sk-ant-...
55
+ spark-advisor analyze /path/to/event-log.json.gz
56
+
57
+ # Analyze from History Server
58
+ spark-advisor analyze app-20250101120000-0001 -hs http://yarn:18080
59
+
60
+ # Agent mode (multi-turn AI analysis)
61
+ spark-advisor analyze /path/to/event-log.json.gz --agent
62
+
63
+ # Scan recent jobs
64
+ spark-advisor scan -hs http://yarn:18080 --limit 20
65
+ ```
66
+
67
+ ## What it detects
68
+
69
+ 11 deterministic rules: data skew, disk spill, GC pressure, shuffle partitions, executor idle, task failures, small files, broadcast join threshold, serializer choice, dynamic allocation, memory overhead.
70
+
71
+ ## Links
72
+
73
+ - [Full documentation and architecture](https://github.com/pstysz/spark-advisor)
74
+ - [MCP Server setup (Claude Desktop / Cursor)](https://github.com/pstysz/spark-advisor/blob/main/docs/mcp-setup.md)
75
+ - [Contributing](https://github.com/pstysz/spark-advisor/blob/main/CONTRIBUTING.md)
76
+
77
+ ## License
78
+
79
+ Apache 2.0
@@ -0,0 +1,45 @@
1
+ # spark-advisor
2
+
3
+ AI-powered Apache Spark job analyzer and configuration advisor.
4
+
5
+ **Stop guessing Spark configs. Let data and AI tell you what's wrong.**
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install spark-advisor-cli
11
+ ```
12
+
13
+ ## Quick Start
14
+
15
+ ```bash
16
+ # Analyze from event log file (rules-only, free)
17
+ spark-advisor analyze /path/to/event-log.json.gz --no-ai
18
+
19
+ # Analyze with AI recommendations
20
+ export ANTHROPIC_API_KEY=sk-ant-...
21
+ spark-advisor analyze /path/to/event-log.json.gz
22
+
23
+ # Analyze from History Server
24
+ spark-advisor analyze app-20250101120000-0001 -hs http://yarn:18080
25
+
26
+ # Agent mode (multi-turn AI analysis)
27
+ spark-advisor analyze /path/to/event-log.json.gz --agent
28
+
29
+ # Scan recent jobs
30
+ spark-advisor scan -hs http://yarn:18080 --limit 20
31
+ ```
32
+
33
+ ## What it detects
34
+
35
+ 11 deterministic rules: data skew, disk spill, GC pressure, shuffle partitions, executor idle, task failures, small files, broadcast join threshold, serializer choice, dynamic allocation, memory overhead.
36
+
37
+ ## Links
38
+
39
+ - [Full documentation and architecture](https://github.com/pstysz/spark-advisor)
40
+ - [MCP Server setup (Claude Desktop / Cursor)](https://github.com/pstysz/spark-advisor/blob/main/docs/mcp-setup.md)
41
+ - [Contributing](https://github.com/pstysz/spark-advisor/blob/main/CONTRIBUTING.md)
42
+
43
+ ## License
44
+
45
+ Apache 2.0
@@ -0,0 +1,96 @@
1
+ [project]
2
+ name = "spark-advisor-cli"
3
+ version = "0.1.0"
4
+ description = "AI-powered Apache Spark job analyzer and configuration advisor"
5
+ readme = "README.md"
6
+ license = "Apache-2.0"
7
+ requires-python = ">=3.12"
8
+ authors = [
9
+ { name = "Pawel Stysz" },
10
+ ]
11
+ keywords = ["spark", "apache-spark", "performance", "optimization", "ai", "claude", "mcp"]
12
+ classifiers = [
13
+ "Development Status :: 4 - Beta",
14
+ "Environment :: Console",
15
+ "Intended Audience :: Developers",
16
+ "Intended Audience :: System Administrators",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: System :: Monitoring",
22
+ "Topic :: Software Development :: Quality Assurance",
23
+ "Typing :: Typed",
24
+ ]
25
+ dependencies = [
26
+ "spark-advisor-models",
27
+ "spark-advisor-rules",
28
+ "spark-advisor-hs-connector",
29
+ "spark-advisor-analyzer",
30
+ "typer>=0.15",
31
+ "rich>=14",
32
+ "httpx>=0.28",
33
+ "orjson>=3.10",
34
+ "anthropic>=0.52",
35
+ ]
36
+
37
+ [tool.uv.sources]
38
+ spark-advisor-models = { workspace = true }
39
+ spark-advisor-rules = { workspace = true }
40
+ spark-advisor-hs-connector = { workspace = true }
41
+ spark-advisor-analyzer = { workspace = true }
42
+
43
+ [project.scripts]
44
+ spark-advisor = "spark_advisor_cli.app:main"
45
+
46
+ [project.urls]
47
+ Homepage = "https://github.com/pstysz/spark-advisor"
48
+ Repository = "https://github.com/pstysz/spark-advisor"
49
+ Issues = "https://github.com/pstysz/spark-advisor/issues"
50
+ Documentation = "https://github.com/pstysz/spark-advisor/blob/main/docs/architecture.md"
51
+
52
+ [dependency-groups]
53
+ dev = [
54
+ "pytest>=8.3",
55
+ "pytest-cov>=6.1",
56
+ "mypy>=1.15",
57
+ "ruff>=0.11",
58
+ "respx>=0.22",
59
+ ]
60
+
61
+ [build-system]
62
+ requires = ["hatchling"]
63
+ build-backend = "hatchling.build"
64
+
65
+ [tool.hatch.build.targets.wheel]
66
+ packages = ["src/spark_advisor_cli"]
67
+
68
+ [tool.pytest.ini_options]
69
+ testpaths = ["tests"]
70
+ pythonpath = ["src"]
71
+ addopts = [
72
+ "-v",
73
+ "--strict-markers",
74
+ "--tb=short",
75
+ ]
76
+
77
+ [tool.ruff]
78
+ target-version = "py312"
79
+ line-length = 120
80
+ src = ["src", "tests"]
81
+
82
+ [tool.ruff.lint]
83
+ select = ["E", "W", "F", "I", "UP", "B", "SIM", "TCH", "RUF"]
84
+
85
+ [tool.ruff.lint.flake8-type-checking]
86
+ runtime-evaluated-base-classes = ["pydantic.BaseModel", "pydantic_settings.BaseSettings"]
87
+
88
+ [tool.ruff.lint.isort]
89
+ known-first-party = ["spark_advisor_cli", "spark_advisor_models", "spark_advisor_rules", "spark_advisor_hs_connector", "spark_advisor_analyzer"]
90
+
91
+ [tool.mypy]
92
+ python_version = "3.12"
93
+ strict = true
94
+ warn_return_any = true
95
+ warn_unused_configs = true
96
+ plugins = ["pydantic.mypy"]
@@ -0,0 +1 @@
1
+ """CLI tool for Spark job analysis."""
@@ -0,0 +1,19 @@
1
+ import typer
2
+
3
+ from spark_advisor_cli.commands.analyze import analyze
4
+ from spark_advisor_cli.commands.scan import scan
5
+ from spark_advisor_cli.commands.version import version
6
+
7
# Root Typer application; `no_args_is_help` makes bare invocation print usage.
app = typer.Typer(
    name="spark-advisor",
    help="AI-powered Apache Spark job analyzer and configuration advisor",
    no_args_is_help=True,
)

# Register every sub-command under its function name.
for _command in (analyze, scan, version):
    app.command()(_command)


def main() -> None:
    """Console-script entry point (wired up via [project.scripts] in pyproject.toml)."""
    app()
@@ -0,0 +1,158 @@
1
+ import os
2
+ import sys
3
+ from pathlib import Path
4
+ from typing import Annotated
5
+
6
+ import typer
7
+ from rich.console import Console
8
+
9
+ from spark_advisor_cli.event_log.parser import parse_event_log
10
+ from spark_advisor_cli.output.console import print_analysis_result, print_job_overview, print_stage_breakdown
11
+ from spark_advisor_models.config import AiSettings, Thresholds
12
+ from spark_advisor_models.model import AnalysisResult, JobAnalysis
13
+ from spark_advisor_models.model.output import AnalysisMode, OutputFormat
14
+ from spark_advisor_rules import StaticAnalysisService, rules_for_threshold
15
+
16
# Module-level Rich console shared by every command in this module for styled output.
console = Console()
17
+
18
+
19
+ def _load_job(source: str, history_server: str | None) -> JobAnalysis:
20
+ if history_server:
21
+ return _fetch_from_history_server(source, history_server)
22
+ path = Path(source)
23
+ if not path.exists():
24
+ raise FileNotFoundError(f"Event log file not found: {source}")
25
+ return parse_event_log(path)
26
+
27
+
28
def _fetch_from_history_server(app_id: str, history_server_url: str) -> JobAnalysis:
    """Download and assemble analysis data for ``app_id`` from a Spark History Server.

    The connector is imported lazily so file-based analysis never needs it.
    """
    from spark_advisor_hs_connector.history_server_client import HistoryServerClient
    from spark_advisor_hs_connector.hs_fetcher import fetch_job_analysis

    with HistoryServerClient(history_server_url) as hs_client:
        return fetch_job_analysis(hs_client, app_id)
34
+
35
+
36
+ def _resolve_ai_enabled(no_ai: bool) -> bool:
37
+ if no_ai:
38
+ return False
39
+ return bool(os.environ.get("ANTHROPIC_API_KEY"))
40
+
41
+
42
def _run_analysis(
    job: JobAnalysis,
    thresholds: Thresholds,
    *,
    use_ai: bool,
    mode: AnalysisMode,
    model: str,
    ai_timeout: float,
) -> AnalysisResult:
    """Run the deterministic rule engine, optionally augmented with AI analysis.

    Without AI this is pure local computation; with AI an Anthropic client is
    opened for either single-shot LLM analysis or multi-turn agent mode.
    """
    static = StaticAnalysisService(rules_for_threshold(thresholds))

    if not use_ai:
        # Rules-only path: no network calls, no AI report attached.
        return AnalysisResult(
            app_id=job.app_id,
            job=job,
            rule_results=static.analyze(job),
            ai_report=None,
        )

    # AI dependencies are imported lazily so the no-AI path stays lightweight.
    from spark_advisor_analyzer.ai.client import AnthropicClient
    from spark_advisor_analyzer.ai.service import LlmAnalysisService
    from spark_advisor_analyzer.orchestrator import AdviceOrchestrator

    settings = AiSettings(model=model, api_timeout=ai_timeout)
    with AnthropicClient(timeout=settings.api_timeout) as ai_client:
        llm: LlmAnalysisService | None = None
        agent_orchestrator = None

        if mode == AnalysisMode.AGENT:
            from spark_advisor_analyzer.agent.orchestrator import AgentOrchestrator

            agent_orchestrator = AgentOrchestrator(ai_client, static, settings)
        else:
            llm = LlmAnalysisService(ai_client, settings, thresholds)

        return AdviceOrchestrator(static, llm, agent_orchestrator).run(job, mode=mode)
75
+
76
+
77
def analyze(
    source: Annotated[
        str,
        typer.Argument(help="App ID (with --history-server) or path to event log file (.json or .json.gz)"),
    ],
    history_server: Annotated[
        str | None,
        typer.Option("--history-server", "-hs", help="Spark History Server URL (e.g. http://yarn:18080)"),
    ] = None,
    no_ai: Annotated[
        bool,
        typer.Option("--no-ai", help="Disable AI analysis (rules only)"),
    ] = False,
    agent: Annotated[
        bool,
        typer.Option("--agent", help="Use agent mode (multi-turn AI analysis with tool use)"),
    ] = False,
    model: Annotated[
        str,
        typer.Option("--model", "-m", help="Claude model for AI analysis"),
    ] = "claude-sonnet-4-6",
    output: Annotated[
        Path | None,
        typer.Option("--output", "-o", help="Write suggested config to file (default console)"),
    ] = None,
    output_format: Annotated[
        OutputFormat,
        typer.Option("--format", "-f", help="Output format: text or json"),
    ] = OutputFormat.TEXT,
    verbose: Annotated[
        bool,
        typer.Option("--verbose", "-v", help="Show per-stage breakdown"),
    ] = False,
) -> None:
    """Analyze a Spark job and get optimization recommendations.

    Loads job data (local event log, or History Server when --history-server
    is given), runs the deterministic rules and optionally AI analysis, then
    renders the result as rich text or JSON. Exits with code 1 on any error.
    """
    # --agent implies AI, so it conflicts with --no-ai and requires an API key.
    if agent and no_ai:
        console.print("[red]Error: --agent requires AI (cannot use with --no-ai)[/]")
        raise typer.Exit(code=1)

    if agent and not os.environ.get("ANTHROPIC_API_KEY"):
        console.print("[red]Error: --agent requires ANTHROPIC_API_KEY environment variable[/]")
        raise typer.Exit(code=1)

    with console.status("[bold blue]Loading job data...[/]"):
        try:
            job = _load_job(source, history_server)
        except FileNotFoundError as e:
            # Bad local path: short message without the "fetching" wording.
            console.print(f"[red]Error: {e}[/]")
            raise typer.Exit(code=1) from e
        except Exception as e:
            # Anything else (network, parse, History Server) reported uniformly.
            console.print(f"[red]Error fetching job data: {e}[/]")
            raise typer.Exit(code=1) from e

    # Default thresholds; AI is silently skipped if no API key is present
    # (unless --agent was requested, which was validated above).
    thresholds = Thresholds()
    use_ai = _resolve_ai_enabled(no_ai)
    analysis_mode = AnalysisMode.AGENT if agent else AnalysisMode.STANDARD

    # Pick a status line that reflects what is actually about to run.
    if analysis_mode == AnalysisMode.AGENT:
        status_msg = "[bold blue]Running agent analysis (multi-turn AI)...[/]"
    elif use_ai:
        status_msg = "[bold blue]Running analysis (rules + AI)...[/]"
    else:
        status_msg = "[bold blue]Running analysis...[/]"

    with console.status(status_msg):
        try:
            result = _run_analysis(
                job, thresholds, use_ai=use_ai, mode=analysis_mode, model=model, ai_timeout=90.0,
            )
        except Exception as e:
            console.print(f"[red]Analysis error: {e}[/]")
            raise typer.Exit(code=1) from e

    if output_format == OutputFormat.JSON:
        # Raw JSON goes to stdout so it can be piped; rich output would mangle it.
        sys.stdout.write(result.model_dump_json(indent=2) + "\n")
    else:
        print_job_overview(console, job)
        if verbose:
            print_stage_breakdown(console, job)
        print_analysis_result(
            console, result, use_ai=use_ai or analysis_mode == AnalysisMode.AGENT, output_config=output,
        )
@@ -0,0 +1,57 @@
1
+ from typing import Annotated
2
+
3
+ import typer
4
+ from rich.console import Console
5
+ from rich.table import Table
6
+
7
# Module-level Rich console used for the status spinner, errors, and the table.
console = Console()
8
+
9
+
10
def scan(
    history_server: Annotated[
        str,
        typer.Option("--history-server", "-hs", help="Spark History Server URL (e.g. http://yarn:18080)"),
    ],
    limit: Annotated[
        int,
        typer.Option("--limit", "-l", help="Maximum number of applications to list"),
    ] = 20,
) -> None:
    """List recent Spark applications from History Server."""
    # Lazy import: the connector is only needed when this command runs.
    from spark_advisor_hs_connector.history_server_client import HistoryServerClient

    with console.status("[bold blue]Fetching applications...[/]"):
        try:
            with HistoryServerClient(history_server) as hs_client:
                apps = hs_client.list_applications(limit=limit)
        except Exception as e:
            console.print(f"[red]Error connecting to History Server: {e}[/]")
            raise typer.Exit(code=1) from e

    if not apps:
        console.print("[yellow]No applications found.[/]")
        return

    listing = Table(title=f"Recent Spark Applications ({len(apps)})")
    listing.add_column("App ID", style="bold")
    listing.add_column("Name")
    listing.add_column("Duration", justify="right")
    listing.add_column("Status")
    listing.add_column("Spark Version")

    for application in apps:
        # Show details from the most recent attempt, if the app has any.
        attempt = application.attempts[-1] if application.attempts else None
        duration_text, status_text, version_text = "-", "", ""

        if attempt:
            if attempt.duration > 0:
                duration_text = f"{attempt.duration / 60_000:.1f} min"
            status_text = "[green]completed[/]" if attempt.completed else "[yellow]running[/]"
            version_text = attempt.appSparkVersion

        listing.add_row(application.id, application.name, duration_text, status_text, version_text)

    console.print(listing)
@@ -0,0 +1,11 @@
1
+ from importlib.metadata import version as pkg_version
2
+
3
+ from rich.console import Console
4
+
5
# Module-level Rich console used to print the version string.
console = Console()
6
+
7
+
8
def version() -> None:
    """Print the installed spark-advisor-cli version."""
    # Read the version from installed package metadata so it never drifts
    # from what pip actually installed.
    installed = pkg_version("spark-advisor-cli")
    console.print(f"spark-advisor [bold]v{installed}[/]")