pyspark-tools 0.0.1__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (36)
  1. pyspark_tools-0.0.1/LICENSE +21 -0
  2. pyspark_tools-0.0.1/PKG-INFO +33 -0
  3. pyspark_tools-0.0.1/README.md +131 -0
  4. pyspark_tools-0.0.1/pyproject.toml +76 -0
  5. pyspark_tools-0.0.1/pyspark_tools/__init__.py +11 -0
  6. pyspark_tools-0.0.1/pyspark_tools/advanced_optimizer.py +1143 -0
  7. pyspark_tools-0.0.1/pyspark_tools/aws_glue_integration.py +2592 -0
  8. pyspark_tools-0.0.1/pyspark_tools/batch_processor.py +793 -0
  9. pyspark_tools-0.0.1/pyspark_tools/code_reviewer.py +286 -0
  10. pyspark_tools-0.0.1/pyspark_tools/data_source_analyzer.py +720 -0
  11. pyspark_tools-0.0.1/pyspark_tools/duplicate_detector.py +592 -0
  12. pyspark_tools-0.0.1/pyspark_tools/file_utils.py +807 -0
  13. pyspark_tools-0.0.1/pyspark_tools/memory_manager.py +1003 -0
  14. pyspark_tools-0.0.1/pyspark_tools/py.typed +0 -0
  15. pyspark_tools-0.0.1/pyspark_tools/server.py +3793 -0
  16. pyspark_tools-0.0.1/pyspark_tools/sql_converter.py +1951 -0
  17. pyspark_tools-0.0.1/pyspark_tools.egg-info/PKG-INFO +33 -0
  18. pyspark_tools-0.0.1/pyspark_tools.egg-info/SOURCES.txt +34 -0
  19. pyspark_tools-0.0.1/pyspark_tools.egg-info/dependency_links.txt +1 -0
  20. pyspark_tools-0.0.1/pyspark_tools.egg-info/entry_points.txt +2 -0
  21. pyspark_tools-0.0.1/pyspark_tools.egg-info/requires.txt +23 -0
  22. pyspark_tools-0.0.1/pyspark_tools.egg-info/top_level.txt +1 -0
  23. pyspark_tools-0.0.1/setup.cfg +4 -0
  24. pyspark_tools-0.0.1/tests/test_advanced_optimizer.py +487 -0
  25. pyspark_tools-0.0.1/tests/test_aws_glue_integration.py +1223 -0
  26. pyspark_tools-0.0.1/tests/test_batch_processor.py +316 -0
  27. pyspark_tools-0.0.1/tests/test_ci_basic.py +283 -0
  28. pyspark_tools-0.0.1/tests/test_data_source_analyzer.py +659 -0
  29. pyspark_tools-0.0.1/tests/test_duplicate_detector.py +464 -0
  30. pyspark_tools-0.0.1/tests/test_enhanced_sql_converter.py +349 -0
  31. pyspark_tools-0.0.1/tests/test_file_utils.py +473 -0
  32. pyspark_tools-0.0.1/tests/test_integration_mcp.py +74 -0
  33. pyspark_tools-0.0.1/tests/test_minimal.py +137 -0
  34. pyspark_tools-0.0.1/tests/test_optimization_features.py +307 -0
  35. pyspark_tools-0.0.1/tests/test_server_components.py +391 -0
  36. pyspark_tools-0.0.1/tests/test_sql_conversion_fixes.py +222 -0
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 PySpark MCP Server
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,33 @@
+ Metadata-Version: 2.4
+ Name: pyspark-tools
+ Version: 0.0.1
+ Summary: MCP server for SQL migration, AWS Glue job generation, and PySpark optimization
+ Author-email: Annas Mazhar <annas.mazhar10@gmail.com>
+ Project-URL: Homepage, https://github.com/AnnasMazhar/pyspark_mcp
+ Project-URL: Repository, https://github.com/AnnasMazhar/pyspark_mcp
+ Project-URL: Issues, https://github.com/AnnasMazhar/pyspark_mcp/issues
+ Requires-Python: >=3.10
+ License-File: LICENSE
+ Requires-Dist: fastmcp>=3.2.3
+ Requires-Dist: sqlglot>=25.0
+ Requires-Dist: pyspark<5.0.0,>=3.5.0
+ Requires-Dist: sqlite-utils>=3.39
+ Requires-Dist: pypdf>=4.0.0
+ Requires-Dist: pdfplumber>=0.10.0
+ Provides-Extra: dev
+ Requires-Dist: black>=24.0; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: flake8>=7.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.8.0; extra == "dev"
+ Requires-Dist: pytest>=8.0; extra == "dev"
+ Requires-Dist: pytest-cov>=7.1.0; extra == "dev"
+ Requires-Dist: pytest-xdist>=3.0.0; extra == "dev"
+ Requires-Dist: pytest-mock>=3.12.0; extra == "dev"
+ Requires-Dist: coverage>=7.0.0; extra == "dev"
+ Requires-Dist: safety>=3.0.0; extra == "dev"
+ Requires-Dist: build>=0.10.0; extra == "dev"
+ Requires-Dist: twine>=6.2.0; extra == "dev"
+ Requires-Dist: docker>=6.0.0; extra == "dev"
+ Requires-Dist: requests>=2.31.0; extra == "dev"
+ Requires-Dist: psutil>=5.9.0; extra == "dev"
+ Dynamic: license-file
@@ -0,0 +1,131 @@
+ # PySpark MCP Server
+
+ SQL migration assistance, AWS Glue job generation, and Spark code optimization — as an MCP server.
+
+ [![CI Pipeline](https://github.com/AnnasMazhar/pyspark_mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/AnnasMazhar/pyspark_mcp/actions/workflows/ci.yml)
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+
+ ## What It Does
+
+ - **SQL Dialect Transpilation** — Convert between PostgreSQL, Oracle, Redshift, MySQL, Snowflake, and Spark SQL using [SQLGlot](https://github.com/tobymao/sqlglot) (see the sketch after this list)
+ - **PySpark DataFrame API Generation** — Generate DataFrame API code from SQL with optimization hints
+ - **AWS Glue Integration** — Job templates, DynamicFrame conversions, Data Catalog definitions, S3 optimization strategies
+ - **Batch Processing** — Process hundreds of SQL files concurrently
+ - **Code Review & Optimization** — Analyze existing PySpark code for performance improvements
+ - **Pattern Detection** — Find code duplication and suggest refactoring
+
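+ For a feel of the transpilation layer, here is a minimal sketch that calls SQLGlot directly; the query and dialect pair are illustrative, not output captured from this package:
+
+ ```python
+ import sqlglot
+
+ # Transpile one PostgreSQL statement to Spark SQL. sqlglot.transpile
+ # returns a list of statements; this input has only one.
+ postgres_sql = "SELECT id, created_at::date AS day FROM events LIMIT 10"
+ spark_sql = sqlglot.transpile(postgres_sql, read="postgres", write="spark")[0]
+ print(spark_sql)  # e.g. SELECT id, CAST(created_at AS DATE) AS day FROM events LIMIT 10
+ ```
+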
+ ## What It Doesn't Do
+
+ - Recursive CTEs → provides Spark SQL equivalent + guidance (PySpark has no native recursive CTE support; a typical workaround is sketched below)
+ - MERGE/PIVOT/CONNECT BY → transpiles to Spark SQL, provides DataFrame API guidance
+ - Perfect 1:1 DataFrame API transpilation for all SQL — complex queries get Spark SQL + optimization recommendations
+
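+ Because Spark has no recursive CTEs, the usual workaround is an iterative loop that unions each new frontier of rows until a fixed point is reached. A minimal sketch, assuming a hypothetical `edges` table with `parent_id` and `child_id` columns:
+
+ ```python
+ from pyspark.sql import SparkSession
+ from pyspark.sql import functions as F
+
+ spark = SparkSession.builder.getOrCreate()
+ edges = spark.table("edges")  # hypothetical adjacency table
+
+ # Seed with the direct children of node 1, then repeatedly join the
+ # frontier back to the edge table, keeping only unseen nodes -- the
+ # imperative equivalent of WITH RECURSIVE.
+ frontier = edges.filter(F.col("parent_id") == 1).select(F.col("child_id").alias("node"))
+ result = frontier
+ while frontier.count() > 0:
+     frontier = (
+         frontier.join(edges, F.col("node") == F.col("parent_id"))
+         .select(F.col("child_id").alias("node"))
+         .subtract(result)
+     )
+     result = result.union(frontier)
+ ```
+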
+ ## Quick Start
+
+ ```bash
+ pip install -e .   # from a checkout of the repository
+ pyspark-mcp        # starts the MCP server
+ ```
+
+ ## MCP Configuration
+
+ ### Claude Desktop
+
+ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
+
+ ```json
+ {
+   "mcpServers": {
+     "pyspark": {
+       "command": "pyspark-mcp",
+       "args": []
+     }
+   }
+ }
+ ```
+
+ ### Hermes Agent
+
+ Add to `~/.hermes/config.yaml`:
+
+ ```yaml
+ mcp:
+   servers:
+     pyspark:
+       command: pyspark-mcp
+       enabled_tools: all
+ ```
+
+ ### Docker
+
+ ```bash
+ docker compose up -d
+ ```
+
+ ## Tools
+
+ ### SQL Conversion
+ - `convert_sql_to_pyspark` — Convert SQL to PySpark with dialect detection (the target style is illustrated below)
+ - `analyze_sql_context` — Analyze SQL complexity and suggest approach
+
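+ To make the target concrete, here is the kind of DataFrame API code such a conversion aims for. The input/output pair is illustrative, not captured tool output:
+
+ ```python
+ # Input SQL (illustrative):
+ #   SELECT country, COUNT(*) AS n FROM users WHERE active = 1 GROUP BY country
+ from pyspark.sql import SparkSession
+ from pyspark.sql import functions as F
+
+ spark = SparkSession.builder.getOrCreate()
+ result = (
+     spark.table("users")
+     .filter(F.col("active") == 1)
+     .groupBy("country")
+     .agg(F.count("*").alias("n"))
+ )
+ ```
+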
+ ### AWS Glue
+ - `generate_aws_glue_job_template` — Generate complete Glue job scripts
+ - `convert_dataframe_to_dynamic_frame` — DataFrame ↔ DynamicFrame conversion (see the sketch after this list)
+ - `generate_data_catalog_table_definition` — Data Catalog table definitions
+ - `generate_incremental_processing_job` — Incremental/CDC job generation
+ - `analyze_s3_optimization_opportunities` — S3 layout and partitioning analysis
+
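+ For reference, the round trip between the two frame types uses the standard `awsglue` API (available inside a Glue job, not in plain PySpark); the table name here is hypothetical:
+
+ ```python
+ from awsglue.context import GlueContext
+ from awsglue.dynamicframe import DynamicFrame
+ from pyspark.context import SparkContext
+
+ glue_context = GlueContext(SparkContext.getOrCreate())
+ spark = glue_context.spark_session
+
+ df = spark.table("sales")                                 # plain DataFrame
+ dyf = DynamicFrame.fromDF(df, glue_context, "sales_dyf")  # to DynamicFrame
+ df_again = dyf.toDF()                                     # and back
+ ```
+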
+ ### Optimization
+ - `review_pyspark_code` — Code review with performance recommendations
+ - `optimize_pyspark_code` — Suggest optimizations for existing code
+ - `recommend_join_strategy` — Broadcast vs shuffle join recommendations (the distinction is sketched below)
+ - `suggest_partitioning_strategy` — Partitioning recommendations
+
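+ The broadcast-vs-shuffle distinction in plain PySpark terms, as a minimal sketch with hypothetical table names:
+
+ ```python
+ from pyspark.sql import SparkSession
+ from pyspark.sql.functions import broadcast
+
+ spark = SparkSession.builder.getOrCreate()
+ orders = spark.table("orders")        # large fact table
+ countries = spark.table("countries")  # small dimension table
+
+ # Broadcasting the small side ships a full copy to every executor and
+ # avoids shuffling the large side; without the hint, Spark chooses a
+ # strategy from its own size estimates.
+ joined = orders.join(broadcast(countries), "country_code")
+ ```
+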
+ ### Batch Processing
+ - `batch_process_files` — Process multiple SQL files concurrently (the general pattern is sketched below)
+ - `batch_process_directory` — Convert entire directories
+
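+ As a sketch of the general pattern (not this package's internals), concurrent conversion of a directory of SQL files can be a thread pool over a transpile function; the `queries/` directory and dialects are assumptions:
+
+ ```python
+ from concurrent.futures import ThreadPoolExecutor
+ from pathlib import Path
+
+ import sqlglot
+
+ def convert_file(path: Path) -> str:
+     # Transpile one file's SQL to Spark SQL (single-statement files assumed).
+     return sqlglot.transpile(path.read_text(), read="postgres", write="spark")[0]
+
+ sql_files = sorted(Path("queries").glob("*.sql"))
+ with ThreadPoolExecutor(max_workers=8) as pool:
+     converted = list(pool.map(convert_file, sql_files))
+ ```
+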
+ ## Development
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate
+ pip install -e ".[dev]"
+
+ # Test
+ pytest tests/ -v --cov=pyspark_tools
+
+ # Format
+ black pyspark_tools tests
+ isort pyspark_tools tests
+
+ # Lint
+ flake8 pyspark_tools tests
+ ```
+
+ ## Architecture
+
+ ```
+ pyspark_tools/
+ ├── server.py                # FastMCP server + tool definitions
+ ├── sql_converter.py         # SQLGlot-based transpilation + DataFrame API generation
+ ├── aws_glue_integration.py  # Glue job templates, DynamicFrame, Data Catalog
+ ├── advanced_optimizer.py    # Performance analysis + optimization suggestions
+ ├── batch_processor.py       # Concurrent file processing
+ ├── code_reviewer.py         # PySpark code review patterns
+ ├── duplicate_detector.py    # Code deduplication
+ ├── data_source_analyzer.py  # Data source analysis
+ └── file_utils.py            # File I/O utilities
+ ```
+
+ ## CI/CD
+
+ - ✅ 256 tests passing
+ - ✅ 71% code coverage
+ - ✅ Code quality checks (black, isort, flake8)
+ - ✅ Python 3.11 tested
+
+ ## License
+
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,76 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "pyspark-tools"
+ version = "0.0.1"
+ description = "MCP server for SQL migration, AWS Glue job generation, and PySpark optimization"
+ authors = [{name = "Annas Mazhar", email = "annas.mazhar10@gmail.com"}]
+ requires-python = ">=3.10"
+ dependencies = [
+     "fastmcp>=3.2.3",
+     "sqlglot>=25.0",
+     "pyspark>=3.5.0,<5.0.0",
+     "sqlite-utils>=3.39",
+     "pypdf>=4.0.0",
+     "pdfplumber>=0.10.0"
+ ]
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+ include = ["pyspark_tools*"]
+ exclude = ["tests*", "output*", "test_venv*", "test_queries*", "coverage_reports*"]
+
+ [project.optional-dependencies]
+ dev = [
+     "black>=24.0",
+     "isort>=5.12.0",
+     "flake8>=7.0.0",
+     "mypy>=1.8.0",
+     "pytest>=8.0",
+     "pytest-cov>=7.1.0",
+     "pytest-xdist>=3.0.0",
+     "pytest-mock>=3.12.0",
+     "coverage>=7.0.0",
+     "safety>=3.0.0",
+     "build>=0.10.0",
+     "twine>=6.2.0",
+     "docker>=6.0.0",
+     "requests>=2.31.0",
+     "psutil>=5.9.0"
+ ]
+
+ [tool.black]
+ line-length = 88
+ target-version = ['py310']
+
+ [tool.isort]
+ profile = "black"
+ line_length = 88
+ known_first_party = ["pyspark_tools"]
+
+ [tool.mypy]
+ python_version = "3.10"
+ ignore_missing_imports = true
+ check_untyped_defs = true
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+ addopts = "-v"
+ markers = [
+     "fast: marks tests as fast (deselect with '-m \"not fast\"')",
+     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+     "unit: marks tests as unit tests",
+     "integration: marks tests as integration tests",
+     "cached: marks tests that use caching"
+ ]
+
+ [project.scripts]
+ pyspark-mcp = "pyspark_tools:main"
+
+ [project.urls]
+ Homepage = "https://github.com/AnnasMazhar/pyspark_mcp"
+ Repository = "https://github.com/AnnasMazhar/pyspark_mcp"
+ Issues = "https://github.com/AnnasMazhar/pyspark_mcp/issues"
@@ -0,0 +1,11 @@
+ """PySpark Tools - FastMCP server for SQL to PySpark conversion and optimization."""
+
+ __version__ = "0.0.1"
+
+ def main():
+     """CLI entry point for pyspark-mcp server."""
+     from pyspark_tools.server import app
+     try:
+         app.run()
+     except KeyboardInterrupt:
+         pass