datus-spark 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,140 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+
52
+ # Translations
53
+ *.mo
54
+ *.pot
55
+
56
+ # Django stuff:
57
+ *.log
58
+ local_settings.py
59
+ db.sqlite3
60
+ db.sqlite3-journal
61
+
62
+ # Flask stuff:
63
+ instance/
64
+ .webassets-cache
65
+
66
+ # Scrapy stuff:
67
+ .scrapy
68
+
69
+ # Sphinx documentation
70
+ docs/_build/
71
+
72
+ # PyBuilder
73
+ target/
74
+
75
+ # Jupyter Notebook
76
+ .ipynb_checkpoints
77
+
78
+ # IPython
79
+ profile_default/
80
+ ipython_config.py
81
+
82
+ # pyenv
83
+ .python-version
84
+
85
+ # pipenv
86
+ Pipfile.lock
87
+
88
+ # uv
89
+ uv.lock
90
+
91
+ # PEP 582
92
+ __pypackages__/
93
+
94
+ # Celery stuff
95
+ celerybeat-schedule
96
+ celerybeat.pid
97
+
98
+ # SageMath parsed files
99
+ *.sage.py
100
+
101
+ # Environments
102
+ .env
103
+ .venv
104
+ env/
105
+ venv/
106
+ ENV/
107
+ env.bak/
108
+ venv.bak/
109
+
110
+ # Spyder project settings
111
+ .spyderproject
112
+ .spyproject
113
+
114
+ # Rope project settings
115
+ .ropeproject
116
+
117
+ # mkdocs documentation
118
+ /site
119
+
120
+ # mypy
121
+ .mypy_cache/
122
+ .dmypy.json
123
+ dmypy.json
124
+
125
+ # Pyre type checker
126
+ .pyre/
127
+
128
+ # IDEs
129
+ .vscode/
130
+ .idea/
131
+ *.swp
132
+ *.swo
133
+ *~
134
+
135
+ # OS
136
+ .DS_Store
137
+ Thumbs.db
138
+
139
+
140
+ .omc
@@ -0,0 +1,26 @@
1
+ Metadata-Version: 2.4
2
+ Name: datus-spark
3
+ Version: 0.1.0
4
+ Summary: Spark SQL database adapter for Datus
5
+ Project-URL: Homepage, https://github.com/Datus-ai/datus-db-adapters
6
+ Project-URL: Repository, https://github.com/Datus-ai/datus-db-adapters
7
+ Project-URL: Issues, https://github.com/Datus-ai/datus-db-adapters/issues
8
+ Author-email: DatusAI <support@datus.ai>
9
+ License: Apache-2.0
10
+ Keywords: adapter,database,datus,hive,spark
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Python: >=3.12
17
+ Requires-Dist: datus-agent>0.2.1
18
+ Requires-Dist: datus-sqlalchemy>=0.1.0
19
+ Requires-Dist: pure-sasl>=0.6.2
20
+ Requires-Dist: pydantic>=2.0.0
21
+ Requires-Dist: pyhive>=0.7.0
22
+ Requires-Dist: thrift-sasl>=0.4.3
23
+ Requires-Dist: thrift>=0.16.0
24
+ Provides-Extra: test
25
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'test'
26
+ Requires-Dist: pytest>=7.0.0; extra == 'test'
@@ -0,0 +1,203 @@
1
+ # datus-spark
2
+
3
+ Spark SQL database adapter for Datus, connecting via HiveServer2/Thrift protocol.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install datus-spark
9
+ ```
10
+
11
+ This will automatically install the required dependencies, including:
12
+ - `datus-agent`
13
+ - `datus-sqlalchemy`
14
+ - `pyhive`
15
+ - `thrift`
16
+ - `thrift-sasl`
17
+ - `pure-sasl`
18
+
19
+ ## Usage
20
+
21
+ The adapter is automatically registered with Datus when installed. Configure your database connection in your Datus configuration:
22
+
23
+ ```yaml
24
+ database:
25
+ type: spark
26
+ host: localhost
27
+ port: 10000
28
+ username: spark
29
+ database: default
30
+ auth_mechanism: NONE
31
+ ```
32
+
33
+ Or use programmatically:
34
+
35
+ ```python
36
+ from datus_spark import SparkConnector, SparkConfig
37
+
38
+ # Using config object
39
+ config = SparkConfig(
40
+ host="localhost",
41
+ port=10000,
42
+ username="spark",
43
+ password="",
44
+ database="default",
45
+ auth_mechanism="NONE",
46
+ )
47
+ connector = SparkConnector(config)
48
+
49
+ # Or using dict
50
+ connector = SparkConnector({
51
+ "host": "localhost",
52
+ "port": 10000,
53
+ "username": "spark",
54
+ "database": "default",
55
+ })
56
+
57
+ # Test connection
58
+ connector.test_connection()
59
+
60
+ # Execute query
61
+ result = connector.execute({"sql_query": "SELECT * FROM `default`.`my_table` LIMIT 10"})
62
+ print(result.sql_return)
63
+
64
+ # Get table list
65
+ tables = connector.get_tables(database_name="default")
66
+ print(f"Tables: {tables}")
67
+
68
+ # Get table schema
69
+ schema = connector.get_schema(database_name="default", table_name="my_table")
70
+ for column in schema:
71
+ print(f"{column['name']}: {column['type']}")
72
+ ```
73
+
74
+ ## Configuration Options
75
+
76
+ | Option | Type | Default | Description |
77
+ |--------|------|---------|-------------|
78
+ | host | str | "127.0.0.1" | Spark Thrift Server host |
79
+ | port | int | 10000 | Spark Thrift Server port |
80
+ | username | str | (required) | Username |
81
+ | password | str | "" | Password |
82
+ | database | str | None | Default database (falls back to `default`) |
83
+ | auth_mechanism | str | "NONE" | Authentication mechanism (NONE, PLAIN, KERBEROS) |
84
+ | timeout_seconds | int | 30 | Connection timeout in seconds |
85
+
86
+ ## Features
87
+
88
+ - Query execution via Spark SQL (SELECT)
89
+ - DDL execution (CREATE, ALTER, DROP)
90
+ - Metadata retrieval (databases, tables, views, columns)
91
+ - Sample data extraction
92
+ - Multiple result formats (pandas, arrow, csv, list)
93
+ - Connection pooling and management
94
+ - Context manager support
95
+
96
+ ## Testing
97
+
98
+ ### Quick Start
99
+
100
+ ```bash
101
+ cd datus-spark
102
+
103
+ # Unit tests (no database required)
104
+ uv run pytest tests/ -m "not integration" -v
105
+
106
+ # All tests with coverage
107
+ uv run pytest tests/ -v --cov=datus_spark --cov-report=term-missing
108
+ ```
109
+
110
+ ### Integration Tests (Requires Spark Thrift Server)
111
+
112
+ ```bash
113
+ # Start Spark Thrift Server container
114
+ docker compose up -d
115
+
116
+ # Wait for container to become healthy (~60s)
117
+ docker compose ps
118
+
119
+ # Run integration tests
120
+ uv run pytest tests/integration/ -v
121
+
122
+ # Run only TPC-H tests
123
+ uv run pytest tests/integration/test_tpch.py -v
124
+
125
+ # Run acceptance tests (core functionality)
126
+ uv run pytest tests/ -m acceptance -v
127
+
128
+ # Stop Spark
129
+ docker compose down
130
+ ```
131
+
132
+ ### TPC-H Test Data
133
+
134
+ Integration tests include TPC-H benchmark data for realistic query testing. The `tpch_setup` fixture (session-scoped) automatically creates 5 tables with sample data:
135
+
136
+ | Table | Rows | Description |
137
+ |-------|------|-------------|
138
+ | `tpch_region` | 5 | Standard TPC-H regions |
139
+ | `tpch_nation` | 25 | Standard TPC-H nations |
140
+ | `tpch_customer` | 10 | Simplified customer data |
141
+ | `tpch_orders` | 15 | Simplified order data |
142
+ | `tpch_supplier` | 5 | Simplified supplier data |
143
+
144
+ Tables are created at the start of the test session and dropped after all tests complete.
145
+
146
+ #### Initialize TPC-H Data Manually
147
+
148
+ To create TPC-H data for use with Datus (outside of tests):
149
+
150
+ ```bash
151
+ # Basic usage
152
+ uv run python scripts/init_tpch_data.py
153
+
154
+ # Drop existing tables and re-create
155
+ uv run python scripts/init_tpch_data.py --drop
156
+
157
+ # Custom connection
158
+ uv run python scripts/init_tpch_data.py --host 192.168.1.100 --port 10000
159
+ ```
160
+
161
+ ### Test Statistics
162
+
163
+ - **Unit Tests**: 46 tests (config validation, connector logic, identifiers)
164
+ - **Integration Tests**: 24 tests (connection, metadata, SQL execution, TPC-H)
165
+ - **Total**: 70 tests
166
+
167
+ ### Test Markers
168
+
169
+ | Marker | Description |
170
+ |--------|-------------|
171
+ | `integration` | Requires running Spark Thrift Server |
172
+ | `acceptance` | Core functionality validation for CI/CD |
173
+
174
+ ## Development
175
+
176
+ ### Setup
177
+
178
+ ```bash
179
+ # From workspace root
180
+ uv sync --all-packages
181
+
182
+ # Or install in editable mode
183
+ uv pip install -e .
184
+ ```
185
+
186
+ ### Code Quality
187
+
188
+ ```bash
189
+ black datus_spark tests
190
+ isort datus_spark tests
191
+ ruff check datus_spark tests
192
+ ```
193
+
194
+ ## Requirements
195
+
196
+ - Python >= 3.12
197
+ - Apache Spark >= 3.0 with Thrift Server enabled
198
+ - datus-agent > 0.2.1
199
+ - datus-sqlalchemy >= 0.1.0
200
+
201
+ ## License
202
+
203
+ Apache License 2.0
@@ -0,0 +1,16 @@
1
+ # Copyright 2025-present DatusAI, Inc.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See http://www.apache.org/licenses/LICENSE-2.0 for details.
4
+
5
+ from .config import SparkConfig
6
+ from .connector import SparkConnector
7
+
8
+ __version__ = "0.1.0"
9
+ __all__ = ["SparkConnector", "SparkConfig", "register"]
10
+
11
+
12
def register():
    """Add the Spark adapter to the Datus connector registry.

    The adapter is registered under the ``"spark"`` key with
    ``SparkConnector`` as the connector class and ``SparkConfig`` as its
    configuration class.
    """
    # The registry is imported at call time rather than at module import.
    from datus.tools.db_tools import connector_registry as registry

    registry.register("spark", SparkConnector, config_class=SparkConfig)
@@ -0,0 +1,23 @@
1
+ # Copyright 2025-present DatusAI, Inc.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See http://www.apache.org/licenses/LICENSE-2.0 for details.
4
+
5
+ from typing import Literal, Optional
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
+
10
class SparkConfig(BaseModel):
    """Spark SQL (via HiveServer2/Thrift) specific configuration.

    Unknown keys are rejected (``extra="forbid"``), so a misspelled option
    fails validation instead of being silently ignored.
    """

    model_config = ConfigDict(extra="forbid")

    host: str = Field(default="127.0.0.1", description="Spark Thrift Server host")
    # Restrict to the valid TCP port range so a bad value fails at validation
    # time rather than as an opaque connection error later on.
    port: int = Field(default=10000, ge=1, le=65535, description="Spark Thrift Server port")
    username: str = Field(..., description="Spark username")
    password: str = Field(default="", description="Spark password", json_schema_extra={"input_type": "password"})
    # None means "not specified"; the connector decides the fallback database.
    database: Optional[str] = Field(default=None, description="Default database name")
    auth_mechanism: Literal["NONE", "PLAIN", "KERBEROS"] = Field(
        default="NONE", description="Authentication mechanism (NONE, PLAIN, KERBEROS)"
    )
    # A negative timeout is never meaningful, so reject it up front.
    timeout_seconds: int = Field(default=30, ge=0, description="Connection timeout in seconds")
@@ -0,0 +1,236 @@
1
+ # Copyright 2025-present DatusAI, Inc.
2
+ # Licensed under the Apache License, Version 2.0.
3
+ # See http://www.apache.org/licenses/LICENSE-2.0 for details.
4
+
5
+ from typing import Any, Dict, List, Optional, Set, Union, override
6
+ from urllib.parse import quote_plus
7
+
8
+ from datus.utils.loggings import get_logger
9
+ from datus_sqlalchemy import SQLAlchemyConnector
10
+
11
+ from .config import SparkConfig
12
+
13
+ logger = get_logger(__name__)
14
+
15
+ SPARK_DIALECT = "spark"
16
+
17
+
18
class SparkConnector(SQLAlchemyConnector):
    """
    Spark SQL database connector via HiveServer2/Thrift protocol.

    Spark uses a two-level hierarchy: database -> table.
    Connects via the Hive SQLAlchemy dialect (pyhive).
    """

    def __init__(self, config: Union[SparkConfig, dict]):
        """
        Initialize Spark connector.

        Args:
            config: SparkConfig object or dict with configuration. A dict is
                validated by building a SparkConfig from it.

        Raises:
            TypeError: If config is neither a SparkConfig nor a dict.
        """
        # Local import: only needed while assembling the connection URL.
        from urllib.parse import quote

        if isinstance(config, dict):
            config = SparkConfig(**config)
        elif not isinstance(config, SparkConfig):
            raise TypeError(f"config must be SparkConfig or dict, got {type(config)}")

        self.spark_config = config
        self.host = config.host
        self.port = config.port
        self.user = config.username

        # Fall back to Spark's "default" database when none is configured.
        database = config.database or "default"

        # Build connection string: hive://user:pass@host:port/database
        # Credentials are percent-encoded with quote(..., safe="") rather than
        # quote_plus: quote_plus writes a space as "+", and a URL parser that
        # only percent-decodes the userinfo reads that back as a literal plus
        # sign. "%20" / "%2B" decode correctly either way.
        auth_part = quote(config.username, safe="")
        if config.password:
            auth_part += ":" + quote(config.password, safe="")
        auth_part += "@"

        connection_string = f"hive://{auth_part}{config.host}:{config.port}/{database}"

        # NOTE(review): the mechanism name is forwarded verbatim as the `auth`
        # query parameter; confirm the underlying driver accepts every value
        # SparkConfig allows (e.g. "PLAIN").
        if config.auth_mechanism and config.auth_mechanism != "NONE":
            connection_string += f"?auth={config.auth_mechanism}"

        super().__init__(connection_string, dialect=SPARK_DIALECT, timeout_seconds=config.timeout_seconds)

        self.dialect = SPARK_DIALECT
        self.database_name = database

    # ==================== Context Manager Support ====================

    def __enter__(self):
        """Context manager entry: connect and return this connector."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit with cleanup.

        Returns False so an exception raised inside the ``with`` body is not
        suppressed.
        """
        self.close()
        return False

    # ==================== System Resources ====================

    @override
    def _sys_databases(self) -> Set[str]:
        """System databases to filter out."""
        return {"information_schema"}

    @override
    def _sys_schemas(self) -> Set[str]:
        """System schemas to filter out (same as databases for Spark)."""
        return self._sys_databases()

    # ==================== Metadata Retrieval ====================

    @override
    def get_databases(self, catalog_name: str = "", include_sys: bool = False) -> List[str]:
        """Get list of databases.

        Args:
            catalog_name: Unused by this connector.
            include_sys: When False, databases named in ``_sys_databases()``
                are dropped (compared case-insensitively).

        Returns:
            Database names taken from the first column of ``SHOW DATABASES``.
        """
        result = self._execute_pandas("SHOW DATABASES")
        if result.empty:
            return []
        databases = result.iloc[:, 0].tolist()
        if not include_sys:
            sys_dbs = self._sys_databases()
            databases = [d for d in databases if d.lower() not in sys_dbs]
        return databases

    @override
    def get_schemas(self, catalog_name: str = "", database_name: str = "", include_sys: bool = False) -> List[str]:
        """Spark doesn't have separate schemas, return empty list."""
        return []

    @staticmethod
    def _listing_names(listing: Any) -> List[str]:
        """Extract object names from a SHOW TABLES / SHOW VIEWS result frame."""
        if listing.empty:
            return []
        # SHOW TABLES returns (namespace, tableName, isTemporary) in Spark 3.x.
        # Use the second column (the name) when available, otherwise the first.
        if len(listing.columns) >= 2:
            name_col = listing.columns[1]
        else:
            name_col = listing.columns[0]
        return listing[name_col].tolist()

    @override
    def get_tables(self, catalog_name: str = "", database_name: str = "", schema_name: str = "") -> List[str]:
        """Get list of table names.

        Args:
            catalog_name: Unused by this connector.
            database_name: Database to list; defaults to the current database.
            schema_name: Unused by this connector.

        Returns:
            Names reported by ``SHOW TABLES IN <database>``.
        """
        db = database_name or self.database_name
        result = self._execute_pandas(f"SHOW TABLES IN {self._quote_identifier(db)}")
        return self._listing_names(result)

    @override
    def get_views(self, catalog_name: str = "", database_name: str = "", schema_name: str = "") -> List[str]:
        """Get list of view names.

        Best-effort: any failure of ``SHOW VIEWS`` is logged as a warning and
        an empty list is returned instead of raising.
        """
        db = database_name or self.database_name
        try:
            result = self._execute_pandas(f"SHOW VIEWS IN {self._quote_identifier(db)}")
            return self._listing_names(result)
        except Exception as e:
            logger.warning(f"Failed to get views: {e}")
            return []

    @staticmethod
    def _cell_text(value: Any) -> Optional[str]:
        """Return a result cell as text, or None for a NULL / NaN cell."""
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return None
        return str(value)

    @override
    def get_schema(
        self, catalog_name: str = "", database_name: str = "", schema_name: str = "", table_name: str = ""
    ) -> List[Dict[str, Any]]:
        """Get table schema information using DESCRIBE.

        Args:
            catalog_name: Unused by this connector.
            database_name: Database of the table; defaults to the current one.
            schema_name: Unused by this connector.
            table_name: Table to describe. An empty name yields an empty list.

        Returns:
            One dict per column with the keys ``cid``, ``name``, ``type``,
            ``nullable``, ``default_value``, ``pk`` and ``comment``.
        """
        if not table_name:
            return []

        db = database_name or self.database_name
        full_name = self.full_name(database_name=db, table_name=table_name)

        query_result = self._execute_pandas(f"DESCRIBE {full_name}")
        column_count = len(query_result.columns)

        columns: List[Dict[str, Any]] = []
        for row in query_result.itertuples(index=False, name=None):
            col_name = row[0]
            name_text = self._cell_text(col_name)
            # Blank separator rows carry no column information.
            if name_text is None or name_text.strip() == "":
                continue
            # The first "#" row (e.g. "# Partition Information") starts a
            # metadata section that lists partition columns again; stop there
            # so those columns are not reported twice.
            if name_text.lstrip().startswith("#"):
                break
            data_type = self._cell_text(row[1]) if column_count > 1 else ""
            columns.append(
                {
                    "cid": len(columns),
                    "name": col_name,
                    "type": data_type or "",
                    "nullable": True,  # Spark doesn't expose nullable in DESCRIBE
                    "default_value": None,
                    "pk": False,
                    # A NULL comment stays None instead of the string "None".
                    "comment": self._cell_text(row[2]) if column_count > 2 else None,
                }
            )
        return columns

    # ==================== Database Management ====================

    @override
    def _sqlalchemy_schema(
        self, catalog_name: str = "", database_name: str = "", schema_name: str = ""
    ) -> Optional[str]:
        """Get schema name for SQLAlchemy Inspector (database name in Spark)."""
        return database_name or self.database_name

    @override
    def do_switch_context(self, catalog_name: str = "", database_name: str = "", schema_name: str = ""):
        """Switch database context using USE statement.

        Does nothing when ``database_name`` is empty. On success the
        connector's current database is updated to ``database_name``.
        """
        if database_name:
            from sqlalchemy import text

            with self.engine.connect() as conn:
                conn.execute(text(f"USE {self._quote_identifier(database_name)}"))
                conn.commit()
            self.database_name = database_name

    # ==================== Utility Methods ====================

    @staticmethod
    def _quote_identifier(identifier: str) -> str:
        """Safely wrap identifiers with backticks for Spark.

        Embedded backticks are doubled so they cannot end the quoting early.
        """
        escaped = identifier.replace("`", "``")
        return f"`{escaped}`"

    @override
    def full_name(
        self, catalog_name: str = "", database_name: str = "", schema_name: str = "", table_name: str = ""
    ) -> str:
        """
        Build fully-qualified table name.

        Spark format: `database`.`table`

        Falls back to the connector's current database when ``database_name``
        is empty, and to the bare quoted table name when neither is set.
        """
        db = database_name or self.database_name
        if db:
            return f"{self._quote_identifier(db)}.{self._quote_identifier(table_name)}"
        return self._quote_identifier(table_name)

    def to_dict(self) -> Dict[str, Any]:
        """Convert connector to serializable dictionary (password excluded)."""
        return {
            "db_type": SPARK_DIALECT,
            "host": self.host,
            "port": self.port,
            "user": self.user,
            "database": self.database_name,
        }

    def get_type(self) -> str:
        """Return the database type."""
        return SPARK_DIALECT

    @override
    def test_connection(self) -> bool:
        """Test the database connection.

        Delegates to the base implementation and always closes the connector
        afterwards; an error raised by that cleanup is logged at debug level
        and ignored so it cannot mask the test result.
        """
        try:
            return super().test_connection()
        finally:
            try:
                self.close()
            except Exception as e:
                logger.debug(f"Ignoring cleanup error during test: {e}")
@@ -0,0 +1,25 @@
1
+ services:
2
+ spark-thrift:
3
+ image: apache/spark:3.5.0
4
+ container_name: datus-spark-test
5
+ command: >
6
+ /opt/spark/sbin/start-thriftserver.sh
7
+ --master local[*]
8
+ --hiveconf hive.server2.thrift.port=10000
9
+ --hiveconf hive.server2.thrift.bind.host=0.0.0.0
10
+ ports:
11
+ - "10000:10000" # Thrift port
12
+ - "4040:4040" # Spark UI
13
+ environment:
14
+ - SPARK_NO_DAEMONIZE=true
15
+ healthcheck:
16
+ test: ["CMD-SHELL", "bash -c '(echo > /dev/tcp/localhost/10000) 2>/dev/null || exit 1'"]
17
+ interval: 10s
18
+ timeout: 5s
19
+ retries: 15
20
+ start_period: 60s
21
+ volumes:
22
+ - spark_data:/opt/spark/work-dir
23
+
24
+ volumes:
25
+ spark_data: