nurion_raydp-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nurion_raydp-1.7.0.dist-info/METADATA ADDED
@@ -0,0 +1,12 @@
+ Metadata-Version: 2.4
+ Name: nurion-raydp
+ Version: 1.7.0
+ Summary: RayDP: Run Apache Spark on Ray
+ Author: RayDP Contributors
+ License: Apache-2.0
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: ray[default]>=2.0.0
+ Requires-Dist: pyarrow>=8.0.0
+ Requires-Dist: pandas>=1.0.0
+ Requires-Dist: pyspark>=3.4.0
nurion_raydp-1.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+ raydp/__init__.py,sha256=8hhb07XiSy5KxpvY73RA3OrvOkrjFiLUTw7jg6OBjBQ,1000
+ raydp/_build_hooks.py,sha256=oAWOMDgbS08GmTRxhWoL9XHz40biFab8kCd_KnaBqCo,4964
+ raydp/context.py,sha256=QG26PIh-g_5qVeDZVu3OXdVPaG1olyNdtekv_CQPms4,9324
+ raydp/setup.py,sha256=bBggttP3c7EEJ-bDDoMAruGPrGfqp0VOFm2L0-7CqYQ,1495
+ raydp/utils.py,sha256=MIet19DP2AJhiIAiujjt_tgjv3V_QfrXDjYLwaelF0g,12312
+ raydp/jars/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ raydp/jars/raydp-1.7.0-SNAPSHOT.jar,sha256=oWFpztnKnLvLJtGGi4gwhHEAGp-ANwbFb3NSg9ACdOk,27769274
+ raydp/jars/raydp-shims-common-1.7.0-SNAPSHOT.jar,sha256=9JqfYderFzI9ama6AsVdb7yMW6RLIokuXYovilhyvCU,18437
+ raydp/jars/raydp-shims-spark340-1.7.0-SNAPSHOT.jar,sha256=L5TJrn1jm-gO2oFszHcgq5n4m_KoDxLthL1iQiPEOE8,18138
+ raydp/jars/raydp-shims-spark350-1.7.0-SNAPSHOT.jar,sha256=xFY8vzvsFLNu8s8N8CROu5-GH7pSqmSfcSUPKrTgUWs,18351
+ raydp/spark/__init__.py,sha256=Kpb_kbfA38pccVOBv6MSHGFpzhtQ9ybZBvpRL7BaQos,1203
+ raydp/spark/dataset.py,sha256=OVib_wC0iRICR-OpTtakuUjXD0kZ0-_2DsebvMO96Wo,8709
+ raydp/spark/ray_cluster.py,sha256=K0_DCE9QJ95Qol3reFydeFU0cj6lV5q3qpWeV-VciFw,6428
+ raydp/spark/ray_cluster_master.py,sha256=8fRM-Buqhhnh8xLQarlj5aY9HiiRQrPuSZE7m6Dt-68,3681
+ raydp/spark/ray_pyworker.py,sha256=yYZ8-i6VVNF_JuD3OczlIpQeBwLeJR8e94JPlpkh9Yg,5131
+ nurion_raydp-1.7.0.dist-info/METADATA,sha256=cjV3eRhchpzKyuyKtc9a0V_CgDRAH0xu540FbL6Cx0s,331
+ nurion_raydp-1.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ nurion_raydp-1.7.0.dist-info/top_level.txt,sha256=2ZCZ27XGfc3HF1QqtUdpbb0pc0rCtALdvl_IOl97wlU,6
+ nurion_raydp-1.7.0.dist-info/RECORD,,
nurion_raydp-1.7.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
nurion_raydp-1.7.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ raydp
raydp/__init__.py ADDED
@@ -0,0 +1,26 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from raydp.context import init_spark, stop_spark, start_connect_server
+ from raydp.utils import code_search_path
+
+ __all__ = [
+     "init_spark",
+     "stop_spark",
+     "start_connect_server",
+     "code_search_path",
+ ]
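For context, a minimal usage sketch (not part of the wheel) of the API exported here, based on the init_spark signature defined in raydp/context.py below; the application name and resource values are illustrative assumptions.

import raydp

# Start a Spark-on-Ray cluster and obtain a SparkSession
# (executor count and sizes are illustrative).
spark = raydp.init_spark(
    app_name="example",
    num_executors=2,
    executor_cores=2,
    executor_memory="4g",
)

df = spark.range(100)
print(df.count())

# Tear down the Spark cluster and release the Ray resources it holds.
raydp.stop_spark()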
raydp/_build_hooks.py ADDED
@@ -0,0 +1,139 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """
+ Custom build hooks for the raydp package.
+ Handles JAR file preparation during the build process.
+ """
+
+ import glob
+ import os
+ import subprocess
+ import sys
+ from shutil import copy2
+ from setuptools.command.build_py import build_py as _build_py
+ from setuptools.command.sdist import sdist as _sdist
+
+ # JAR files go to the jars/ directory (which maps to raydp.jars via package-dir)
+ JARS_TARGET = "jars"
+
+
+ class BuildWithJars(_build_py):
+     """Custom build_py command that handles JAR files."""
+
+     def run(self):
+         # Set up JAR files before building
+         self.setup_jars()
+
+         # Run the normal build
+         super().run()
+
+     def setup_jars(self):
+         """Set up JAR files for packaging."""
+         # The Java directory is a subdirectory of the raydp package
+         CORE_DIR = os.path.abspath(
+             os.path.join(os.path.dirname(os.path.abspath(__file__)), "java")
+         )
+
+         # Build JAR files using Maven
+         self.build_jars(CORE_DIR)
+
+         JARS_PATH = glob.glob(
+             os.path.join(CORE_DIR, "**/target/raydp-*.jar"), recursive=True
+         ) + glob.glob(os.path.join(CORE_DIR, "thirdparty/*.jar"))
+
+         if len(JARS_PATH) == 0:
+             print(
+                 "Can't find core module jars after Maven build. Build may have failed.",
+                 file=sys.stderr,
+             )
+             raise RuntimeError("JAR files not found after Maven build")
+
+         # Clean up the existing jars directory if it exists
+         # (remove only JAR files, not the entire directory)
+         if os.path.exists(JARS_TARGET):
+             for jar_file in glob.glob(os.path.join(JARS_TARGET, "*.jar")):
+                 try:
+                     os.remove(jar_file)
+                     print(f"Removed existing JAR file: {jar_file}")
+                 except OSError as e:
+                     print(f"Failed to remove {jar_file}: {e}", file=sys.stderr)
+
+         try:
+             os.makedirs(JARS_TARGET, exist_ok=True)
+         except Exception as e:
+             print(f"Failed to create temp directories: {e}", file=sys.stderr)
+             raise
+
+         try:
+             for jar_path in JARS_PATH:
+                 print(f"Copying {jar_path} to {JARS_TARGET}")
+                 copy2(jar_path, JARS_TARGET)
+             print(f"Successfully copied {len(JARS_PATH)} JAR files")
+         except Exception as e:
+             print(f"Failed to copy JAR files: {e}", file=sys.stderr)
+             raise
+
+     def build_jars(self, core_dir):
+         """Build JAR files using Maven."""
+         # Check whether Maven is available
+         try:
+             subprocess.run(["mvn", "--version"], check=True, capture_output=True)
+         except (subprocess.CalledProcessError, FileNotFoundError):
+             print("Maven (mvn) could not be found. Please install Maven first.", file=sys.stderr)
+             raise RuntimeError("Maven not found")
+
+         print(f"Building JAR files in {core_dir}")
+
+         # Save the current working directory
+         original_dir = os.getcwd()
+
+         try:
+             # Change to the core directory and run the Maven build
+             os.chdir(core_dir)
+             print("Running: mvn clean package -DskipTests")
+
+             subprocess.run(
+                 ["mvn", "clean", "package", "-DskipTests"],
+                 check=True,
+                 capture_output=False,  # Let Maven output be visible
+             )
+
+             print("Maven build completed successfully")
+
+         except subprocess.CalledProcessError as e:
+             print(f"Maven build failed with exit code {e.returncode}", file=sys.stderr)
+             raise RuntimeError(f"Maven build failed: {e}")
+         except Exception as e:
+             print(f"Failed to run Maven build: {e}", file=sys.stderr)
+             raise
+         finally:
+             # Always restore the original working directory
+             os.chdir(original_dir)
+
+
+ class SdistWithJars(_sdist):
+     """Custom sdist command that handles JAR files."""
+
+     def run(self):
+         # Set up JAR files before creating the source distribution
+         build_cmd = BuildWithJars(self.distribution)
+         build_cmd.setup_jars()
+
+         # Run the normal sdist
+         super().run()
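For illustration, a small sketch (not shipped in the package) of how the JAR-preparation step can be exercised on its own. It mirrors what SdistWithJars.run() does above and assumes Maven and the java/ sources are present; the empty Distribution is only there to satisfy the setuptools command constructor.

from setuptools.dist import Distribution

from raydp._build_hooks import BuildWithJars

# Mirror SdistWithJars.run(): build the JARs with Maven and copy them
# into the jars/ directory, without running the full build_py step.
dist = Distribution()      # empty Distribution, for illustration only
cmd = BuildWithJars(dist)  # same construction SdistWithJars uses
cmd.setup_jars()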
raydp/context.py ADDED
@@ -0,0 +1,238 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import atexit
+ import logging
+ from contextlib import ContextDecorator
+ from threading import RLock
+ from typing import Dict, Union, Optional
+
+ import ray
+ from pyspark.sql import SparkSession
+
+ from raydp.spark import SparkCluster
+ from raydp.utils import auto_infer_executor_config
+
+
+ class _SparkContext(ContextDecorator):
+     """A class used to create the Spark cluster and get the Spark session.
+
+     :param app_name: the Spark application name
+     :param configs: the extra Spark configs to set
+     """
+
+     def __init__(
+         self,
+         app_name: str,
+         configs: Dict[str, str],
+         logging_level: str = "warn",
+     ):
+         self._app_name = app_name
+         self._logging_level = logging_level
+
+         self._configs = configs
+
+         self._spark_cluster: Optional[SparkCluster] = None
+         self._spark_session: Optional[SparkSession] = None
+
+     def _get_or_create_spark_cluster(self) -> SparkCluster:
+         if self._spark_cluster is not None:
+             return self._spark_cluster
+         py4j_logger = logging.getLogger("py4j")
+         py4j_logger.setLevel(logging.WARNING)
+         self._spark_cluster = SparkCluster(
+             self._app_name,
+             self._configs,
+             self._logging_level,
+         )
+         return self._spark_cluster
+
+     def get_or_create_session(self):
+         if self._spark_session is not None:
+             return self._spark_session
+         spark_cluster = self._get_or_create_spark_cluster()
+         self._spark_session = spark_cluster.get_spark_session()
+
+         return self._spark_session
+
+     def start_connect_server(self) -> int:
+         if self._spark_session is None:
+             raise Exception(
+                 "The Spark cluster has not been created; please call get_or_create_session first."
+             )
+         return self._spark_session._jvm.org.apache.spark.sql.connect.ConnectServer.start()
+
+     def stop(self, cleanup_data=True):
+         if self._spark_session is not None:
+             self._spark_session.stop()
+             self._spark_session = None
+         if self._spark_cluster is not None:
+             self._spark_cluster.stop(cleanup_data)
+             if cleanup_data:
+                 self._spark_cluster = None
+                 if self._configs is not None:
+                     self._configs = None
+
+     def __enter__(self):
+         self.get_or_create_session()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.stop()
+
+
+ _spark_context_lock = RLock()
+ _global_spark_context: Optional[_SparkContext] = None
+
+
+ def init_spark(
+     app_name: str,
+     executor_cores: Optional[int] = None,
+     executor_memory: Optional[Union[str, int]] = None,
+     num_executors: Optional[int] = None,
+     configs: Optional[Dict[str, str]] = None,
+     log_to_driver: bool = False,
+     logging_level: str = "warn",
+     dynamic_allocation: bool = False,
+     min_executors: Optional[int] = None,
+     max_executors: Optional[int] = None,
+     auto_configure: bool = False,
+ ) -> SparkSession:
+     """
+     Initialize a Spark cluster with the given requirements.
+
+     :param app_name: the application name
+     :param executor_cores: the number of CPU cores for each executor. If None and
+            auto_configure=True, it will be inferred from cluster resources.
+     :param executor_memory: the memory size for each executor, given as bytes or a
+            human-readable string. If None and auto_configure=True, it will be
+            inferred from cluster resources.
+     :param num_executors: the number of executors requested. If None and auto_configure=True,
+            it will be inferred from cluster resources.
+     :param configs: the extra Spark configs to set
+     :param log_to_driver: whether to forward Spark logs to the driver; defaults to False,
+            set it to True when debugging
+     :param dynamic_allocation: whether to enable Spark dynamic allocation
+     :param min_executors: minimum number of executors for dynamic allocation
+     :param max_executors: maximum number of executors for dynamic allocation
+     :param auto_configure: if True and executor_cores/executor_memory/num_executors are not
+            provided, automatically infer them from Ray cluster resources.
+     :return: the SparkSession
+     """
+     logger = logging.getLogger(__name__)
+
+     if not ray.is_initialized():
+         # Ray has not been initialized yet; start a local instance
+         ray.init(log_to_driver=log_to_driver, logging_level=logging_level)
+
+     # Defensive copy to avoid mutating the caller's dict
+     _configs = {} if configs is None else configs.copy()
+
+     # Auto-configure executor settings if requested and not explicitly provided
+     if auto_configure:
+         inferred_config = auto_infer_executor_config()
+
+         if executor_cores is None:
+             executor_cores = inferred_config.executor_cores
+             logger.info(f"Auto-configured executor_cores: {executor_cores}")
+
+         if executor_memory is None:
+             executor_memory = inferred_config.executor_memory
+             logger.info(f"Auto-configured executor_memory: {executor_memory}")
+
+         if num_executors is None and not dynamic_allocation:
+             num_executors = inferred_config.num_executors
+             logger.info(f"Auto-configured num_executors: {num_executors}")
+
+         # Also set driver memory if not already in configs
+         if "spark.driver.memory" not in _configs:
+             _configs["spark.driver.memory"] = inferred_config.driver_memory
+             logger.info(f"Auto-configured driver_memory: {inferred_config.driver_memory}")
+
+     # Validate required parameters
+     if executor_cores is None:
+         raise ValueError(
+             "executor_cores is required. Either provide it explicitly or set auto_configure=True."
+         )
+     if executor_memory is None:
+         raise ValueError(
+             "executor_memory is required. Either provide it explicitly or set auto_configure=True."
+         )
+
+     with _spark_context_lock:
+         global _global_spark_context
+         if dynamic_allocation:
+             _configs["spark.dynamicAllocation.enabled"] = "true"
+             assert min_executors is not None, (
+                 "min_executors is required when dynamic_allocation is enabled"
+             )
+             assert max_executors is not None, (
+                 "max_executors is required when dynamic_allocation is enabled"
+             )
+             _configs["spark.dynamicAllocation.minExecutors"] = str(min_executors)
+             _configs["spark.dynamicAllocation.maxExecutors"] = str(max_executors)
+             _configs["spark.executor.instances"] = str(min_executors)
+         else:
+             if num_executors is None:
+                 raise ValueError(
+                     "num_executors is required when dynamic_allocation is disabled. "
+                     "Either provide it explicitly or set auto_configure=True."
+                 )
+             _configs["spark.dynamicAllocation.enabled"] = "false"
+             _configs["spark.executor.instances"] = str(num_executors)
+         _configs["spark.executor.cores"] = str(executor_cores)
+         _configs["spark.executor.memory"] = str(executor_memory)
+
+         if _global_spark_context is None:
+             try:
+                 _global_spark_context = _SparkContext(
+                     app_name,
+                     _configs,
+                     logging_level,
+                 )
+                 return _global_spark_context.get_or_create_session()
+             except:
+                 if _global_spark_context is not None:
+                     _global_spark_context.stop()
+                     _global_spark_context = None
+                 raise
+         else:
+             raise Exception("The Spark environment has already been initialized.")
+
+
+ def start_connect_server() -> int:
+     with _spark_context_lock:
+         global _global_spark_context
+         if _global_spark_context is not None:
+             port = _global_spark_context.start_connect_server()
+             if port < 0:
+                 raise Exception(
+                     "The Spark Connect server failed to start: no available port was found; please check the Spark logs."
+                 )
+             return port
+         raise Exception("The Spark environment has not been initialized; please call init_spark first.")
+
+
+ def stop_spark(cleanup_data=True):
+     with _spark_context_lock:
+         global _global_spark_context
+         if _global_spark_context is not None:
+             _global_spark_context.stop(cleanup_data)
+             if cleanup_data:
+                 _global_spark_context = None
+
+
+ atexit.register(stop_spark)
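A minimal sketch (not part of the package) of the dynamic-allocation and Spark Connect paths exposed above. The executor sizes and bounds are illustrative; min_executors and max_executors are mandatory here because of the assertions in init_spark.

import raydp

# Dynamic allocation: both bounds are required by init_spark's assertions.
spark = raydp.init_spark(
    app_name="dynamic-example",
    executor_cores=2,
    executor_memory="4g",
    dynamic_allocation=True,
    min_executors=1,
    max_executors=4,
)

# Optionally expose the session over Spark Connect; this raises if no port is available.
port = raydp.start_connect_server()
print(f"Spark Connect server started on port {port}")

raydp.stop_spark()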
raydp/jars/__init__.py ADDED
File without changes
raydp/jars/raydp-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-common-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-spark340-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-spark350-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/setup.py ADDED
@@ -0,0 +1,44 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """
+ Setup script for the raydp package.
+ Uses pyproject.toml for metadata but provides custom build hooks for JAR files.
+ """
+
+ import importlib.util
+ import os
+
+ from setuptools import setup
+
+ # Load _build_hooks directly without triggering raydp/__init__.py
+ _build_hooks_path = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "_build_hooks.py"
+ )
+ spec = importlib.util.spec_from_file_location("_build_hooks", _build_hooks_path)
+ _build_hooks = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(_build_hooks)
+
+ BuildWithJars = _build_hooks.BuildWithJars
+ SdistWithJars = _build_hooks.SdistWithJars
+
+ setup(
+     cmdclass={
+         "build_py": BuildWithJars,
+         "sdist": SdistWithJars,
+     },
+ )
raydp/spark/__init__.py ADDED
@@ -0,0 +1,34 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from .dataset import (
+     PartitionObjectsOwner,
+     get_raydp_master_owner,
+     spark_dataframe_to_ray_dataset,
+     ray_dataset_to_spark_dataframe,
+     from_spark_recoverable,
+ )
+ from .ray_cluster import SparkCluster
+
+ __all__ = [
+     "SparkCluster",
+     "PartitionObjectsOwner",
+     "get_raydp_master_owner",
+     "spark_dataframe_to_ray_dataset",
+     "ray_dataset_to_spark_dataframe",
+     "from_spark_recoverable",
+ ]