nurion_raydp-1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nurion_raydp-1.7.0.dist-info/METADATA ADDED
@@ -0,0 +1,12 @@
+ Metadata-Version: 2.4
+ Name: nurion-raydp
+ Version: 1.7.0
+ Summary: RayDP: Run Apache Spark on Ray
+ Author: RayDP Contributors
+ License: Apache-2.0
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ Requires-Dist: ray[default]>=2.0.0
+ Requires-Dist: pyarrow>=8.0.0
+ Requires-Dist: pandas>=1.0.0
+ Requires-Dist: pyspark>=3.4.0
nurion_raydp-1.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,19 @@
+ raydp/__init__.py,sha256=8hhb07XiSy5KxpvY73RA3OrvOkrjFiLUTw7jg6OBjBQ,1000
+ raydp/_build_hooks.py,sha256=oAWOMDgbS08GmTRxhWoL9XHz40biFab8kCd_KnaBqCo,4964
+ raydp/context.py,sha256=QG26PIh-g_5qVeDZVu3OXdVPaG1olyNdtekv_CQPms4,9324
+ raydp/setup.py,sha256=bBggttP3c7EEJ-bDDoMAruGPrGfqp0VOFm2L0-7CqYQ,1495
+ raydp/utils.py,sha256=MIet19DP2AJhiIAiujjt_tgjv3V_QfrXDjYLwaelF0g,12312
+ raydp/jars/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ raydp/jars/raydp-1.7.0-SNAPSHOT.jar,sha256=oWFpztnKnLvLJtGGi4gwhHEAGp-ANwbFb3NSg9ACdOk,27769274
+ raydp/jars/raydp-shims-common-1.7.0-SNAPSHOT.jar,sha256=9JqfYderFzI9ama6AsVdb7yMW6RLIokuXYovilhyvCU,18437
+ raydp/jars/raydp-shims-spark340-1.7.0-SNAPSHOT.jar,sha256=L5TJrn1jm-gO2oFszHcgq5n4m_KoDxLthL1iQiPEOE8,18138
+ raydp/jars/raydp-shims-spark350-1.7.0-SNAPSHOT.jar,sha256=xFY8vzvsFLNu8s8N8CROu5-GH7pSqmSfcSUPKrTgUWs,18351
+ raydp/spark/__init__.py,sha256=Kpb_kbfA38pccVOBv6MSHGFpzhtQ9ybZBvpRL7BaQos,1203
+ raydp/spark/dataset.py,sha256=OVib_wC0iRICR-OpTtakuUjXD0kZ0-_2DsebvMO96Wo,8709
+ raydp/spark/ray_cluster.py,sha256=K0_DCE9QJ95Qol3reFydeFU0cj6lV5q3qpWeV-VciFw,6428
+ raydp/spark/ray_cluster_master.py,sha256=8fRM-Buqhhnh8xLQarlj5aY9HiiRQrPuSZE7m6Dt-68,3681
+ raydp/spark/ray_pyworker.py,sha256=yYZ8-i6VVNF_JuD3OczlIpQeBwLeJR8e94JPlpkh9Yg,5131
+ nurion_raydp-1.7.0.dist-info/METADATA,sha256=cjV3eRhchpzKyuyKtc9a0V_CgDRAH0xu540FbL6Cx0s,331
+ nurion_raydp-1.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+ nurion_raydp-1.7.0.dist-info/top_level.txt,sha256=2ZCZ27XGfc3HF1QqtUdpbb0pc0rCtALdvl_IOl97wlU,6
+ nurion_raydp-1.7.0.dist-info/RECORD,,
nurion_raydp-1.7.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
nurion_raydp-1.7.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ raydp
raydp/__init__.py ADDED
@@ -0,0 +1,26 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from raydp.context import init_spark, stop_spark, start_connect_server
+ from raydp.utils import code_search_path
+
+ __all__ = [
+     "init_spark",
+     "stop_spark",
+     "start_connect_server",
+     "code_search_path",
+ ]
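For context, a minimal usage sketch (not part of the wheel) of the API exported here, based on the init_spark signature defined in raydp/context.py below; the application name and resource values are illustrative assumptions.

import raydp

# Start a Spark-on-Ray cluster and obtain a SparkSession
# (executor count and sizes are illustrative).
spark = raydp.init_spark(
    app_name="example",
    num_executors=2,
    executor_cores=2,
    executor_memory="4g",
)

df = spark.range(100)
print(df.count())

# Tear down the Spark cluster and release the Ray resources it holds.
raydp.stop_spark()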
raydp/_build_hooks.py ADDED
@@ -0,0 +1,139 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """
+ Custom build hooks for the raydp package.
+ Handles JAR file preparation during the build process.
+ """
+
+ import glob
+ import os
+ import subprocess
+ import sys
+ from shutil import copy2
+ from setuptools.command.build_py import build_py as _build_py
+ from setuptools.command.sdist import sdist as _sdist
+
+ # JAR files go to the jars/ directory (which maps to raydp.jars via package-dir)
+ JARS_TARGET = "jars"
+
+
+ class BuildWithJars(_build_py):
+     """Custom build_py command that handles JAR files."""
+
+     def run(self):
+         # Set up JAR files before building
+         self.setup_jars()
+
+         # Run the normal build
+         super().run()
+
+     def setup_jars(self):
+         """Set up JAR files for packaging."""
+         # The Java directory is a subdirectory of the raydp package
+         CORE_DIR = os.path.abspath(
+             os.path.join(os.path.dirname(os.path.abspath(__file__)), "java")
+         )
+
+         # Build JAR files using Maven
+         self.build_jars(CORE_DIR)
+
+         JARS_PATH = glob.glob(
+             os.path.join(CORE_DIR, "**/target/raydp-*.jar"), recursive=True
+         ) + glob.glob(os.path.join(CORE_DIR, "thirdparty/*.jar"))
+
+         if len(JARS_PATH) == 0:
+             print(
+                 "Can't find core module jars after Maven build. Build may have failed.",
+                 file=sys.stderr,
+             )
+             raise RuntimeError("JAR files not found after Maven build")
+
+         # Clean up the existing jars directory if it exists
+         # (remove only JAR files, not the entire directory)
+         if os.path.exists(JARS_TARGET):
+             for jar_file in glob.glob(os.path.join(JARS_TARGET, "*.jar")):
+                 try:
+                     os.remove(jar_file)
+                     print(f"Removed existing JAR file: {jar_file}")
+                 except OSError as e:
+                     print(f"Failed to remove {jar_file}: {e}", file=sys.stderr)
+
+         try:
+             os.makedirs(JARS_TARGET, exist_ok=True)
+         except Exception as e:
+             print(f"Failed to create temp directories: {e}", file=sys.stderr)
+             raise
+
+         try:
+             for jar_path in JARS_PATH:
+                 print(f"Copying {jar_path} to {JARS_TARGET}")
+                 copy2(jar_path, JARS_TARGET)
+             print(f"Successfully copied {len(JARS_PATH)} JAR files")
+         except Exception as e:
+             print(f"Failed to copy JAR files: {e}", file=sys.stderr)
+             raise
+
+     def build_jars(self, core_dir):
+         """Build JAR files using Maven."""
+         # Check whether Maven is available
+         try:
+             subprocess.run(["mvn", "--version"], check=True, capture_output=True)
+         except (subprocess.CalledProcessError, FileNotFoundError):
+             print("Maven (mvn) could not be found. Please install Maven first.", file=sys.stderr)
+             raise RuntimeError("Maven not found")
+
+         print(f"Building JAR files in {core_dir}")
+
+         # Save the current working directory
+         original_dir = os.getcwd()
+
+         try:
+             # Change to the core directory and run the Maven build
+             os.chdir(core_dir)
+             print("Running: mvn clean package -DskipTests")
+
+             subprocess.run(
+                 ["mvn", "clean", "package", "-DskipTests"],
+                 check=True,
+                 capture_output=False,  # Let Maven output be visible
+             )
+
+             print("Maven build completed successfully")
+
+         except subprocess.CalledProcessError as e:
+             print(f"Maven build failed with exit code {e.returncode}", file=sys.stderr)
+             raise RuntimeError(f"Maven build failed: {e}")
+         except Exception as e:
+             print(f"Failed to run Maven build: {e}", file=sys.stderr)
+             raise
+         finally:
+             # Always restore the original working directory
+             os.chdir(original_dir)
+
+
+ class SdistWithJars(_sdist):
+     """Custom sdist command that handles JAR files."""
+
+     def run(self):
+         # Set up JAR files before creating the source distribution
+         build_cmd = BuildWithJars(self.distribution)
+         build_cmd.setup_jars()
+
+         # Run the normal sdist
+         super().run()
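For illustration, a small sketch (not shipped in the package) of how the JAR-preparation step can be exercised on its own. It mirrors what SdistWithJars.run() does above and assumes Maven and the java/ sources are present; the empty Distribution is only there to satisfy the setuptools command constructor.

from setuptools.dist import Distribution

from raydp._build_hooks import BuildWithJars

# Mirror SdistWithJars.run(): build the JARs with Maven and copy them
# into the jars/ directory, without running the full build_py step.
dist = Distribution()      # empty Distribution, for illustration only
cmd = BuildWithJars(dist)  # same construction SdistWithJars uses
cmd.setup_jars()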
raydp/context.py ADDED
@@ -0,0 +1,238 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import atexit
+ import logging
+ from contextlib import ContextDecorator
+ from threading import RLock
+ from typing import Dict, Union, Optional
+
+ import ray
+ from pyspark.sql import SparkSession
+
+ from raydp.spark import SparkCluster
+ from raydp.utils import auto_infer_executor_config
+
+
+ class _SparkContext(ContextDecorator):
+     """A class used to create the Spark cluster and get the Spark session.
+
+     :param app_name: the Spark application name
+     :param configs: the extra Spark configs to set
+     """
+
+     def __init__(
+         self,
+         app_name: str,
+         configs: Dict[str, str],
+         logging_level: str = "warn",
+     ):
+         self._app_name = app_name
+         self._logging_level = logging_level
+
+         self._configs = configs
+
+         self._spark_cluster: Optional[SparkCluster] = None
+         self._spark_session: Optional[SparkSession] = None
+
+     def _get_or_create_spark_cluster(self) -> SparkCluster:
+         if self._spark_cluster is not None:
+             return self._spark_cluster
+         py4j_logger = logging.getLogger("py4j")
+         py4j_logger.setLevel(logging.WARNING)
+         self._spark_cluster = SparkCluster(
+             self._app_name,
+             self._configs,
+             self._logging_level,
+         )
+         return self._spark_cluster
+
+     def get_or_create_session(self):
+         if self._spark_session is not None:
+             return self._spark_session
+         spark_cluster = self._get_or_create_spark_cluster()
+         self._spark_session = spark_cluster.get_spark_session()
+
+         return self._spark_session
+
+     def start_connect_server(self) -> int:
+         if self._spark_session is None:
+             raise Exception(
+                 "The Spark cluster has not been created; please call get_or_create_session first."
+             )
+         return self._spark_session._jvm.org.apache.spark.sql.connect.ConnectServer.start()
+
+     def stop(self, cleanup_data=True):
+         if self._spark_session is not None:
+             self._spark_session.stop()
+             self._spark_session = None
+         if self._spark_cluster is not None:
+             self._spark_cluster.stop(cleanup_data)
+             if cleanup_data:
+                 self._spark_cluster = None
+                 if self._configs is not None:
+                     self._configs = None
+
+     def __enter__(self):
+         self.get_or_create_session()
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.stop()
+
+
+ _spark_context_lock = RLock()
+ _global_spark_context: Optional[_SparkContext] = None
+
+
+ def init_spark(
+     app_name: str,
+     executor_cores: Optional[int] = None,
+     executor_memory: Optional[Union[str, int]] = None,
+     num_executors: Optional[int] = None,
+     configs: Optional[Dict[str, str]] = None,
+     log_to_driver: bool = False,
+     logging_level: str = "warn",
+     dynamic_allocation: bool = False,
+     min_executors: Optional[int] = None,
+     max_executors: Optional[int] = None,
+     auto_configure: bool = False,
+ ) -> SparkSession:
+     """
+     Initialize a Spark cluster with the given requirements.
+
+     :param app_name: the application name
+     :param executor_cores: the number of CPU cores for each executor. If None and
+            auto_configure=True, it will be inferred from cluster resources.
+     :param executor_memory: the memory size for each executor, given as bytes or a
+            human-readable string. If None and auto_configure=True, it will be
+            inferred from cluster resources.
+     :param num_executors: the number of executors requested. If None and auto_configure=True,
+            it will be inferred from cluster resources.
+     :param configs: the extra Spark configs to set
+     :param log_to_driver: whether to forward Spark logs to the driver; defaults to False,
+            set it to True when debugging
+     :param dynamic_allocation: whether to enable Spark dynamic allocation
+     :param min_executors: minimum number of executors for dynamic allocation
+     :param max_executors: maximum number of executors for dynamic allocation
+     :param auto_configure: if True and executor_cores/executor_memory/num_executors are not
+            provided, automatically infer them from Ray cluster resources.
+     :return: the SparkSession
+     """
+     logger = logging.getLogger(__name__)
+
+     if not ray.is_initialized():
+         # Ray has not been initialized yet; start a local instance
+         ray.init(log_to_driver=log_to_driver, logging_level=logging_level)
+
+     # Defensive copy to avoid mutating the caller's dict
+     _configs = {} if configs is None else configs.copy()
+
+     # Auto-configure executor settings if requested and not explicitly provided
+     if auto_configure:
+         inferred_config = auto_infer_executor_config()
+
+         if executor_cores is None:
+             executor_cores = inferred_config.executor_cores
+             logger.info(f"Auto-configured executor_cores: {executor_cores}")
+
+         if executor_memory is None:
+             executor_memory = inferred_config.executor_memory
+             logger.info(f"Auto-configured executor_memory: {executor_memory}")
+
+         if num_executors is None and not dynamic_allocation:
+             num_executors = inferred_config.num_executors
+             logger.info(f"Auto-configured num_executors: {num_executors}")
+
+         # Also set driver memory if not already in configs
+         if "spark.driver.memory" not in _configs:
+             _configs["spark.driver.memory"] = inferred_config.driver_memory
+             logger.info(f"Auto-configured driver_memory: {inferred_config.driver_memory}")
+
+     # Validate required parameters
+     if executor_cores is None:
+         raise ValueError(
+             "executor_cores is required. Either provide it explicitly or set auto_configure=True."
+         )
+     if executor_memory is None:
+         raise ValueError(
+             "executor_memory is required. Either provide it explicitly or set auto_configure=True."
+         )
+
+     with _spark_context_lock:
+         global _global_spark_context
+         if dynamic_allocation:
+             _configs["spark.dynamicAllocation.enabled"] = "true"
+             assert min_executors is not None, (
+                 "min_executors is required when dynamic_allocation is enabled"
+             )
+             assert max_executors is not None, (
+                 "max_executors is required when dynamic_allocation is enabled"
+             )
+             _configs["spark.dynamicAllocation.minExecutors"] = str(min_executors)
+             _configs["spark.dynamicAllocation.maxExecutors"] = str(max_executors)
+             _configs["spark.executor.instances"] = str(min_executors)
+         else:
+             if num_executors is None:
+                 raise ValueError(
+                     "num_executors is required when dynamic_allocation is disabled. "
+                     "Either provide it explicitly or set auto_configure=True."
+                 )
+             _configs["spark.dynamicAllocation.enabled"] = "false"
+             _configs["spark.executor.instances"] = str(num_executors)
+         _configs["spark.executor.cores"] = str(executor_cores)
+         _configs["spark.executor.memory"] = str(executor_memory)
+
+         if _global_spark_context is None:
+             try:
+                 _global_spark_context = _SparkContext(
+                     app_name,
+                     _configs,
+                     logging_level,
+                 )
+                 return _global_spark_context.get_or_create_session()
+             except:
+                 if _global_spark_context is not None:
+                     _global_spark_context.stop()
+                     _global_spark_context = None
+                 raise
+         else:
+             raise Exception("The Spark environment has already been initialized.")
+
+
+ def start_connect_server() -> int:
+     with _spark_context_lock:
+         global _global_spark_context
+         if _global_spark_context is not None:
+             port = _global_spark_context.start_connect_server()
+             if port < 0:
+                 raise Exception(
+                     "The Spark Connect server failed to start: no available port was found; please check the Spark logs."
+                 )
+             return port
+         raise Exception("The Spark environment has not been initialized; please call init_spark first.")
+
+
+ def stop_spark(cleanup_data=True):
+     with _spark_context_lock:
+         global _global_spark_context
+         if _global_spark_context is not None:
+             _global_spark_context.stop(cleanup_data)
+             if cleanup_data:
+                 _global_spark_context = None
+
+
+ atexit.register(stop_spark)
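A minimal sketch (not part of the package) of the dynamic-allocation and Spark Connect paths exposed above. The executor sizes and bounds are illustrative; min_executors and max_executors are mandatory here because of the assertions in init_spark.

import raydp

# Dynamic allocation: both bounds are required by init_spark's assertions.
spark = raydp.init_spark(
    app_name="dynamic-example",
    executor_cores=2,
    executor_memory="4g",
    dynamic_allocation=True,
    min_executors=1,
    max_executors=4,
)

# Optionally expose the session over Spark Connect; this raises if no port is available.
port = raydp.start_connect_server()
print(f"Spark Connect server started on port {port}")

raydp.stop_spark()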
raydp/jars/__init__.py ADDED
File without changes
raydp/jars/raydp-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-common-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-spark340-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/jars/raydp-shims-spark350-1.7.0-SNAPSHOT.jar ADDED
Binary file
raydp/setup.py ADDED
@@ -0,0 +1,44 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """
+ Setup script for the raydp package.
+ Uses pyproject.toml for metadata but provides custom build hooks for JAR files.
+ """
+
+ import importlib.util
+ import os
+
+ from setuptools import setup
+
+ # Load _build_hooks directly without triggering raydp/__init__.py
+ _build_hooks_path = os.path.join(
+     os.path.dirname(os.path.abspath(__file__)), "_build_hooks.py"
+ )
+ spec = importlib.util.spec_from_file_location("_build_hooks", _build_hooks_path)
+ _build_hooks = importlib.util.module_from_spec(spec)
+ spec.loader.exec_module(_build_hooks)
+
+ BuildWithJars = _build_hooks.BuildWithJars
+ SdistWithJars = _build_hooks.SdistWithJars
+
+ setup(
+     cmdclass={
+         "build_py": BuildWithJars,
+         "sdist": SdistWithJars,
+     },
+ )
raydp/spark/__init__.py ADDED
@@ -0,0 +1,34 @@
+ #
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ from .dataset import (
+     PartitionObjectsOwner,
+     get_raydp_master_owner,
+     spark_dataframe_to_ray_dataset,
+     ray_dataset_to_spark_dataframe,
+     from_spark_recoverable,
+ )
+ from .ray_cluster import SparkCluster
+
+ __all__ = [
+     "SparkCluster",
+     "PartitionObjectsOwner",
+     "get_raydp_master_owner",
+     "spark_dataframe_to_ray_dataset",
+     "ray_dataset_to_spark_dataframe",
+     "from_spark_recoverable",
+ ]