data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
  2. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
  3. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
  4. data_processing/data_access/data_access.py +4 -1
  5. data_processing/data_access/data_access_local.py +0 -11
  6. data_processing/data_access/data_access_s3.py +0 -11
  7. data_processing/runtime/pure_python/transform_file_processor.py +9 -3
  8. data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
  9. data_processing/runtime/pure_python/transform_runtime.py +9 -1
  10. data_processing/runtime/transform_file_processor.py +53 -32
  11. data_processing/test_support/data_access/data_access_factory_test.py +12 -0
  12. data_processing/test_support/transform/__init__.py +9 -4
  13. data_processing/test_support/transform/noop_folder_transform.py +105 -0
  14. data_processing/test_support/transform/noop_transform.py +3 -3
  15. data_processing/transform/__init__.py +2 -0
  16. data_processing/transform/abstract_transform.py +16 -0
  17. data_processing/transform/binary_transform.py +3 -2
  18. data_processing/transform/folder_transform.py +40 -0
  19. data_processing/transform/transform_configuration.py +3 -3
  20. data_processing/utils/multilock.py +160 -0
  21. data_processing/utils/unrecoverable.py +13 -0
  22. data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
  23. data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
  24. data_processing_ray/runtime/ray/transform_runtime.py +9 -1
  25. data_processing_ray/test_support/transform/__init__.py +1 -0
  26. data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
  27. data_processing_ray/test_support/transform/noop_transform.py +1 -3
  28. data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
  29. data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
  30. data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
  31. data_processing_spark/runtime/spark/transform_runtime.py +24 -6
  32. data_processing_spark/test_support/transform/__init__.py +1 -0
  33. data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
  34. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
data_processing_spark/runtime/spark/transform_orchestrator.py
@@ -10,24 +10,69 @@
  # limitations under the License.
  ################################################################################

+ import os
+ import socket
  import time
  import traceback
  from datetime import datetime

+ import yaml
  from data_processing.data_access import DataAccessFactoryBase
- from data_processing.transform import TransformStatistics
+ from data_processing.transform import TransformStatistics, AbstractFolderTransform
  from data_processing.utils import GB, get_logger
  from data_processing_spark.runtime.spark import (
+     SparkTransformExecutionConfiguration,
      SparkTransformFileProcessor,
      SparkTransformRuntimeConfiguration,
-     SparkTransformExecutionConfiguration,
  )
  from pyspark import SparkConf, SparkContext
+ from pyspark.sql import SparkSession


  logger = get_logger(__name__)


+ def _init_spark(runtime_config: SparkTransformRuntimeConfiguration) -> SparkSession:
+     server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1"))
+     if server_port_https == -1:
+         # running locally
+         spark_config = {"spark.driver.host": "127.0.0.1"}
+         return SparkSession.builder.appName(runtime_config.get_name()).config(map=spark_config).getOrCreate()
+     else:
+         # running in Kubernetes, use spark_profile.yml and
+         # environment variables for configuration
+         server_port = os.environ["KUBERNETES_SERVICE_PORT"]
+         master_url = f"k8s://https://kubernetes.default:{server_port}"
+
+         # Read Spark configuration profile
+         config_filepath = os.path.abspath(
+             os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml")
+         )
+         with open(config_filepath, "r") as config_fp:
+             spark_config = yaml.safe_load(os.path.expandvars(config_fp.read()))
+         spark_config["spark.submit.deployMode"] = "client"
+
+         # configure the executor pods from template
+         executor_pod_template_file = os.path.join(
+             os.getenv("SPARK_HOME"),
+             "work-dir",
+             "src",
+             "templates",
+             "spark-executor-pod-template.yml",
+         )
+         spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file
+         spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always"
+
+         # Pass the driver IP address to the workers for callback
+         myservice_url = socket.gethostbyname(socket.gethostname())
+         spark_config["spark.driver.host"] = myservice_url
+         spark_config["spark.driver.bindAddress"] = "0.0.0.0"
+         spark_config["spark.decommission.enabled"] = True
+         logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}")
+         app_name = spark_config.get("spark.app.name", "my-spark-app")
+         return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate()
+
+
  def orchestrate(
      runtime_config: SparkTransformRuntimeConfiguration,
      execution_configuration: SparkTransformExecutionConfiguration,
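
The new `_init_spark` helper keys off `KUBERNETES_SERVICE_PORT_HTTPS`, an environment variable that Kubernetes injects into every pod, to decide between a local session and an in-cluster session submitted against the Kubernetes API server. A minimal standalone sketch of the same detection logic (illustrative, not part of the package):

    import os
    from pyspark.sql import SparkSession

    def build_session(app_name: str) -> SparkSession:
        # KUBERNETES_SERVICE_PORT_HTTPS is only defined inside a k8s pod
        if os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") is None:
            # local run: pin the driver to the loopback interface
            return (
                SparkSession.builder.appName(app_name)
                .config("spark.driver.host", "127.0.0.1")
                .getOrCreate()
            )
        # in-cluster run: point the master at the Kubernetes API server
        master = f"k8s://https://kubernetes.default:{os.environ['KUBERNETES_SERVICE_PORT']}"
        return SparkSession.builder.master(master).appName(app_name).getOrCreate()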
@@ -45,14 +90,17 @@ def orchestrate(
      logger.info(f"orchestrator started at {start_ts}")
      # create data access
      data_access = data_access_factory.create_data_access()
+     bcast_params = runtime_config.get_bcast_params(data_access_factory)
      if data_access is None:
          logger.error("No DataAccess instance provided - exiting")
          return 1
      # initialize Spark
-     conf = SparkConf().setAppName(runtime_config.get_name()).set("spark.driver.host", "127.0.0.1")
-     sc = SparkContext(conf=conf)
+     spark_session = _init_spark(runtime_config)
+     sc = spark_session.sparkContext
+     # broadcast
      spark_runtime_config = sc.broadcast(runtime_config)
      daf = sc.broadcast(data_access_factory)
+     spark_bcast_params = sc.broadcast(bcast_params)

      def process_partition(iterator):
          """
@@ -63,12 +111,16 @@ def orchestrate(
          # local statistics dictionary
          statistics = TransformStatistics()
          # create transformer runtime
+         bcast_params = spark_bcast_params.value
          d_access_factory = daf.value
          runtime_conf = spark_runtime_config.value
          runtime = runtime_conf.create_transform_runtime()
          # create file processor
          file_processor = SparkTransformFileProcessor(
-             data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
+             data_access_factory=d_access_factory,
+             runtime_configuration=runtime_conf,
+             statistics=statistics,
+             is_folder=is_folder,
          )
          first = True
          for f in iterator:
@@ -77,8 +129,11 @@ def orchestrate(
              logger.debug(f"partition {f}")
              # add additional parameters
              transform_params = (
-                 runtime.get_transform_config(partition=int(f[1]), data_access_factory=d_access_factory,
-                                              statistics=statistics))
+                 runtime.get_transform_config(
+                     partition=int(f[1]), data_access_factory=d_access_factory, statistics=statistics
+                 )
+                 | bcast_params
+             )
              # create transform with partition number
              file_processor.create_transform(transform_params)
              first = False
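
The `| bcast_params` union relies on Python 3.9+ dict merge semantics: the broadcast parameters are folded into the per-partition transform config, with the right-hand operand winning on key collisions. For example:

    base = {"partition": 3}                                  # per-partition config
    bcast = {"partition": 0, "doc_ids_to_remove": ["d1"]}    # broadcast extras
    print(base | bcast)  # {'partition': 0, 'doc_ids_to_remove': ['d1']}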
@@ -92,13 +147,20 @@ def orchestrate(
          return list(statistics.get_execution_stats().items())

      num_partitions = 0
+     is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
      try:
-         # Get files to process
-         files, profile, retries = data_access.get_files_to_process()
-         if len(files) == 0:
-             logger.error("No input files to process - exiting")
-             return 0
-         logger.info(f"Number of files is {len(files)}, source profile {profile}")
+         if is_folder:
+             # folder transform
+             runtime = runtime_config.create_transform_runtime()
+             files = runtime.get_folders(data_access=data_access)
+             logger.info(f"Number of folders is {len(files)}")
+         else:
+             # Get files to process
+             files, profile, retries = data_access.get_files_to_process()
+             if len(files) == 0:
+                 logger.error("No input files to process - exiting")
+                 return 0
+             logger.info(f"Number of files is {len(files)}, source profile {profile}")
          # process data
          logger.debug("Begin processing files")
          # process files split by partitions
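
Folder transforms are detected up front with `issubclass` against the new `AbstractFolderTransform` base; instead of enumerating input files, the orchestrator asks the runtime for folders and parallelizes over folder names. A sketch of what such a transform might look like, modeled on the NOOP folder transform added in this release (the exact `transform` signature and the `get_folder_files` accessor are assumptions to verify against your toolkit version):

    from typing import Any
    from data_processing.transform import AbstractFolderTransform

    class MyFolderTransform(AbstractFolderTransform):
        def __init__(self, config: dict[str, Any]):
            super().__init__(config)
            # assumption: a DataAccess instance is injected via the transform params
            self.data_access = config.get("data_access")

        def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
            # a folder transform reads its own inputs and returns (bytes, output-name) pairs
            files, _ = self.data_access.get_folder_files(path=folder_name)
            output = [(data, name) for name, data in files.items()]
            return output, {"folders_processed": 1}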
@@ -128,7 +190,7 @@ def orchestrate(
          memory = 0.0
          for i in range(executors.size()):
              memory += executors.toList().apply(i)._2()._1()
-         resources = {"cpus": cpus, "gpus": 0, "memory": round(memory/GB, 2), "object_store": 0}
+         resources = {"cpus": cpus, "gpus": 0, "memory": round(memory / GB, 2), "object_store": 0}
          input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params()
          metadata = {
              "pipeline": execution_configuration.pipeline_id,
@@ -143,7 +205,8 @@ def orchestrate(
              "execution_stats": {
                  "num partitions": num_partitions,
                  "execution time, min": round((time.time() - start_time) / 60, 3),
-             } | resources,
+             }
+             | resources,
              "job_output_stats": stats,
          }
          logger.debug(f"Saving job metadata: {metadata}.")
data_processing_spark/runtime/spark/transform_runtime.py
@@ -12,7 +12,7 @@

  from typing import Any

- from data_processing.data_access import DataAccessFactoryBase
+ from data_processing.data_access import DataAccessFactoryBase, DataAccess
  from data_processing.transform import TransformStatistics

@@ -28,25 +28,43 @@ class DefaultSparkTransformRuntime:
          """
          self.params = params

+     def get_folders(self, data_access: DataAccess) -> list[str]:
+         """
+         Get folders to process
+         :param data_access: data access
+         :return: list of folders to process
+         """
+         raise NotImplementedError()
+
      def get_transform_config(
          self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
      ) -> dict[str, Any]:
          """
          Get the dictionary of configuration that will be provided to the transform's initializer.
          This is the opportunity for this runtime to create a new set of configuration based on the
-         config/params provided to this instance's initializer. This may include the addition
-         of new configuration data such as ray shared memory, new actors, etc, that might be needed and
-         expected by the transform in its initializer and/or transform() methods.
+         config/params provided to this instance's initializer.
+         :param partition - the partition assigned to this worker, needed by transforms like doc_id
          :param data_access_factory - data access factory class being used by the RayOrchestrator.
          :param statistics - reference to statistics actor
          :return: dictionary of transform init params
          """
          return self.params

+     def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
+         """Allows retrieving and broadcasting to all the workers very large
+         configuration parameters, like the list of document IDs to remove for
+         fuzzy dedup, or the list of blocked web domains for block listing. This
+         function is called by the Spark runtime after Spark initialization and
+         before spark_context.parallelize().
+         :param data_access_factory - creates the data_access object used to download the large config parameter
+         """
+         return {}
+
      def compute_execution_stats(self, stats: TransformStatistics) -> None:
          """
          Update/augment the given statistics object with runtime-specific additions/modifications.
+         This method does not return a value; the job execution statistics are generally reported
+         as metadata by the Spark Orchestrator.
          :param stats: output of statistics as aggregated across all calls to all transforms.
-         :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator.
          """
-         pass
+         pass
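
A concrete runtime can override both new hooks; for instance, a block-listing transform might enumerate input folders and pre-download its domain list on the driver so the orchestrator can broadcast it. A sketch under those assumptions (the class name, file path, and `get_file` accessor are illustrative, not toolkit API guarantees):

    from typing import Any
    from data_processing.data_access import DataAccess, DataAccessFactoryBase
    from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime

    class BlockListSparkRuntime(DefaultSparkTransformRuntime):
        def get_folders(self, data_access: DataAccess) -> list[str]:
            # process everything under the configured input root
            return [data_access.get_input_folder()]

        def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
            # download the large blocked-domain list once on the driver;
            # the orchestrator broadcasts the returned dict to all executors
            data_access = data_access_factory.create_data_access()
            content, _ = data_access.get_file("blocklist/domains.txt")  # assumed accessor
            return {"blocked_domains": set(content.decode("utf-8").split())}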
data_processing_spark/test_support/transform/__init__.py
@@ -11,3 +11,4 @@
  ################################################################################

  from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration
+ from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration
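
With the new export in place, tests can pick up the folder-based NOOP configuration straight from the package and drive it through the launcher, mirroring the `__main__` block of the module below:

    from data_processing_spark.runtime.spark import SparkTransformLauncher
    from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration

    launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
    launcher.launch()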
data_processing_spark/test_support/transform/noop_folder_transform.py (new file)
@@ -0,0 +1,53 @@
+ # (C) Copyright IBM Corp. 2024.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ ################################################################################
+
+ from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration
+ from data_processing.utils import get_logger
+ from data_processing_spark.runtime.spark import SparkTransformLauncher
+ from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
+ from data_processing.data_access import DataAccess
+
+
+ logger = get_logger(__name__)
+
+
+ class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime):
+     def get_folders(self, data_access: DataAccess) -> list[str]:
+         """
+         Get folders to process
+         :param data_access: data access
+         :return: list of folders to process
+         """
+         return [data_access.get_input_folder()]
+
+
+ class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+     """
+     Implements the SparkTransformRuntimeConfiguration for NOOP as required by the SparkTransformLauncher.
+     NOOP needs no custom runtime state beyond the folder enumeration provided by
+     NOOPFolderSparkRuntime.
+     """
+
+     def __init__(self):
+         """
+         Initialization
+         """
+         super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform),
+                          runtime_class=NOOPFolderSparkRuntime)
+
+
+ if __name__ == "__main__":
+     # create launcher
+     launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
+     logger.info("Launching noop transform")
+     # launch the Spark job to process the input
+     launcher.launch()