PyPI - data-prep-toolkit - Versions diffs - 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl - Mend

data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

data_processing_spark/runtime/spark/runtime_configuration.py ADDED Viewed

@@ -0,0 +1,37 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from data_processing.runtime import TransformRuntimeConfiguration
+from data_processing.transform import TransformConfiguration
+from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime
+class SparkTransformRuntimeConfiguration(TransformRuntimeConfiguration):
+    """
+    Runtime configuration that pairs a transform configuration with the class
+    used to build the Spark transform runtime.
+    """
+    def __init__(
+        self,
+        transform_config: TransformConfiguration,
+        runtime_class: type[DefaultSparkTransformRuntime] = DefaultSparkTransformRuntime,
+    ):
+        """
+        Initialization
+        :param transform_config: base transform configuration, forwarded to the superclass
+        :param runtime_class: class instantiated by create_transform_runtime();
+            defaults to DefaultSparkTransformRuntime
+        """
+        super().__init__(transform_config=transform_config)
+        # keep the class (not an instance); a runtime is built on demand
+        self.runtime_class = runtime_class
+    def create_transform_runtime(self) -> DefaultSparkTransformRuntime:
+        """
+        Build a transform runtime from the parameters reported by the transform configuration
+        :return: new instance of the configured runtime class
+        """
+        transform_params = self.transform_config.get_transform_params()
+        return self.runtime_class(transform_params)

data_processing_spark/runtime/spark/transform_file_processor.py ADDED Viewed

@@ -0,0 +1,65 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from typing import Any
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.runtime import AbstractTransformFileProcessor
+from data_processing.transform import TransformStatistics
+from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
+from data_processing.utils import UnrecoverableException
+class SparkTransformFileProcessor(AbstractTransformFileProcessor):
+    """
+    This is the class implementing the actual work/actor processing of a single file.
+    The transform is built from the Spark runtime configuration and published statistics
+    are accumulated in the TransformStatistics object supplied at construction.
+    """
+    def __init__(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        runtime_configuration: SparkTransformRuntimeConfiguration,
+        statistics: TransformStatistics,
+    ):
+        """
+        Init method
+        :param data_access_factory: data access factory, forwarded to the base file processor
+        :param runtime_configuration: Spark runtime configuration providing the transform
+            parameters and the transform class
+        :param statistics: statistics object that receives everything passed to _publish_stats()
+        """
+        super().__init__(
+            data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params()
+        )
+        # Add data access and statistics to the processor parameters
+        self.runtime_configuration = runtime_configuration
+        # no transform yet; it is created later through create_transform()
+        self.transform = None
+        # set up statistics
+        # NOTE(review): self.transform_params is presumably set by the base class initializer - confirm
+        self.transform_params["statistics"] = statistics
+        self.stats = statistics
+    def create_transform(self, transform_parameters: dict[str, Any]):
+        """
+        Create transform
+        :param transform_parameters: parameters passed to the transform class constructor
+        :return: None
+        :raises UnrecoverableException: if the transform class cannot be obtained or instantiated;
+            the original exception is attached as the cause
+        """
+        # Create local processor
+        try:
+            self.transform = self.runtime_configuration.get_transform_class()(transform_parameters)
+        except Exception as e:
+            self.logger.error(f"Exception creating transform  {e}")
+            # chain the cause so the original traceback is not lost
+            raise UnrecoverableException("failed creating transform") from e
+    def _publish_stats(self, stats: dict[str, Any]) -> None:
+        """
+        Publish statistics (to the local statistics object)
+        :param stats: statistics dictionary
+        :return: None
+        """
+        self.stats.add_stats(stats)

data_processing_spark/runtime/spark/transform_launcher.py ADDED Viewed

@@ -0,0 +1,64 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import time
+from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
+from data_processing.runtime.transform_launcher import AbstractTransformLauncher
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import (
+    SparkTransformExecutionConfiguration,
+    orchestrate,
+)
+from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
+logger = get_logger(__name__)
+class SparkTransformLauncher(AbstractTransformLauncher):
+    """
+    Driver class starting Spark execution
+    """
+    def __init__(
+        self,
+        runtime_config: SparkTransformRuntimeConfiguration,
+        data_access_factory: DataAccessFactoryBase | None = None,
+    ):
+        """
+        Creates driver
+        :param runtime_config: transform runtime factory
+        :param data_access_factory: the factory to create DataAccess instances.
+            When omitted, a new DataAccessFactory is created for this launcher.
+        """
+        # Build the default per launcher: a default evaluated in the signature would be
+        # a single instance shared by every launcher created in the process.
+        if data_access_factory is None:
+            data_access_factory = DataAccessFactory()
+        super().__init__(runtime_config, data_access_factory)
+        self.execution_config = SparkTransformExecutionConfiguration(name=runtime_config.get_name())
+    def _submit_for_execution(self) -> int:
+        """
+        Submit for execution
+        Runs the Spark orchestrator and logs the elapsed time and the result.
+        :return: the orchestrator result, or 1 if the orchestrator raised an exception
+        """
+        # assume failure until the orchestrator reports otherwise
+        res = 1
+        start = time.time()
+        try:
+            logger.debug("Starting orchestrator")
+            res = orchestrate(
+                data_access_factory=self.data_access_factory,
+                runtime_config=self.runtime_config,
+                execution_configuration=self.execution_config
+            )
+            logger.debug("Completed orchestrator")
+        except Exception as e:
+            logger.error(f"Exception running orchestration\n{e}")
+        finally:
+            logger.info(f"Completed execution in {(time.time() - start)/60.} min, execution result {res}")
+        # Return after (not inside) the finally clause: a return inside finally would
+        # silently discard KeyboardInterrupt/SystemExit raised during orchestration.
+        return res

data_processing_spark/runtime/spark/transform_orchestrator.py ADDED Viewed

@@ -0,0 +1,158 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+import time
+import traceback
+from datetime import datetime
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+from data_processing.utils import GB, get_logger
+from data_processing_spark.runtime.spark import (
+    SparkTransformFileProcessor,
+    SparkTransformRuntimeConfiguration,
+    SparkTransformExecutionConfiguration,
+)
+from pyspark import SparkConf, SparkContext
+logger = get_logger(__name__)
+def orchestrate(
+    runtime_config: SparkTransformRuntimeConfiguration,
+    execution_configuration: SparkTransformExecutionConfiguration,
+    data_access_factory: DataAccessFactoryBase,
+) -> int:
+    """
+    orchestrator for transformer execution
+    Creates the data access and the Spark context, spreads the input files over partitions,
+    runs the transform on every partition, merges the per-partition statistics and saves
+    the job metadata. The Spark context is stopped on every exit path taken after it
+    has been created.
+    :param data_access_factory: data access factory
+    :param runtime_config: transformer runtime configuration
+    :param execution_configuration: orchestrator configuration
+    :return: 0 - success (also when there are no input files) or 1 - failure
+    """
+    start_time = time.time()
+    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logger.info(f"orchestrator started at {start_ts}")
+    # create data access
+    data_access = data_access_factory.create_data_access()
+    if data_access is None:
+        logger.error("No DataAccess instance provided - exiting")
+        return 1
+    # initialize Spark
+    conf = SparkConf().setAppName(runtime_config.get_name()).set("spark.driver.host", "127.0.0.1")
+    sc = SparkContext(conf=conf)
+    try:
+        # broadcast the configuration objects so that the partition function can read them
+        spark_runtime_config = sc.broadcast(runtime_config)
+        daf = sc.broadcast(data_access_factory)
+        def process_partition(iterator):
+            """
+            process partitions
+            :param iterator: iterator of (file name, index) records produced by zipWithIndex
+            :return: list of (statistic name, value) pairs for this partition
+            """
+            # local statistics dictionary
+            statistics = TransformStatistics()
+            # create transformer runtime
+            d_access_factory = daf.value
+            runtime_conf = spark_runtime_config.value
+            runtime = runtime_conf.create_transform_runtime()
+            # create file processor
+            file_processor = SparkTransformFileProcessor(
+                data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
+            )
+            first = True
+            for f in iterator:
+                # for every file
+                if first:
+                    logger.debug(f"partition {f}")
+                    # add additional parameters
+                    # NOTE(review): f[1] is the zipWithIndex index of the first record seen here,
+                    # not a partition number - confirm this is the intended "partition" value
+                    transform_params = (
+                        runtime.get_transform_config(partition=int(f[1]), data_access_factory=d_access_factory,
+                                                     statistics=statistics))
+                    # create transform with partition number
+                    file_processor.create_transform(transform_params)
+                    first = False
+                # process file
+                file_processor.process_file(f_name=f[0])
+            # flush
+            file_processor.flush()
+            # enhance statistics
+            runtime.compute_execution_stats(statistics)
+            # return partition's statistics
+            return list(statistics.get_execution_stats().items())
+        num_partitions = 0
+        try:
+            # Get files to process
+            files, profile, retries = data_access.get_files_to_process()
+            if len(files) == 0:
+                logger.error("No input files to process - exiting")
+                # the enclosing finally stops the Spark context on this early exit as well
+                return 0
+            logger.info(f"Number of files is {len(files)}, source profile {profile}")
+            # process data
+            logger.debug("Begin processing files")
+            # process files split by partitions
+            logger.debug(f"parallelization {execution_configuration.parallelization}")
+            if execution_configuration.parallelization > 0:
+                source_rdd = sc.parallelize(files, execution_configuration.parallelization)
+            else:
+                # let Spark choose the number of partitions
+                source_rdd = sc.parallelize(files)
+            num_partitions = source_rdd.getNumPartitions()
+            logger.info(f"Parallelizing execution. Using {num_partitions} partitions")
+            stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition)
+            # build overall statistics by summing the values reported under the same key
+            stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect())
+            return_code = 0
+            status = "success"
+        except Exception as e:
+            # process execution exception
+            # format_exc() returns the traceback text; print_exc() returns None
+            logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
+            return_code = 1
+            status = "failure"
+            stats = {}
+        try:
+            # build and save metadata
+            logger.debug("Building job metadata")
+            cpus = sc.defaultParallelism
+            executors = sc._jsc.sc().getExecutorMemoryStatus()
+            # convert once instead of on every loop iteration
+            executor_list = executors.toList()
+            memory = 0.0
+            for i in range(executors.size()):
+                memory += executor_list.apply(i)._2()._1()
+            resources = {"cpus": cpus, "gpus": 0, "memory": round(memory/GB, 2), "object_store": 0}
+            input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params()
+            metadata = {
+                "pipeline": execution_configuration.pipeline_id,
+                "job details": execution_configuration.job_details
+                | {
+                    "start_time": start_ts,
+                    "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                    "status": status,
+                },
+                "code": execution_configuration.code_location,
+                "job_input_params": input_params | data_access_factory.get_input_params(),
+                "execution_stats": {
+                    "num partitions": num_partitions,
+                    "execution time, min": round((time.time() - start_time) / 60, 3),
+                } | resources,
+                "job_output_stats": stats,
+            }
+            logger.debug(f"Saving job metadata: {metadata}.")
+            data_access.save_job_metadata(metadata)
+            logger.debug("Saved job metadata.")
+            return return_code
+        except Exception as e:
+            logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
+            return 1
+    finally:
+        # stop spark context at the end. Required for running multiple tests
+        sc.stop()

data_processing_spark/runtime/spark/transform_runtime.py ADDED Viewed

@@ -0,0 +1,52 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from typing import Any
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+class DefaultSparkTransformRuntime:
+    """
+    Transformer runtime used by the processor to create a Transform specific environment.
+    This default implementation passes the parameters through unchanged and adds no statistics.
+    """
+    def __init__(self, params: dict[str, Any]):
+        """
+        Create/config this runtime.
+        :param params: parameters stored on the instance and returned by get_transform_config()
+        """
+        self.params = params
+    def get_transform_config(
+        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
+    ) -> dict[str, Any]:
+        """
+        Get the dictionary of configuration that will be provided to the transform's initializer.
+        Subclasses may override this to derive a new set of configuration from the params given
+        to this instance's initializer. This default implementation ignores its arguments.
+        :param partition: partition identifier supplied by the caller
+        :param data_access_factory: data access factory supplied by the caller
+        :param statistics: statistics object supplied by the caller
+        :return: dictionary of transform init params (the params given to the initializer)
+        """
+        return self.params
+    def compute_execution_stats(self, stats: TransformStatistics) -> None:
+        """
+        Update/augment the given statistics object with runtime-specific additions/modifications.
+        This default implementation does nothing.
+        :param stats: statistics object to update in place
+        :return: None
+        """

data_processing_spark/test_support/transform/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration

data_processing_spark/test_support/transform/noop_transform.py ADDED Viewed

@@ -0,0 +1,42 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+from data_processing.test_support.transform.noop_transform import (
+    NOOPTransformConfiguration,
+)
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import SparkTransformLauncher
+from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
+logger = get_logger(__name__)
+class NOOPSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Spark runtime configuration for the NOOP transform, usable with the SparkTransformLauncher.
+    Only the base NOOP transform configuration is supplied; the runtime class is left
+    at the superclass default.
+    """
+    def __init__(self):
+        """
+        Initialization
+        """
+        noop_transform_config = NOOPTransformConfiguration()
+        super().__init__(transform_config=noop_transform_config)
+if __name__ == "__main__":
+    # build the Spark launcher around the NOOP runtime configuration
+    noop_runtime_config = NOOPSparkTransformConfiguration()
+    launcher = SparkTransformLauncher(runtime_config=noop_runtime_config)
+    logger.info("Launching noop transform")
+    # run the transform through the launcher
+    launcher.launch()

data_prep_toolkit-0.2.2.dev0.dist-info/METADATA DELETED Viewed

@@ -1,56 +0,0 @@
-Metadata-Version: 2.1
-Name: data_prep_toolkit
-Version: 0.2.2.dev0
-Summary: Data Preparation Toolkit Library
-Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
-License: Apache-2.0
-Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-Requires-Dist: numpy<1.29.0
-Requires-Dist: pyarrow==16.1.0
-Requires-Dist: boto3==1.34.69
-Requires-Dist: argparse
-Requires-Dist: mmh3
-Provides-Extra: dev
-Requires-Dist: twine; extra == "dev"
-Requires-Dist: pytest>=7.3.2; extra == "dev"
-Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
-Requires-Dist: pytest-env>=1.0.0; extra == "dev"
-Requires-Dist: pre-commit>=3.3.2; extra == "dev"
-Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
-Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
-Requires-Dist: moto==5.0.5; extra == "dev"
-Requires-Dist: markupsafe==2.0.1; extra == "dev"
-# Data Processing Library
-This provides a python framework for developing _transforms_
-on data stored in files - currently parquet files are supported -
-and running them in a [ray](https://www.ray.io/) cluster.
-Data files may be stored in the local file system or  COS/S3.
-For more details see the [documentation](../doc/overview.md).
-### Virtual Environment
-The project uses `pyproject.toml` and a Makefile for operations.
-To do development you should establish the virtual environment
-```shell
-make venv
-```
-and then either activate
-```shell
-source venv/bin/activate
-```
-or set up your IDE to use the venv directory when developing in this project
-## Library Artifact Build and Publish
-To test, build and publish the library
-```shell
-make test build publish
-```
-To up the version number, edit the Makefile to change VERSION and rerun
-the above.  This will require committing both the `Makefile` and the
-automatically updated `pyproject.toml` file.

data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt DELETED Viewed

	@@ -1 +0,0 @@
1	- data_processing

{data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL RENAMED Viewed

File without changes

data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl