data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. data_prep_toolkit-0.2.2.dev1.dist-info/METADATA +33 -0
  2. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/RECORD +26 -5
  3. data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt +3 -0
  4. data_processing/runtime/pure_python/transform_orchestrator.py +24 -2
  5. data_processing_ray/runtime/ray/__init__.py +9 -0
  6. data_processing_ray/runtime/ray/execution_configuration.py +110 -0
  7. data_processing_ray/runtime/ray/ray_utils.py +246 -0
  8. data_processing_ray/runtime/ray/runtime_configuration.py +37 -0
  9. data_processing_ray/runtime/ray/transform_file_processor.py +53 -0
  10. data_processing_ray/runtime/ray/transform_invoker.py +103 -0
  11. data_processing_ray/runtime/ray/transform_launcher.py +117 -0
  12. data_processing_ray/runtime/ray/transform_orchestrator.py +159 -0
  13. data_processing_ray/runtime/ray/transform_runtime.py +53 -0
  14. data_processing_ray/runtime/ray/transform_statistics.py +76 -0
  15. data_processing_ray/test_support/transform/__init__.py +1 -0
  16. data_processing_ray/test_support/transform/noop_transform.py +45 -0
  17. data_processing_spark/runtime/spark/__init__.py +17 -0
  18. data_processing_spark/runtime/spark/execution_configuration.py +85 -0
  19. data_processing_spark/runtime/spark/runtime_configuration.py +37 -0
  20. data_processing_spark/runtime/spark/transform_file_processor.py +65 -0
  21. data_processing_spark/runtime/spark/transform_launcher.py +64 -0
  22. data_processing_spark/runtime/spark/transform_orchestrator.py +158 -0
  23. data_processing_spark/runtime/spark/transform_runtime.py +52 -0
  24. data_processing_spark/test_support/transform/__init__.py +13 -0
  25. data_processing_spark/test_support/transform/noop_transform.py +42 -0
  26. data_prep_toolkit-0.2.2.dev0.dist-info/METADATA +0 -56
  27. data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt +0 -1
  28. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,37 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.runtime import TransformRuntimeConfiguration
14
+ from data_processing.transform import TransformConfiguration
15
+ from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime
16
+
17
+
18
class SparkTransformRuntimeConfiguration(TransformRuntimeConfiguration):
    """
    Runtime configuration for transforms executed through the Spark runtime.
    Couples the generic transform configuration with the class used to build the transform runtime.
    """

    def __init__(
        self,
        transform_config: TransformConfiguration,
        runtime_class: type[DefaultSparkTransformRuntime] = DefaultSparkTransformRuntime,
    ):
        """
        Initialization
        :param transform_config: base configuration class
        :param runtime_class: implementation of the transform runtime
        """
        super().__init__(transform_config=transform_config)
        # remembered so that create_transform_runtime() can instantiate it later
        self.runtime_class = runtime_class

    def create_transform_runtime(self) -> DefaultSparkTransformRuntime:
        """
        Create transform runtime from the parameters held by the transform configuration
        :return: transform runtime object
        """
        transform_params = self.transform_config.get_transform_params()
        return self.runtime_class(transform_params)
@@ -0,0 +1,65 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ from data_processing.data_access import DataAccessFactoryBase
16
+ from data_processing.runtime import AbstractTransformFileProcessor
17
+ from data_processing.transform import TransformStatistics
18
+ from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
19
+ from data_processing.utils import UnrecoverableException
20
+
21
+
22
class SparkTransformFileProcessor(AbstractTransformFileProcessor):
    """
    This is the class implementing the actual work/actor processing of a single file
    """

    def __init__(
        self,
        data_access_factory: DataAccessFactoryBase,
        runtime_configuration: SparkTransformRuntimeConfiguration,
        statistics: TransformStatistics,
    ):
        """
        Init method
        :param data_access_factory: data access factory handed to the base file processor
        :param runtime_configuration: Spark runtime configuration, source of the transform parameters and class
        :param statistics: statistics object that receives the stats published by this processor
        """
        super().__init__(
            data_access_factory=data_access_factory,
            transform_parameters=runtime_configuration.get_transform_params(),
        )
        # Add data access and statistics to the processor parameters
        self.runtime_configuration = runtime_configuration
        # the transform itself is only built once create_transform() is called
        self.transform = None
        # set up statistics
        self.transform_params["statistics"] = statistics
        self.stats = statistics

    def create_transform(self, transform_parameters: dict[str, Any]):
        """
        Create transform
        :param transform_parameters: transform parameters
        :return: None
        :raises UnrecoverableException: if the transform class cannot be instantiated
        """
        # Create local processor
        try:
            self.transform = self.runtime_configuration.get_transform_class()(transform_parameters)
        except Exception as e:
            self.logger.error(f"Exception creating transform {e}")
            # chain the cause so the original traceback is not lost
            raise UnrecoverableException("failed creating transform") from e

    def _publish_stats(self, stats: dict[str, Any]) -> None:
        """
        Publish statistics (to the local dictionary)
        :param stats: statistics dictionary
        :return: None
        """
        self.stats.add_stats(stats)
@@ -0,0 +1,64 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+ import time
13
+
14
+ from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
15
+ from data_processing.runtime.transform_launcher import AbstractTransformLauncher
16
+ from data_processing.utils import get_logger
17
+ from data_processing_spark.runtime.spark import (
18
+ SparkTransformExecutionConfiguration,
19
+ orchestrate,
20
+ )
21
+ from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
22
+
23
+
24
+ logger = get_logger(__name__)
25
+
26
+
27
class SparkTransformLauncher(AbstractTransformLauncher):
    """
    Driver class starting Spark execution
    """

    def __init__(
        self,
        runtime_config: SparkTransformRuntimeConfiguration,
        data_access_factory: DataAccessFactoryBase | None = None,
    ):
        """
        Creates driver
        :param runtime_config: transform runtime factory
        :param data_access_factory: the factory to create DataAccess instances.
            When omitted, a new DataAccessFactory is created for this launcher.
        """
        # Build the default per instance: a default evaluated in the signature
        # would be a single object shared by every launcher in the process.
        if data_access_factory is None:
            data_access_factory = DataAccessFactory()
        super().__init__(runtime_config, data_access_factory)
        self.execution_config = SparkTransformExecutionConfiguration(name=runtime_config.get_name())

    def _submit_for_execution(self) -> int:
        """
        Submit for execution
        :return: the orchestrator's result code, or 1 if orchestration raised an exception
        """
        # pessimistic default, replaced only when the orchestrator returns
        res = 1
        start = time.time()
        try:
            logger.debug("Starting orchestrator")
            res = orchestrate(
                data_access_factory=self.data_access_factory,
                runtime_config=self.runtime_config,
                execution_configuration=self.execution_config,
            )
            logger.debug("Completed orchestrator")
        except Exception as e:
            # a failed run is an error, not an informational event
            logger.error(f"Exception running orchestration\n{e}")
        finally:
            logger.info(f"Completed execution in {(time.time() - start)/60.} min, execution result {res}")
        return res
@@ -0,0 +1,158 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import time
14
+ import traceback
15
+ from datetime import datetime
16
+
17
+ from data_processing.data_access import DataAccessFactoryBase
18
+ from data_processing.transform import TransformStatistics
19
+ from data_processing.utils import GB, get_logger
20
+ from data_processing_spark.runtime.spark import (
21
+ SparkTransformFileProcessor,
22
+ SparkTransformRuntimeConfiguration,
23
+ SparkTransformExecutionConfiguration,
24
+ )
25
+ from pyspark import SparkConf, SparkContext
26
+
27
+
28
+ logger = get_logger(__name__)
29
+
30
+
31
def orchestrate(
    runtime_config: SparkTransformRuntimeConfiguration,
    execution_configuration: SparkTransformExecutionConfiguration,
    data_access_factory: DataAccessFactoryBase,
) -> int:
    """
    orchestrator for transformer execution
    :param data_access_factory: data access factory
    :param runtime_config: transformer runtime configuration
    :param execution_configuration: orchestrator configuration
    :return: 0 - success (also returned when there are no input files) or 1 - failure
    """
    start_time = time.time()
    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logger.info(f"orchestrator started at {start_ts}")
    # create data access
    data_access = data_access_factory.create_data_access()
    if data_access is None:
        logger.error("No DataAccess instance provided - exiting")
        return 1
    # initialize Spark
    conf = SparkConf().setAppName(runtime_config.get_name()).set("spark.driver.host", "127.0.0.1")
    sc = SparkContext(conf=conf)
    # Everything that uses the context sits inside try/finally so the context is
    # stopped on every exit path, including the early "no input files" return.
    try:
        spark_runtime_config = sc.broadcast(runtime_config)
        daf = sc.broadcast(data_access_factory)

        def process_partition(iterator):
            """
            process partitions
            :param iterator: iterator of (file name, index) records produced by zipWithIndex
            :return: list of (statistic name, value) pairs for this partition
            """
            # local statistics dictionary
            statistics = TransformStatistics()
            # create transformer runtime
            d_access_factory = daf.value
            runtime_conf = spark_runtime_config.value
            runtime = runtime_conf.create_transform_runtime()
            # create file processor
            file_processor = SparkTransformFileProcessor(
                data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
            )
            first = True
            for f in iterator:
                # for every file
                if first:
                    logger.debug(f"partition {f}")
                    # add additional parameters; f[1] is the zipWithIndex index of the
                    # first record seen by this partition and is passed as `partition`
                    transform_params = runtime.get_transform_config(
                        partition=int(f[1]), data_access_factory=d_access_factory, statistics=statistics
                    )
                    # create transform with partition number
                    file_processor.create_transform(transform_params)
                    first = False
                # process file
                file_processor.process_file(f_name=f[0])
            # flush
            # NOTE(review): this also runs for an empty partition, where no transform
            # was created - confirm the base file processor tolerates that.
            file_processor.flush()
            # enhance statistics
            runtime.compute_execution_stats(statistics)
            # return partition's statistics
            return list(statistics.get_execution_stats().items())

        num_partitions = 0
        try:
            # Get files to process
            files, profile, _ = data_access.get_files_to_process()
            if len(files) == 0:
                logger.error("No input files to process - exiting")
                return 0
            logger.info(f"Number of files is {len(files)}, source profile {profile}")
            # process data
            logger.debug("Begin processing files")
            # process files split by partitions
            logger.debug(f"parallelization {execution_configuration.parallelization}")
            if execution_configuration.parallelization > 0:
                source_rdd = sc.parallelize(files, execution_configuration.parallelization)
            else:
                source_rdd = sc.parallelize(files)
            num_partitions = source_rdd.getNumPartitions()
            logger.info(f"Parallelizing execution. Using {num_partitions} partitions")
            stats_rdd = source_rdd.zipWithIndex().mapPartitions(process_partition)
            # build overall statistics
            stats = dict(stats_rdd.reduceByKey(lambda a, b: a + b).collect())
            return_code = 0
            status = "success"
        except Exception as e:
            # process execution exception; format_exc() returns the traceback text,
            # whereas print_exc() returns None and would log the literal "None"
            logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
            return_code = 1
            status = "failure"
            stats = {}
        try:
            # build and save metadata
            logger.debug("Building job metadata")
            cpus = sc.defaultParallelism
            executors = sc._jsc.sc().getExecutorMemoryStatus()
            # convert once instead of on every loop iteration
            executor_list = executors.toList()
            memory = 0.0
            for i in range(executors.size()):
                memory += executor_list.apply(i)._2()._1()
            resources = {"cpus": cpus, "gpus": 0, "memory": round(memory / GB, 2), "object_store": 0}
            input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params()
            metadata = {
                "pipeline": execution_configuration.pipeline_id,
                "job details": execution_configuration.job_details
                | {
                    "start_time": start_ts,
                    "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    "status": status,
                },
                "code": execution_configuration.code_location,
                "job_input_params": input_params | data_access_factory.get_input_params(),
                "execution_stats": {
                    "num partitions": num_partitions,
                    "execution time, min": round((time.time() - start_time) / 60, 3),
                }
                | resources,
                "job_output_stats": stats,
            }
            logger.debug(f"Saving job metadata: {metadata}.")
            data_access.save_job_metadata(metadata)
            logger.debug("Saved job metadata.")
            return return_code
        except Exception as e:
            logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
            return 1
    finally:
        # stop spark context at the end. Required for running multiple tests
        sc.stop()
@@ -0,0 +1,52 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ from data_processing.data_access import DataAccessFactoryBase
16
+ from data_processing.transform import TransformStatistics
17
+
18
+
19
class DefaultSparkTransformRuntime:
    """
    Transformer runtime used by processor to create Transform specific environment
    """

    def __init__(self, params: dict[str, Any]):
        """
        Create/config this runtime.
        :param params: parameters, often provided by the CLI arguments as defined by a TransformConfiguration.
        """
        self.params = params

    def get_transform_config(
        self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
    ) -> dict[str, Any]:
        """
        Get the dictionary of configuration that will be provided to the transform's initializer.
        This is the opportunity for this runtime to create a new set of configuration based on the
        config/params provided to this instance's initializer. Subclasses may add configuration data
        that is needed and expected by the transform in its initializer and/or transform() methods.
        The default implementation ignores all three arguments and returns the stored params as is.
        :param partition: partition identifier supplied by the caller
        :param data_access_factory: data access factory class being used by the caller
        :param statistics: statistics object supplied by the caller
        :return: dictionary of transform init params
        """
        return self.params

    def compute_execution_stats(self, stats: TransformStatistics) -> None:
        """
        Update/augment the given statistics object with runtime-specific additions/modifications.
        The default implementation does nothing.
        :param stats: statistics object to update in place
        :return: None
        """
@@ -0,0 +1,13 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration
@@ -0,0 +1,42 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+ from data_processing.test_support.transform.noop_transform import (
13
+ NOOPTransformConfiguration,
14
+ )
15
+ from data_processing.utils import get_logger
16
+ from data_processing_spark.runtime.spark import SparkTransformLauncher
17
+ from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration
18
+
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
class NOOPSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
    """
    Spark runtime configuration for the NOOP transform, suitable for passing to SparkTransformLauncher.
    Only the base NOOP transform configuration is supplied; the runtime class is left at the
    superclass default.
    """

    def __init__(self):
        """
        Initialization
        """
        noop_config = NOOPTransformConfiguration()
        super().__init__(transform_config=noop_config)
35
+
36
+
37
if __name__ == "__main__":
    # build the NOOP configuration and hand it to the Spark launcher
    noop_runtime_config = NOOPSparkTransformConfiguration()
    launcher = SparkTransformLauncher(runtime_config=noop_runtime_config)
    logger.info("Launching noop transform")
    # Launch the Spark job to process the input
    launcher.launch()
@@ -1,56 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: data_prep_toolkit
3
- Version: 0.2.2.dev0
4
- Summary: Data Preparation Toolkit Library
5
- Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
- License: Apache-2.0
7
- Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: numpy<1.29.0
11
- Requires-Dist: pyarrow==16.1.0
12
- Requires-Dist: boto3==1.34.69
13
- Requires-Dist: argparse
14
- Requires-Dist: mmh3
15
- Provides-Extra: dev
16
- Requires-Dist: twine; extra == "dev"
17
- Requires-Dist: pytest>=7.3.2; extra == "dev"
18
- Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
19
- Requires-Dist: pytest-env>=1.0.0; extra == "dev"
20
- Requires-Dist: pre-commit>=3.3.2; extra == "dev"
21
- Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
22
- Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
23
- Requires-Dist: moto==5.0.5; extra == "dev"
24
- Requires-Dist: markupsafe==2.0.1; extra == "dev"
25
-
26
- # Data Processing Library
27
- This provides a python framework for developing _transforms_
28
- on data stored in files - currently parquet files are supported -
29
- and running them in a [ray](https://www.ray.io/) cluster.
30
- Data files may be stored in the local file system or COS/S3.
31
- For more details see the [documentation](../doc/overview.md).
32
-
33
- ### Virtual Environment
34
- The project uses `pyproject.toml` and a Makefile for operations.
35
- To do development you should establish the virtual environment
36
- ```shell
37
- make venv
38
- ```
39
- and then either activate
40
- ```shell
41
- source venv/bin/activate
42
- ```
43
- or set up your IDE to use the venv directory when developing in this project
44
-
45
- ## Library Artifact Build and Publish
46
- To test, build and publish the library
47
- ```shell
48
- make test build publish
49
- ```
50
-
51
- To up the version number, edit the Makefile to change VERSION and rerun
52
- the above. This will require committing both the `Makefile` and the
53
- automatically updated `pyproject.toml` file.
54
-
55
-
56
-
@@ -1 +0,0 @@
1
- data_processing