data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. data_prep_toolkit-0.2.2.dev1.dist-info/METADATA +33 -0
  2. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/RECORD +26 -5
  3. data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt +3 -0
  4. data_processing/runtime/pure_python/transform_orchestrator.py +24 -2
  5. data_processing_ray/runtime/ray/__init__.py +9 -0
  6. data_processing_ray/runtime/ray/execution_configuration.py +110 -0
  7. data_processing_ray/runtime/ray/ray_utils.py +246 -0
  8. data_processing_ray/runtime/ray/runtime_configuration.py +37 -0
  9. data_processing_ray/runtime/ray/transform_file_processor.py +53 -0
  10. data_processing_ray/runtime/ray/transform_invoker.py +103 -0
  11. data_processing_ray/runtime/ray/transform_launcher.py +117 -0
  12. data_processing_ray/runtime/ray/transform_orchestrator.py +159 -0
  13. data_processing_ray/runtime/ray/transform_runtime.py +53 -0
  14. data_processing_ray/runtime/ray/transform_statistics.py +76 -0
  15. data_processing_ray/test_support/transform/__init__.py +1 -0
  16. data_processing_ray/test_support/transform/noop_transform.py +45 -0
  17. data_processing_spark/runtime/spark/__init__.py +17 -0
  18. data_processing_spark/runtime/spark/execution_configuration.py +85 -0
  19. data_processing_spark/runtime/spark/runtime_configuration.py +37 -0
  20. data_processing_spark/runtime/spark/transform_file_processor.py +65 -0
  21. data_processing_spark/runtime/spark/transform_launcher.py +64 -0
  22. data_processing_spark/runtime/spark/transform_orchestrator.py +158 -0
  23. data_processing_spark/runtime/spark/transform_runtime.py +52 -0
  24. data_processing_spark/test_support/transform/__init__.py +13 -0
  25. data_processing_spark/test_support/transform/noop_transform.py +42 -0
  26. data_prep_toolkit-0.2.2.dev0.dist-info/METADATA +0 -56
  27. data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt +0 -1
  28. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,103 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ from data_processing.runtime.pure_python import invoke_transform
16
+ from data_processing.utils import (
17
+ PipInstaller,
18
+ TransformRuntime,
19
+ TransformsConfiguration,
20
+ get_logger,
21
+ )
22
+ from data_processing_ray.runtime.ray import RayTransformLauncher
23
+
24
+
25
+ project = "https://github.com/IBM/data-prep-kit.git"
26
+ logger = get_logger(__name__)
27
+
28
+
29
def execute_ray_transform(
    configuration: TransformsConfiguration,
    name: str,
    params: dict[str, Any],
    input_folder: str,
    output_folder: str,
    s3_config: dict[str, Any] = None,
) -> bool:
    """
    Execute a transform on the Ray runtime, installing the required libraries on demand
    and removing whatever this call installed once the transform has run.
    :param configuration: transforms configuration
    :param name: transform name
    :param params: transform params
    :param input_folder: input folder (local or S3)
    :param output_folder: output folder (local or S3)
    :param s3_config: S3 configuration - None for local data
    :return: True if the transform invocation returned 0, False otherwise (including when
        the transform has no Ray configuration or a library installation fails)
    """
    # get transform configuration
    r_subdirectory, r_l_name, extra_libraries, t_class = configuration.get_configuration(
        transform=name, runtime=TransformRuntime.RAY
    )
    if r_subdirectory is None:
        return False
    p_subdirectory, p_l_name, _, _ = configuration.get_configuration(transform=name, runtime=TransformRuntime.PYTHON)

    installer = PipInstaller()
    # Ray library can depend on the Python one; if this is the case install the Python one first.
    # NOTE(review): the check is keyed on the Ray library being absent, not the Python one - confirm intended.
    p_installed = False
    if p_subdirectory is not None and not installer.validate(name=r_l_name):
        # install corresponding python transform library, if required
        if not installer.install(project=project, subdirectory=p_subdirectory, name=p_l_name):
            logger.warning(f"failed to install transform {name}")
            return False
        p_installed = True

    # Check if transformer already installed
    r_installed = False
    if not installer.validate(name=r_l_name):
        # transformer is not installed, install it
        if not installer.install(project=project, subdirectory=r_subdirectory, name=r_l_name):
            logger.warning(f"failed to install transform {name}")
            # roll back whatever was installed on behalf of this call
            if p_installed:
                installer.uninstall(name=p_l_name)
            for library in extra_libraries:
                installer.uninstall(name=library)
            return False
        r_installed = True

    try:
        # invoke transform
        res = invoke_transform(
            name=name,
            t_class=t_class,
            launcher=RayTransformLauncher,
            input_folder=input_folder,
            output_folder=output_folder,
            s3_config=s3_config,
            params=params | {"run_locally": True},
        )
    finally:
        # clean up even when the invocation raises, so temporary installs do not linger
        if p_installed:
            # we installed the python transformer, uninstall it
            if not installer.uninstall(name=p_l_name):
                logger.warning(f"failed uninstall transform {p_l_name}")
        if r_installed:
            # we installed the ray transformer, uninstall it
            if not installer.uninstall(name=r_l_name):
                logger.warning(f"failed uninstall transform {r_l_name}")
            # uninstall support libraries
            for library in extra_libraries:
                if not installer.uninstall(name=library):
                    logger.warning(f"failed uninstall transform {library}")
    if res == 0:
        return True
    logger.warning(f"failed execution of transform {name}")
    return False
@@ -0,0 +1,117 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import argparse
14
+ import sys
15
+ import time
16
+
17
+ import ray
18
+ from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
19
+ from data_processing.runtime.transform_launcher import AbstractTransformLauncher
20
+ from data_processing.utils import get_logger, str2bool
21
+ from data_processing_ray.runtime.ray import (
22
+ RayTransformExecutionConfiguration,
23
+ RayTransformRuntimeConfiguration,
24
+ orchestrate,
25
+ )
26
+
27
+
28
+ logger = get_logger(__name__)
29
+
30
+
31
class RayTransformLauncher(AbstractTransformLauncher):
    """
    Driver class starting transform execution on Ray
    """

    def __init__(
        self,
        runtime_config: RayTransformRuntimeConfiguration,
        data_access_factory: DataAccessFactoryBase = None,
    ):
        """
        Creates driver
        :param runtime_config: transform runtime factory
        :param data_access_factory: the factory to create DataAccess instances.
            When omitted, a new DataAccessFactory is created for this launcher.
        """
        # Build the default per instance: a default evaluated in the signature would be
        # created once at definition time and shared by every launcher.
        if data_access_factory is None:
            data_access_factory = DataAccessFactory()
        super().__init__(runtime_config, data_access_factory)
        self.execution_config = RayTransformExecutionConfiguration(name=self.name)

    def _get_arguments(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
        """
        Parse input parameters
        :param parser: parser
        :return: parsed arguments
        """
        parser.add_argument(
            "--run_locally", type=lambda x: bool(str2bool(x)), default=False, help="running ray local flag"
        )
        return super()._get_arguments(parser)

    def _get_parameters(self, args: argparse.Namespace) -> bool:
        """
        This method applies the parsed arguments and does parameters validation
        :param args: parsed arguments
        :return: True if validation passes or False, if not
        """
        result = super()._get_parameters(args)
        self.run_locally = args.run_locally
        if self.run_locally:
            logger.info("Running locally")
        else:
            logger.info("connecting to existing cluster")
        return result

    def _submit_for_execution(self) -> int:
        """
        Submit for Ray execution
        :return: result returned by the remote orchestrator; 1 if an exception occurred
        """
        # assume failure until the orchestrator reports otherwise
        res = 1
        start = time.time()
        try:
            if self.run_locally:
                # Will create a local Ray cluster
                logger.debug("running locally creating Ray cluster")
                # enable metrics for local Ray
                ray.init(_metrics_export_port=8088)
            else:
                # connect to the existing cluster
                logger.info("Connecting to the existing Ray cluster")
                ray.init("ray://localhost:10001", ignore_reinit_error=True)
            logger.debug("Starting orchestrator")
            res = ray.get(
                orchestrate.remote(
                    preprocessing_params=self.execution_config,
                    data_access_factory=self.data_access_factory,
                    runtime_config=self.runtime_config,
                )
            )
            logger.debug("Completed orchestrator")
            # NOTE(review): fixed pause before shutdown; presumably lets metrics/logs drain - confirm
            time.sleep(10)
        except Exception as e:
            # a failure here is an error, not routine information
            logger.error(f"Exception running ray remote orchestration\n{e}")
        finally:
            logger.info(f"Completed execution in {round((time.time() - start)/60., 3)} min, execution result {res}")
            ray.shutdown()
        return res

    def launch(self) -> int:
        """
        Execute method orchestrates driver invocation
        :return: launch result
        """
        res = super().launch()
        if not self.run_locally and res > 0:
            # if we are running in kfp exit to signal kfp that we failed
            sys.exit(1)
        return res
@@ -0,0 +1,159 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import time
14
+ import traceback
15
+ from datetime import datetime
16
+
17
+ import ray
18
+ from data_processing.data_access import DataAccessFactoryBase
19
+ from data_processing_ray.runtime.ray import (
20
+ RayTransformExecutionConfiguration,
21
+ RayTransformFileProcessor,
22
+ RayTransformRuntimeConfiguration,
23
+ RayUtils,
24
+ TransformStatisticsRay,
25
+ )
26
+ from ray.util import ActorPool
27
+
28
+
29
@ray.remote(num_cpus=1, scheduling_strategy="SPREAD")
def orchestrate(
    preprocessing_params: RayTransformExecutionConfiguration,
    data_access_factory: DataAccessFactoryBase,
    runtime_config: RayTransformRuntimeConfiguration,
) -> int:
    """
    orchestrator for transformer execution: creates the processing actors, pushes the input
    files through them, then builds and saves the job metadata
    :param preprocessing_params: orchestrator configuration
    :param data_access_factory: data access factory
    :param runtime_config: transformer runtime configuration
    :return: 0 - success or 1 - failure
    """

    from data_processing.utils import get_logger
    from ray.util.metrics import Gauge

    logger = get_logger(__name__)
    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    start_time = time.time()
    logger.info(f"orchestrator started at {start_ts}")
    # create data access
    data_access = data_access_factory.create_data_access()
    if data_access is None:
        logger.error("No DataAccess instance provided - exiting")
        return 1
    statistics = TransformStatisticsRay.remote({})
    # create transformer runtime
    runtime = runtime_config.create_transform_runtime()
    resources = RayUtils.get_cluster_resources()
    try:
        # Get files to process
        files, profile, retries = data_access.get_files_to_process()
        if len(files) == 0:
            # nothing to do: returns before any statistics or metadata are produced
            logger.error("No input files to process - exiting")
            return 0
        logger.info(f"Number of files is {len(files)}, source profile {profile}")
        # Print interval: about every 1% of the files, but at least every file
        print_interval = max(1, len(files) // 100)
        # Get Resources for execution
        logger.info(f"Cluster resources: {resources}")
        # print execution params
        logger.info(
            f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each"
        )
        # log retries
        if retries > 0:
            statistics.add_stats.remote({"data access retries": retries})
        # create executors
        processor_params = {
            "data_access_factory": data_access_factory,
            "transform_class": runtime_config.get_transform_class(),
            "transform_params": runtime.get_transform_config(
                data_access_factory=data_access_factory, statistics=statistics, files=files
            ),
            "statistics": statistics,
        }
        logger.debug("Creating actors")
        processors = RayUtils.create_actors(
            clazz=RayTransformFileProcessor,
            params=processor_params,
            actor_options=preprocessing_params.worker_options,
            n_actors=preprocessing_params.n_workers,
            creation_delay=preprocessing_params.creation_delay,
        )
        processors_pool = ActorPool(processors)
        # create gauges
        files_in_progress_gauge = Gauge("files_in_progress", "Number of files in progress")
        files_completed_gauge = Gauge("files_processed_total", "Number of files completed")
        available_cpus_gauge = Gauge("available_cpus", "Number of available CPUs")
        available_gpus_gauge = Gauge("available_gpus", "Number of available GPUs")
        available_memory_gauge = Gauge("available_memory", "Available memory")
        available_object_memory_gauge = Gauge("available_object_store", "Available object store")
        # process data
        logger.debug("Begin processing files")
        failures = RayUtils.process_files(
            executors=processors_pool,
            files=files,
            print_interval=print_interval,
            files_in_progress_gauge=files_in_progress_gauge,
            files_completed_gauge=files_completed_gauge,
            available_cpus_gauge=available_cpus_gauge,
            available_gpus_gauge=available_gpus_gauge,
            available_memory_gauge=available_memory_gauge,
            object_memory_gauge=available_object_memory_gauge,
            logger=logger,
        )
        if failures > 0:
            statistics.add_stats.remote({"actor failures": failures})
        logger.debug("Done processing files, waiting for flush() completion.")
        # invoke flush to ensure that all results are returned
        start = time.time()
        replies = [processor.flush.remote() for processor in processors]
        failures = RayUtils.wait_for_execution_completion(logger=logger, replies=replies)
        if failures > 0:
            statistics.add_stats.remote({"actor failures": failures})
        logger.info(f"done flushing in {round(time.time() - start, 3)} sec")
        status = "success"
        return_code = 0
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None and would log "None"
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        status = "failure"
        return_code = 1
    try:
        # Compute execution statistics
        logger.debug("Computing execution stats")
        stats = runtime.compute_execution_stats(ray.get(statistics.get_execution_stats.remote()))
        stats["processing_time"] = round(stats["processing_time"], 3)

        # build and save metadata
        logger.debug("Building job metadata")
        metadata = {
            "pipeline": preprocessing_params.pipeline_id,
            "job details": preprocessing_params.job_details
            | {"start_time": start_ts, "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "status": status},
            "code": preprocessing_params.code_location,
            "job_input_params": runtime_config.get_transform_metadata()
            | data_access_factory.get_input_params()
            | preprocessing_params.get_input_params(),
            "execution_stats": resources | {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
            "job_output_stats": stats,
        }
        logger.debug(f"Saving job metadata: {metadata}.")
        data_access.save_job_metadata(metadata)
        logger.debug("Saved job metadata.")
        return return_code
    except Exception as e:
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        return 1
@@ -0,0 +1,53 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ from data_processing.data_access import DataAccessFactoryBase
16
+ from ray.actor import ActorHandle
17
+
18
+
19
class DefaultRayTransformRuntime:
    """
    Default transform runtime: the hook through which a transform-specific environment
    can be prepared. This base implementation passes everything through unchanged.
    """

    def __init__(self, params: dict[str, Any]):
        """
        Keep the supplied parameters for later use.
        :param params: runtime parameters, stored as-is on the instance.
        """
        self.params = params

    def get_transform_config(
        self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
    ) -> dict[str, Any]:
        """
        Produce the configuration dictionary handed to the transform's initializer.
        Subclasses may override this to derive additional configuration from the
        arguments; this default ignores all three arguments and returns the parameters
        given at construction.
        :param data_access_factory: data access factory in use (unused here)
        :param statistics: reference to the statistics actor (unused here)
        :param files: list of files to process (unused here)
        :return: dictionary of transform init params
        """
        return self.params

    def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
        """
        Hook for adjusting the aggregated statistics with runtime-specific values.
        This default returns the given object untouched.
        :param stats: aggregated execution statistics
        :return: job execution statistics
        """
        return stats
@@ -0,0 +1,76 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ import ray
16
+ from data_processing.transform import TransformStatistics
17
+
18
+
19
@ray.remote(num_cpus=0.25, scheduling_strategy="SPREAD")
class TransformStatisticsRay(TransformStatistics):
    """
    Basic statistics class collecting basic execution statistics.
    In addition to accumulating the statistics, it mirrors selected ones into Ray metric counters.
    It can be extended for specific processors
    """

    # statistics key -> name of the attribute holding the Ray Counter incremented for that key.
    # NOTE(review): data_write_counter has no entry because no key incremented it in the
    # original code - confirm whether a written-bytes statistic should feed it.
    _COUNTER_ATTRIBUTES = {
        "source_files": "source_files_counter",
        "source_size": "data_read_counter",
        "result_files": "result_files_counter",
        "source_doc_count": "source_documents_counter",
        "result_doc_count": "result_documents_counter",
        "skipped empty tables": "empty_table_counter",
        "failed_reads": "failed_read_counter",
        "failed_writes": "failed_write_counter",
        "transform execution exception": "transform_exceptions_counter",
        "data access retries": "data_retries_counter",
    }

    def __init__(self, params: dict[str, Any]):
        """
        Create the statistics holder and its Ray metric counters
        :param params: parameters (not used by this implementation)
        """
        from ray.util.metrics import Counter

        super().__init__()
        self.data_write_counter = Counter(name="data_written", description="Total data written bytes")
        self.data_read_counter = Counter(name="data_read", description="Total data read bytes")
        self.source_files_counter = Counter(name="source_files_processed", description="Total source files processed")
        self.result_files_counter = Counter(name="result_files_written", description="Total result files written")
        self.source_documents_counter = Counter(
            name="source_documents_processed", description="Total source document processed"
        )
        self.result_documents_counter = Counter(
            name="result_documents_written", description="Total result documents written"
        )
        self.empty_table_counter = Counter(name="empty_tables", description="Total empty tables read")
        self.failed_read_counter = Counter(name="failed_read_files", description="Total read failed files")
        self.failed_write_counter = Counter(name="failed_write_files", description="Total write failed files")
        self.transform_exceptions_counter = Counter(
            name="transform_exceptions", description="Transform exception occurred"
        )
        self.data_retries_counter = Counter(name="data_access_retries", description="Data access retries")

    def add_stats(self, stats: dict[str, Any]) -> None:
        """
        Add statistics
        :param stats - dictionary of statistic name to the amount to add
        :return: None
        """
        for key, val in stats.items():
            self.stats[key] = self.stats.get(key, 0) + val
            # counters are only incremented by positive amounts
            if val > 0:
                counter_attribute = self._COUNTER_ATTRIBUTES.get(key)
                if counter_attribute is not None:
                    getattr(self, counter_attribute).inc(val)
@@ -0,0 +1 @@
1
+ from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration
@@ -0,0 +1,45 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+
14
+ from data_processing.test_support.transform.noop_transform import (
15
+ NOOPTransformConfiguration,
16
+ )
17
+ from data_processing.utils import get_logger
18
+ from data_processing_ray.runtime.ray import (
19
+ RayTransformLauncher,
20
+ RayTransformRuntimeConfiguration,
21
+ )
22
+
23
+
24
+ logger = get_logger(__name__)
25
+
26
+
27
class NOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
    """
    Ray runtime configuration for the NOOP transform, in the form expected by the
    RayTransformLauncher. It only wraps the base python NOOP transform configuration;
    no Ray-specific runtime class is supplied.
    """

    def __init__(self):
        """
        Build the configuration around a fresh NOOPTransformConfiguration
        """
        noop_config = NOOPTransformConfiguration()
        super().__init__(transform_config=noop_config)
39
+
40
+
41
if __name__ == "__main__":
    # Script entry point: build the Ray launcher for the NOOP transform and run it
    noop_launcher = RayTransformLauncher(NOOPRayTransformConfiguration())
    logger.info("Launching noop transform")
    noop_launcher.launch()
@@ -0,0 +1,17 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+ from data_processing_spark.runtime.spark.transform_runtime import DefaultSparkTransformRuntime
13
+ from data_processing_spark.runtime.spark.execution_configuration import SparkTransformExecutionConfiguration
14
+ from data_processing_spark.runtime.spark.runtime_configuration import SparkTransformRuntimeConfiguration
15
+ from data_processing_spark.runtime.spark.transform_file_processor import SparkTransformFileProcessor
16
+ from data_processing_spark.runtime.spark.transform_orchestrator import orchestrate
17
+ from data_processing_spark.runtime.spark.transform_launcher import SparkTransformLauncher
@@ -0,0 +1,85 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import argparse
14
+ from typing import Any
15
+
16
+ from data_processing.runtime import TransformExecutionConfiguration, runtime_cli_prefix
17
+ from data_processing.utils import CLIArgumentProvider, get_logger
18
+
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
class SparkTransformExecutionConfiguration(TransformExecutionConfiguration):
    """
    Declares, validates and stores the configuration of the Spark orchestrator
    """

    def __init__(self, name: str):
        """
        Initialization
        :param name: job name, passed to the base configuration
        """
        super().__init__(name=name, print_params=False)
        # -1 is also the CLI default for this parameter
        self.parallelization = -1

    def add_input_params(self, parser: argparse.ArgumentParser) -> None:
        """
        Register the Spark specific command line parameter, then the base ones
        :param parser: parser
        :return: None
        """
        # Parallelization determines how many partitions the RDD should be divided into. See
        # https://sparktpoint.com/how-to-create-rdd-using-parallelize/ for the explanation
        # of this parameter.
        # With a positive value Spark will attempt to evenly distribute the data into that
        # many partitions, e.g. 100 elements with a value of 4 gives about 25 elements per
        # partition. When the parameter is not specified, Spark will use a default value,
        # typically based on the cluster configuration or the available resources
        # (number of workers).
        option_name = f"--{runtime_cli_prefix}parallelization"
        parser.add_argument(option_name, type=int, default=-1, help="parallelization.")
        return TransformExecutionConfiguration.add_input_params(self, parser=parser)

    def apply_input_params(self, args: argparse.Namespace) -> bool:
        """
        Validate the base parameters and store the Spark specific ones
        :param args: user defined arguments
        :return: True, if validation passes or False otherwise
        """
        base_valid = TransformExecutionConfiguration.apply_input_params(self, args=args)
        if not base_valid:
            return False
        runtime_args = CLIArgumentProvider.capture_parameters(args, runtime_cli_prefix, False)
        # store parameters locally
        self.job_details = {
            "job category": "preprocessing",
            "job name": self.name,
            "job type": "spark",
            "job id": runtime_args["job_id"],
        }
        self.parallelization = runtime_args["parallelization"]
        logger.info(f"job details {self.job_details}")
        logger.info(f"RDD parallelization {self.parallelization}")
        return True

    def get_input_params(self) -> dict[str, Any]:
        """
        get input parameters for job_input_params in metadata
        :return: dictionary of parameters
        """
        input_params = {"RDD parallelization": self.parallelization}
        return input_params