data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_prep_toolkit-0.2.2.dev1.dist-info/METADATA +33 -0
- {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/RECORD +26 -5
- data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt +3 -0
- data_processing/runtime/pure_python/transform_orchestrator.py +24 -2
- data_processing_ray/runtime/ray/__init__.py +9 -0
- data_processing_ray/runtime/ray/execution_configuration.py +110 -0
- data_processing_ray/runtime/ray/ray_utils.py +246 -0
- data_processing_ray/runtime/ray/runtime_configuration.py +37 -0
- data_processing_ray/runtime/ray/transform_file_processor.py +53 -0
- data_processing_ray/runtime/ray/transform_invoker.py +103 -0
- data_processing_ray/runtime/ray/transform_launcher.py +117 -0
- data_processing_ray/runtime/ray/transform_orchestrator.py +159 -0
- data_processing_ray/runtime/ray/transform_runtime.py +53 -0
- data_processing_ray/runtime/ray/transform_statistics.py +76 -0
- data_processing_ray/test_support/transform/__init__.py +1 -0
- data_processing_ray/test_support/transform/noop_transform.py +45 -0
- data_processing_spark/runtime/spark/__init__.py +17 -0
- data_processing_spark/runtime/spark/execution_configuration.py +85 -0
- data_processing_spark/runtime/spark/runtime_configuration.py +37 -0
- data_processing_spark/runtime/spark/transform_file_processor.py +65 -0
- data_processing_spark/runtime/spark/transform_launcher.py +64 -0
- data_processing_spark/runtime/spark/transform_orchestrator.py +158 -0
- data_processing_spark/runtime/spark/transform_runtime.py +52 -0
- data_processing_spark/test_support/transform/__init__.py +13 -0
- data_processing_spark/test_support/transform/noop_transform.py +42 -0
- data_prep_toolkit-0.2.2.dev0.dist-info/METADATA +0 -56
- data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt +0 -1
- {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.runtime.pure_python import invoke_transform
|
|
16
|
+
from data_processing.utils import (
|
|
17
|
+
PipInstaller,
|
|
18
|
+
TransformRuntime,
|
|
19
|
+
TransformsConfiguration,
|
|
20
|
+
get_logger,
|
|
21
|
+
)
|
|
22
|
+
from data_processing_ray.runtime.ray import RayTransformLauncher
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Git repository URL handed to PipInstaller.install() as its `project` argument
# whenever a transform library has to be installed.
project = "https://github.com/IBM/data-prep-kit.git"
|
|
26
|
+
# Logger for this module, obtained from the project's get_logger helper using the module name.
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def execute_ray_transform(
    configuration: TransformsConfiguration,
    name: str,
    params: dict[str, Any],
    input_folder: str,
    output_folder: str,
    s3_config: dict[str, Any] = None,
) -> bool:
    """
    Install (when necessary), run and then clean up a Ray transform.

    The Ray transform library can depend on the matching pure-Python transform
    library, so when the configuration lists a Python subdirectory that library
    is installed first. After the run, a library is uninstalled only if this
    call performed the corresponding install; the extra libraries are removed
    together with the Ray library.

    :param configuration: transforms configuration
    :param name: transform name
    :param params: transform params; ``run_locally=True`` is added before invocation
    :param input_folder: input folder (local or S3)
    :param output_folder: output folder (local or S3)
    :param s3_config: S3 configuration - None local data
    :return: True if the transform ran and returned 0, False on any
        configuration, installation or execution failure
    """
    # get transform configuration
    r_subdirectory, r_l_name, extra_libraries, t_class = configuration.get_configuration(
        transform=name, runtime=TransformRuntime.RAY
    )
    if r_subdirectory is None:
        # no Ray runtime configured for this transform
        return False
    p_subdirectory, p_l_name, _, _ = configuration.get_configuration(transform=name, runtime=TransformRuntime.PYTHON)

    installer = PipInstaller()
    # Ray installer can depend on Python installer, if this is the case install Python one first
    # NOTE(review): the gate checks the *Ray* library (r_l_name), not the Python one;
    # presumably "Ray library present" implies its Python dependency is present too - confirm.
    p_installed = False
    if p_subdirectory is not None and not installer.validate(name=r_l_name):
        # install corresponding python transform library, if required
        if not installer.install(project=project, subdirectory=p_subdirectory, name=p_l_name):
            logger.warning(f"failed to install transform {name}")
            return False
        p_installed = True

    # Check if transformer already installed
    r_installed = False
    if not installer.validate(name=r_l_name):
        # transformer is not installed, install it
        if not installer.install(project=project, subdirectory=r_subdirectory, name=r_l_name):
            logger.warning(f"failed to install transform {name}")
            # best-effort roll back of whatever may have been installed so far
            if p_installed:
                installer.uninstall(name=p_l_name)
            for library in extra_libraries:
                installer.uninstall(name=library)
            return False
        r_installed = True

    # invoke transform
    res = invoke_transform(
        name=name,
        t_class=t_class,
        launcher=RayTransformLauncher,
        input_folder=input_folder,
        output_folder=output_folder,
        s3_config=s3_config,
        params=params | {"run_locally": True},
    )

    # clean up
    if p_installed:
        # we installed the python transform library, uninstall it
        if not installer.uninstall(name=p_l_name):
            # report the library that actually failed to uninstall (was r_l_name)
            logger.warning(f"failed uninstall transform {p_l_name}")
    if r_installed:
        # we installed transformer, uninstall it
        if not installer.uninstall(name=r_l_name):
            logger.warning(f"failed uninstall transform {r_l_name}")
        # uninstall support libraries
        for library in extra_libraries:
            if not installer.uninstall(name=library):
                logger.warning(f"failed uninstall transform {library}")

    if res == 0:
        return True
    logger.warning(f"failed execution of transform {name}")
    return False
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
|
|
17
|
+
import ray
|
|
18
|
+
from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
|
|
19
|
+
from data_processing.runtime.transform_launcher import AbstractTransformLauncher
|
|
20
|
+
from data_processing.utils import get_logger, str2bool
|
|
21
|
+
from data_processing_ray.runtime.ray import (
|
|
22
|
+
RayTransformExecutionConfiguration,
|
|
23
|
+
RayTransformRuntimeConfiguration,
|
|
24
|
+
orchestrate,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Logger for this module, obtained from the project's get_logger helper using the module name.
logger = get_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RayTransformLauncher(AbstractTransformLauncher):
    """
    Driver class that launches a transform on Ray, either on a Ray instance
    started by ``ray.init()`` in this process or on an already running cluster.
    """

    def __init__(
        self,
        runtime_config: RayTransformRuntimeConfiguration,
        data_access_factory: DataAccessFactoryBase = None,
    ):
        """
        Creates driver
        :param runtime_config: transform runtime factory
        :param data_access_factory: the factory to create DataAccess instances.
            When omitted, a fresh DataAccessFactory is created for this launcher
            (instead of one instance built at definition time and shared by all launchers).
        """
        if data_access_factory is None:
            data_access_factory = DataAccessFactory()
        super().__init__(runtime_config, data_access_factory)
        self.execution_config = RayTransformExecutionConfiguration(name=self.name)

    def _get_arguments(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
        """
        Register the Ray specific ``--run_locally`` flag and parse input parameters
        :param parser: parser
        :return: parsed arguments, as returned by the base class
        """
        parser.add_argument(
            "--run_locally", type=lambda x: bool(str2bool(x)), default=False, help="running ray local flag"
        )
        return super()._get_arguments(parser)

    def _get_parameters(self, args: argparse.Namespace) -> bool:
        """
        Let the base class capture and validate the parameters, then remember
        whether Ray should be started locally.
        :param args: parsed arguments
        :return: the validation result reported by the base class
        """
        result = super()._get_parameters(args)
        self.run_locally = args.run_locally
        if self.run_locally:
            logger.info("Running locally")
        else:
            logger.info("connecting to existing cluster")
        return result

    def _submit_for_execution(self) -> int:
        """
        Submit for Ray execution
        :return: the orchestrator result, or 1 if initialization or orchestration raised
        """
        res = 1
        start = time.time()
        try:
            if self.run_locally:
                # Will create a local Ray cluster
                logger.debug("running locally creating Ray cluster")
                # enable metrics for local Ray
                ray.init(_metrics_export_port=8088)
            else:
                # connect to the existing cluster
                logger.info("Connecting to the existing Ray cluster")
                ray.init("ray://localhost:10001", ignore_reinit_error=True)
            logger.debug("Starting orchestrator")
            res = ray.get(
                orchestrate.remote(
                    preprocessing_params=self.execution_config,
                    data_access_factory=self.data_access_factory,
                    runtime_config=self.runtime_config,
                )
            )
            logger.debug("Completed orchestrator")
            # NOTE(review): fixed pause before shutdown - presumably gives the metrics
            # exporter time to publish final values; confirm before changing.
            time.sleep(10)
        except Exception as e:
            # a failure here is an error, not routine information
            logger.error(f"Exception running ray remote orchestration\n{e}")
        finally:
            logger.info(f"Completed execution in {round((time.time() - start)/60., 3)} min, execution result {res}")
            ray.shutdown()
        return res

    def launch(self) -> int:
        """
        Execute method orchestrates driver invocation
        :return: launch result; when connected to an existing cluster and the
            result is greater than 0 the process exits with status 1 instead
        """
        res = super().launch()
        if not self.run_locally and res > 0:
            # if we are running in kfp exit to signal kfp that we failed
            sys.exit(1)
        return res
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
import traceback
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
|
|
17
|
+
import ray
|
|
18
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
19
|
+
from data_processing_ray.runtime.ray import (
|
|
20
|
+
RayTransformExecutionConfiguration,
|
|
21
|
+
RayTransformFileProcessor,
|
|
22
|
+
RayTransformRuntimeConfiguration,
|
|
23
|
+
RayUtils,
|
|
24
|
+
TransformStatisticsRay,
|
|
25
|
+
)
|
|
26
|
+
from ray.util import ActorPool
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@ray.remote(num_cpus=1, scheduling_strategy="SPREAD")
def orchestrate(
    preprocessing_params: RayTransformExecutionConfiguration,
    data_access_factory: DataAccessFactoryBase,
    runtime_config: RayTransformRuntimeConfiguration,
) -> int:
    """
    Orchestrator for transformer execution.

    Creates the data access object, statistics actor and transform runtime,
    fans the input files out over a pool of file-processor actors, flushes the
    processors and finally builds and saves the job metadata.

    :param preprocessing_params: orchestrator configuration
    :param data_access_factory: data access factory
    :param runtime_config: transformer runtime configuration
    :return: 0 - success or 1 - failure. An empty input file list returns 0
        immediately, without writing job metadata.
    """

    from data_processing.utils import get_logger
    from ray.util.metrics import Gauge

    logger = get_logger(__name__)
    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    start_time = time.time()
    logger.info(f"orchestrator started at {start_ts}")
    # create data access
    data_access = data_access_factory.create_data_access()
    if data_access is None:
        logger.error("No DataAccess instance provided - exiting")
        return 1
    statistics = TransformStatisticsRay.remote({})
    # create transformer runtime
    runtime = runtime_config.create_transform_runtime()
    resources = RayUtils.get_cluster_resources()
    try:
        # Get files to process
        files, profile, retries = data_access.get_files_to_process()
        if len(files) == 0:
            logger.error("No input files to process - exiting")
            return 0
        logger.info(f"Number of files is {len(files)}, source profile {profile}")
        # Print interval: roughly every 1% of the files, but at least every file
        print_interval = max(1, len(files) // 100)
        # Get Resources for execution
        logger.info(f"Cluster resources: {resources}")
        # print execution params
        logger.info(
            f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each"
        )
        # log retries
        if retries > 0:
            statistics.add_stats.remote({"data access retries": retries})
        # create executors
        processor_params = {
            "data_access_factory": data_access_factory,
            "transform_class": runtime_config.get_transform_class(),
            "transform_params": runtime.get_transform_config(
                data_access_factory=data_access_factory, statistics=statistics, files=files
            ),
            "statistics": statistics,
        }
        logger.debug("Creating actors")
        processors = RayUtils.create_actors(
            clazz=RayTransformFileProcessor,
            params=processor_params,
            actor_options=preprocessing_params.worker_options,
            n_actors=preprocessing_params.n_workers,
            creation_delay=preprocessing_params.creation_delay,
        )
        processors_pool = ActorPool(processors)
        # create gauges
        files_in_progress_gauge = Gauge("files_in_progress", "Number of files in progress")
        files_completed_gauge = Gauge("files_processed_total", "Number of files completed")
        available_cpus_gauge = Gauge("available_cpus", "Number of available CPUs")
        available_gpus_gauge = Gauge("available_gpus", "Number of available GPUs")
        available_memory_gauge = Gauge("available_memory", "Available memory")
        available_object_memory_gauge = Gauge("available_object_store", "Available object store")
        # process data
        logger.debug("Begin processing files")
        failures = RayUtils.process_files(
            executors=processors_pool,
            files=files,
            print_interval=print_interval,
            files_in_progress_gauge=files_in_progress_gauge,
            files_completed_gauge=files_completed_gauge,
            available_cpus_gauge=available_cpus_gauge,
            available_gpus_gauge=available_gpus_gauge,
            available_memory_gauge=available_memory_gauge,
            object_memory_gauge=available_object_memory_gauge,
            logger=logger,
        )
        if failures > 0:
            statistics.add_stats.remote({"actor failures": failures})
        logger.debug("Done processing files, waiting for flush() completion.")
        # invoke flush to ensure that all results are returned
        start = time.time()
        replies = [processor.flush.remote() for processor in processors]
        failures = RayUtils.wait_for_execution_completion(logger=logger, replies=replies)
        if failures > 0:
            statistics.add_stats.remote({"actor failures": failures})
        logger.info(f"done flushing in {round(time.time() - start, 3)} sec")
        status = "success"
        return_code = 0
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() returns None and
        # would put the literal "None" into the log message.
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        status = "failure"
        return_code = 1
    try:
        # Compute execution statistics
        logger.debug("Computing execution stats")
        stats = runtime.compute_execution_stats(ray.get(statistics.get_execution_stats.remote()))
        stats["processing_time"] = round(stats["processing_time"], 3)

        # build and save metadata
        logger.debug("Building job metadata")
        metadata = {
            "pipeline": preprocessing_params.pipeline_id,
            "job details": preprocessing_params.job_details
            | {"start_time": start_ts, "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "status": status},
            "code": preprocessing_params.code_location,
            "job_input_params": runtime_config.get_transform_metadata()
            | data_access_factory.get_input_params()
            | preprocessing_params.get_input_params(),
            "execution_stats": resources | {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
            "job_output_stats": stats,
        }
        logger.debug(f"Saving job metadata: {metadata}.")
        data_access.save_job_metadata(metadata)
        logger.debug("Saved job metadata.")
        return return_code
    except Exception as e:
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        return 1
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
+
from ray.actor import ActorHandle
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DefaultRayTransformRuntime:
    """
    Default transform runtime: a pass-through used by the processor to create
    the Transform specific environment. It hands back the parameters and the
    statistics it is given without modification.
    """

    def __init__(self, params: dict[str, Any]):
        """
        Create/config this runtime.
        :param params: parameters, often provided by the CLI arguments as defined
            by a TableTransformConfiguration.
        """
        self.params = params

    def get_transform_config(
        self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
    ) -> dict[str, Any]:
        """
        Return the configuration dictionary handed to the transform's initializer.

        A runtime may use this hook to derive a new configuration from the
        params given to its own initializer, e.g. adding ray shared memory or
        new actors that the transform expects in its initializer and/or
        transform() methods. This default implementation ignores all three
        arguments and returns the stored params as they are.
        :param data_access_factory - data access factory class being used by the RayOrchestrator.
        :param statistics - reference to statistics actor
        :param files - list of files to process
        :return: dictionary of transform init params
        """
        return self.params

    def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
        """
        Hook for runtime-specific additions/modifications to the execution statistics.
        This default implementation returns the given object unchanged.
        :param stats: output of statistics as aggregated across all calls to all transforms.
        :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator.
        """
        return stats
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import ray
|
|
16
|
+
from data_processing.transform import TransformStatistics
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@ray.remote(num_cpus=0.25, scheduling_strategy="SPREAD")
class TransformStatisticsRay(TransformStatistics):
    """
    Basic statistics class collecting basic execution statistics and mirroring
    a fixed set of them into Ray metrics counters.
    It can be extended for specific processors
    """

    def __init__(self, params: dict[str, Any]):
        """
        Create the Ray counters.
        :param params: accepted for interface compatibility; not used here
        """
        from ray.util.metrics import Counter

        super().__init__()
        self.data_write_counter = Counter(name="data_written", description="Total data written bytes")
        self.data_read_counter = Counter(name="data_read", description="Total data read bytes")
        self.source_files_counter = Counter(name="source_files_processed", description="Total source files processed")
        self.result_files_counter = Counter(name="result_files_written", description="Total result files written")
        self.source_documents_counter = Counter(
            name="source_documents_processed", description="Total source document processed"
        )
        self.result_documents_counter = Counter(
            name="result_documents_written", description="Total result documents written"
        )
        self.empty_table_counter = Counter(name="empty_tables", description="Total empty tables read")
        self.failed_read_counter = Counter(name="failed_read_files", description="Total read failed files")
        self.failed_write_counter = Counter(name="failed_write_files", description="Total write failed files")
        self.transform_exceptions_counter = Counter(
            name="transform_exceptions", description="Transform exception occurred"
        )
        self.data_retries_counter = Counter(name="data_access_retries", description="Data access retries")
        # statistics key -> Ray counter that mirrors it
        # NOTE(review): data_write_counter is created above but no statistics key feeds it
        # (presumably a "result_size"-style key was intended) - confirm with the producers of stats.
        self._counters_by_key = {
            "source_files": self.source_files_counter,
            "source_size": self.data_read_counter,
            "result_files": self.result_files_counter,
            "source_doc_count": self.source_documents_counter,
            "result_doc_count": self.result_documents_counter,
            "skipped empty tables": self.empty_table_counter,
            "failed_reads": self.failed_read_counter,
            "failed_writes": self.failed_write_counter,
            "transform execution exception": self.transform_exceptions_counter,
            "data access retries": self.data_retries_counter,
        }

    def add_stats(self, stats: dict[str, Any]) -> None:
        """
        Add statistics
        :param stats - dictionary of increments; every value is added to the
            accumulated statistics, and positive values for known keys are also
            forwarded to the matching Ray counter
        :return: None
        """
        for key, val in stats.items():
            self.stats[key] = self.stats.get(key, 0) + val
            if val > 0:
                counter = self._counters_by_key.get(key)
                if counter is not None:
                    counter.inc(val)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from data_processing.test_support.transform.noop_transform import (
|
|
15
|
+
NOOPTransformConfiguration,
|
|
16
|
+
)
|
|
17
|
+
from data_processing.utils import get_logger
|
|
18
|
+
from data_processing_ray.runtime.ray import (
|
|
19
|
+
RayTransformLauncher,
|
|
20
|
+
RayTransformRuntimeConfiguration,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Logger for this module, obtained from the project's get_logger helper using the module name.
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
    """
    Ray runtime configuration for the NOOP transform, in the form required by
    the RayTransformLauncher. NOOP does not use a RayRuntime class, so the
    superclass is given only the base python-only configuration.
    """

    def __init__(self):
        """
        Wrap a fresh NOOPTransformConfiguration in the Ray runtime configuration.
        """
        noop_config = NOOPTransformConfiguration()
        super().__init__(transform_config=noop_config)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
if __name__ == "__main__":
    # Script entry point: wrap the NOOP runtime configuration in a Ray launcher and run it.
    noop_runtime_config = NOOPRayTransformConfiguration()
    launcher = RayTransformLauncher(noop_runtime_config)
    logger.info("Launching noop transform")
    launcher.launch()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
from data_processing_spark.runtime.spark.transform_runtime import DefaultSparkTransformRuntime
|
|
13
|
+
from data_processing_spark.runtime.spark.execution_configuration import SparkTransformExecutionConfiguration
|
|
14
|
+
from data_processing_spark.runtime.spark.runtime_configuration import SparkTransformRuntimeConfiguration
|
|
15
|
+
from data_processing_spark.runtime.spark.transform_file_processor import SparkTransformFileProcessor
|
|
16
|
+
from data_processing_spark.runtime.spark.transform_orchestrator import orchestrate
|
|
17
|
+
from data_processing_spark.runtime.spark.transform_launcher import SparkTransformLauncher
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from data_processing.runtime import TransformExecutionConfiguration, runtime_cli_prefix
|
|
17
|
+
from data_processing.utils import CLIArgumentProvider, get_logger
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Logger for this module, obtained from the project's get_logger helper using the module name.
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SparkTransformExecutionConfiguration(TransformExecutionConfiguration):
    """
    Declares, captures and reports the Spark orchestrator configuration.
    """

    def __init__(self, name: str):
        """
        Initialization
        :param name: job name, forwarded to the base class
        """
        super().__init__(name=name, print_params=False)
        # -1 is also the CLI default of the parallelization argument
        self.parallelization = -1

    def add_input_params(self, parser: argparse.ArgumentParser) -> None:
        """
        Add the Spark specific parameters, then the base class ones, to the parser.

        The ``parallelization`` parameter determines how many partitions the RDD
        should be divided into. See
        https://sparktpoint.com/how-to-create-rdd-using-parallelize/ for the explanation
        of this parameter.
        If you specify a positive value of the parameter, Spark will attempt to evenly
        distribute the data from seq into that many partitions. For example, if you have
        a collection of 100 elements and you specify numSlices as 4, Spark will try
        to create 4 partitions with approximately 25 elements in each partition.
        If you don't specify this parameter, Spark will use a default value, which is
        typically determined based on the cluster configuration or the available resources
        (number of workers).
        :param parser: parser
        :return: None
        """
        parallelization_flag = f"--{runtime_cli_prefix}parallelization"
        parser.add_argument(parallelization_flag, type=int, default=-1, help="parallelization.")
        return TransformExecutionConfiguration.add_input_params(self, parser=parser)

    def apply_input_params(self, args: argparse.Namespace) -> bool:
        """
        Validate and store the execution parameters
        :param args: user defined arguments
        :return: True, if validate pass or False otherwise
        """
        base_ok = TransformExecutionConfiguration.apply_input_params(self, args=args)
        if not base_ok:
            return False
        cli_values = CLIArgumentProvider.capture_parameters(args, runtime_cli_prefix, False)
        # store parameters locally
        job_id = cli_values["job_id"]
        self.job_details = {
            "job category": "preprocessing",
            "job name": self.name,
            "job type": "spark",
            "job id": job_id,
        }
        self.parallelization = cli_values["parallelization"]
        logger.info(f"job details {self.job_details}")
        logger.info(f"RDD parallelization {self.parallelization}")
        return True

    def get_input_params(self) -> dict[str, Any]:
        """
        get input parameters for job_input_params in metadata
        :return: dictionary of parameters
        """
        input_params = {"RDD parallelization": self.parallelization}
        return input_params
|