data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_prep_toolkit-0.2.2.dev1.dist-info/METADATA +33 -0
- {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/RECORD +26 -5
- data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt +3 -0
- data_processing/runtime/pure_python/transform_orchestrator.py +24 -2
- data_processing_ray/runtime/ray/__init__.py +9 -0
- data_processing_ray/runtime/ray/execution_configuration.py +110 -0
- data_processing_ray/runtime/ray/ray_utils.py +246 -0
- data_processing_ray/runtime/ray/runtime_configuration.py +37 -0
- data_processing_ray/runtime/ray/transform_file_processor.py +53 -0
- data_processing_ray/runtime/ray/transform_invoker.py +103 -0
- data_processing_ray/runtime/ray/transform_launcher.py +117 -0
- data_processing_ray/runtime/ray/transform_orchestrator.py +159 -0
- data_processing_ray/runtime/ray/transform_runtime.py +53 -0
- data_processing_ray/runtime/ray/transform_statistics.py +76 -0
- data_processing_ray/test_support/transform/__init__.py +1 -0
- data_processing_ray/test_support/transform/noop_transform.py +45 -0
- data_processing_spark/runtime/spark/__init__.py +17 -0
- data_processing_spark/runtime/spark/execution_configuration.py +85 -0
- data_processing_spark/runtime/spark/runtime_configuration.py +37 -0
- data_processing_spark/runtime/spark/transform_file_processor.py +65 -0
- data_processing_spark/runtime/spark/transform_launcher.py +64 -0
- data_processing_spark/runtime/spark/transform_orchestrator.py +158 -0
- data_processing_spark/runtime/spark/transform_runtime.py +52 -0
- data_processing_spark/test_support/transform/__init__.py +13 -0
- data_processing_spark/test_support/transform/noop_transform.py +42 -0
- data_prep_toolkit-0.2.2.dev0.dist-info/METADATA +0 -56
- data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt +0 -1
- {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: data_prep_toolkit
|
|
3
|
+
Version: 0.2.2.dev1
|
|
4
|
+
Summary: Data Preparation Toolkit Library for Ray and Python
|
|
5
|
+
Author-email: Maroun Touma <touma@us.ibm.com>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
|
|
8
|
+
Requires-Python: <3.13,>=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: numpy<1.29.0
|
|
11
|
+
Requires-Dist: pyarrow==16.1.0
|
|
12
|
+
Requires-Dist: boto3==1.34.69
|
|
13
|
+
Requires-Dist: argparse
|
|
14
|
+
Requires-Dist: mmh3
|
|
15
|
+
Requires-Dist: psutil
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: twine; extra == "dev"
|
|
18
|
+
Requires-Dist: pytest>=7.3.2; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-env>=1.0.0; extra == "dev"
|
|
21
|
+
Requires-Dist: pre-commit>=3.3.2; extra == "dev"
|
|
22
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
|
|
24
|
+
Requires-Dist: moto==5.0.5; extra == "dev"
|
|
25
|
+
Requires-Dist: markupsafe==2.0.1; extra == "dev"
|
|
26
|
+
Provides-Extra: ray
|
|
27
|
+
Requires-Dist: ray[default]==2.36.1; extra == "ray"
|
|
28
|
+
Requires-Dist: fastapi>=0.110.2; extra == "ray"
|
|
29
|
+
Requires-Dist: pillow>=10.3.0; extra == "ray"
|
|
30
|
+
Provides-Extra: spark
|
|
31
|
+
Requires-Dist: pyspark>=3.5.2; extra == "spark"
|
|
32
|
+
Requires-Dist: psutil>=6.0.0; extra == "spark"
|
|
33
|
+
|
|
@@ -18,7 +18,7 @@ data_processing/runtime/pure_python/runtime_configuration.py,sha256=a4vSY98HfRm2
|
|
|
18
18
|
data_processing/runtime/pure_python/transform_file_processor.py,sha256=PYWNUSeb6i6q6Ov7nE0jXQfHIhp1u9adArEU3mQ7B24,4394
|
|
19
19
|
data_processing/runtime/pure_python/transform_invoker.py,sha256=lAG7tfyJyNqtwRB15-db4HJOQsBhT6JahLmjUFQFCRk,5192
|
|
20
20
|
data_processing/runtime/pure_python/transform_launcher.py,sha256=BDctJnYlR9OVzGCzMwg2cEuGdnV3E9fvhUgoyslvK8k,2447
|
|
21
|
-
data_processing/runtime/pure_python/transform_orchestrator.py,sha256=
|
|
21
|
+
data_processing/runtime/pure_python/transform_orchestrator.py,sha256=OIQzOL0jT-3ahT7aDs6suySkoEhmvM_T4C_qMDt0JSQ,9468
|
|
22
22
|
data_processing/runtime/pure_python/transform_runtime.py,sha256=pWvuGJGAB6M798LJU3FZBG6l35VQCuhsh-SyzSf9ok0,2558
|
|
23
23
|
data_processing/test_support/__init__.py,sha256=O4lySih15vkOYUSa3uhTaoYw0RrV4rM_sUd691JEuVU,83
|
|
24
24
|
data_processing/test_support/abstract_test.py,sha256=gZ51wnWITEAyb8BzA2WFCM0quJBxQrlD7WBwUfIsWEA,12875
|
|
@@ -45,7 +45,28 @@ data_processing/utils/transform_configuration.json,sha256=6YBw0Hk2mokY6JBn1kR6L9
|
|
|
45
45
|
data_processing/utils/transform_configurator.py,sha256=9OHSCQ8rFSoDdMW6ZCHYdNe6thRwV9zOaRPnLkWNMYE,3601
|
|
46
46
|
data_processing/utils/transform_utils.py,sha256=KGNioN35B1i1h-MIsfm3QvXLlU1aGXimheva7NbUhMM,8496
|
|
47
47
|
data_processing/utils/unrecoverable.py,sha256=svNdVzQaArnf8GdLvB2nP9miv7kYe3bDfFRW--SWvbU,171
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
48
|
+
data_processing_ray/runtime/ray/__init__.py,sha256=vjQOvb_OJNq3c1F_tG3WjO-pciY77Z1lETO2Ha_GVbw,784
|
|
49
|
+
data_processing_ray/runtime/ray/execution_configuration.py,sha256=C9YFixlATr7PPpkVQ0WzjCCPTWFuP80W2rnzY1bbp5I,4628
|
|
50
|
+
data_processing_ray/runtime/ray/ray_utils.py,sha256=eDPm-pybPOELjKkvoz3l-qFU-k1Iwh-giGlXULiZjEk,10212
|
|
51
|
+
data_processing_ray/runtime/ray/runtime_configuration.py,sha256=js9dXwdxjYbSigMC49F07XmbLjmj9HiipPE6BDaIGfA,1691
|
|
52
|
+
data_processing_ray/runtime/ray/transform_file_processor.py,sha256=eR814VvfmPOlvyv_FU7eyt0HRqIzkkvACURxJCK-xrM,2335
|
|
53
|
+
data_processing_ray/runtime/ray/transform_invoker.py,sha256=apfH8uilpm9sJ4IpHgiNdIzcH_IHGQba5fui4cCfolk,4026
|
|
54
|
+
data_processing_ray/runtime/ray/transform_launcher.py,sha256=oxI3MFZI_-LzTwHbrHBIUqJ0htnliKBuALt86qijRwU,4304
|
|
55
|
+
data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=FZl7NM0eU1SxOcavZm4lru3laCswACPB1rjk3KK3FtY,7102
|
|
56
|
+
data_processing_ray/runtime/ray/transform_runtime.py,sha256=0-b5syOW9zNnZxmMHDdwPo_pvoqDBiM5dHCgSakZhGQ,2531
|
|
57
|
+
data_processing_ray/runtime/ray/transform_statistics.py,sha256=cxrSQVnzRBCGS68IoiVGLoRBWBxPBSFFMDiT29FNt0g,3749
|
|
58
|
+
data_processing_ray/test_support/transform/__init__.py,sha256=CKk-J3aEwH7OgDardyUEbLjlWaZWLUBs93PdukT4Rbc,100
|
|
59
|
+
data_processing_ray/test_support/transform/noop_transform.py,sha256=ZTx09M9vNOaqrVzeuT2VmWM-IF4Upip0g0EtbHaOn-0,1588
|
|
60
|
+
data_processing_spark/runtime/spark/__init__.py,sha256=bhY1xI9lL0GR2v1APahlhC5sh5rdVcGhQbWN4yoXApw,1233
|
|
61
|
+
data_processing_spark/runtime/spark/execution_configuration.py,sha256=BqxUlpXFdHRK-csO2jaJJtktyKbcMtjIn3sjAPBfO58,3643
|
|
62
|
+
data_processing_spark/runtime/spark/runtime_configuration.py,sha256=uABzBvzzFZ5HA_lGYEBFRd1qViMj5sbpKtCSLy64riM,1705
|
|
63
|
+
data_processing_spark/runtime/spark/transform_file_processor.py,sha256=sdDBZZyqCqyKaJmEqZh1QzIqCDkLQzqV0dcAI5TRBjo,2611
|
|
64
|
+
data_processing_spark/runtime/spark/transform_launcher.py,sha256=1PZ-N4Wy2Qqiqr2z9S1xV88cNsAoHrmmuPadiOakJLM,2479
|
|
65
|
+
data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=HadnLNx_icy2n7CXOwqLiUA7vjV-gOvajxE0AQU3_NM,6645
|
|
66
|
+
data_processing_spark/runtime/spark/transform_runtime.py,sha256=IKChGY1uGxFlAqZaL-XeSv_J3BMm3nev9MAs0NTT8og,2506
|
|
67
|
+
data_processing_spark/test_support/transform/__init__.py,sha256=v58HbP2x9KF8MG8SOGWjodrTjU57KXlL0aPPB7z8KQQ,755
|
|
68
|
+
data_processing_spark/test_support/transform/noop_transform.py,sha256=0FR3o-LnXf-UFS5gU0j-i4LVlw1mHDxGaPI40dkkIKY,1694
|
|
69
|
+
data_prep_toolkit-0.2.2.dev1.dist-info/METADATA,sha256=QiZEK2qc8or6csEZk_weYltxiDRFvDb0chVpwLMCMrU,1235
|
|
70
|
+
data_prep_toolkit-0.2.2.dev1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
71
|
+
data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
|
|
72
|
+
data_prep_toolkit-0.2.2.dev1.dist-info/RECORD,,
|
|
@@ -9,9 +9,10 @@
|
|
|
9
9
|
# See the License for the specific language governing permissions and
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
|
-
|
|
12
|
+
import os
|
|
13
13
|
import time
|
|
14
14
|
import traceback
|
|
15
|
+
import psutil
|
|
15
16
|
from datetime import datetime
|
|
16
17
|
from multiprocessing import Pool
|
|
17
18
|
from typing import Any
|
|
@@ -24,12 +25,31 @@ from data_processing.runtime.pure_python import (
|
|
|
24
25
|
PythonTransformRuntimeConfiguration,
|
|
25
26
|
)
|
|
26
27
|
from data_processing.transform import AbstractBinaryTransform, TransformStatistics
|
|
27
|
-
from data_processing.utils import get_logger
|
|
28
|
+
from data_processing.utils import GB, get_logger
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
logger = get_logger(__name__)
|
|
31
32
|
|
|
32
33
|
|
|
34
|
+
def _execution_resources() -> dict[str, Any]:
    """
    Get the execution resources currently used on this machine.

    This is a module level function, so it is not decorated with @staticmethod
    (the decorator only makes sense inside a class body and, before
    Python 3.10, produced an object that could not be called directly).

    :return: dictionary (not a tuple) with the following keys:
        "cpus" - 15 minutes load average expressed as a percentage of the CPU count
        "gpus" - always 0
        "memory" - used virtual memory divided by the GB constant, rounded to 2 digits
        "object_store" - always 0
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to 1 so that the division below cannot fail
    n_cpus = os.cpu_count() or 1
    # psutil reports the 1, 5 and 15 minutes load averages; only the last one is used
    _, _, load15 = psutil.getloadavg()
    # used memory ("used" is the field at index 3 of the psutil result)
    mused = round(psutil.virtual_memory().used / GB, 2)
    return {
        "cpus": round((load15 / n_cpus) * 100, 1),
        "gpus": 0,
        "memory": mused,
        "object_store": 0,
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
|
|
33
53
|
def orchestrate(
|
|
34
54
|
data_access_factory: DataAccessFactoryBase,
|
|
35
55
|
runtime_config: PythonTransformRuntimeConfiguration,
|
|
@@ -43,6 +63,7 @@ def orchestrate(
|
|
|
43
63
|
:return: 0 - success or 1 - failure
|
|
44
64
|
"""
|
|
45
65
|
start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
66
|
+
start_time = time.time()
|
|
46
67
|
logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
|
|
47
68
|
# create statistics
|
|
48
69
|
statistics = TransformStatistics()
|
|
@@ -118,6 +139,7 @@ def orchestrate(
|
|
|
118
139
|
"job_input_params": input_params
|
|
119
140
|
| data_access_factory.get_input_params()
|
|
120
141
|
| execution_config.get_input_params(),
|
|
142
|
+
"execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
|
|
121
143
|
"job_output_stats": stats,
|
|
122
144
|
}
|
|
123
145
|
logger.debug(f"Saving job metadata: {metadata}.")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
from data_processing_ray.runtime.ray.ray_utils import RayUtils
|
|
2
|
+
from data_processing_ray.runtime.ray.transform_statistics import TransformStatisticsRay
|
|
3
|
+
from data_processing_ray.runtime.ray.transform_runtime import DefaultRayTransformRuntime
|
|
4
|
+
from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
|
|
5
|
+
from data_processing_ray.runtime.ray.transform_file_processor import RayTransformFileProcessor
|
|
6
|
+
from data_processing_ray.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
|
|
7
|
+
from data_processing_ray.runtime.ray.transform_orchestrator import orchestrate
|
|
8
|
+
from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher
|
|
9
|
+
from data_processing_ray.runtime.ray.transform_invoker import execute_ray_transform
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import ast
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from data_processing.runtime import TransformExecutionConfiguration
|
|
18
|
+
from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
logger = get_logger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
cli_prefix = "runtime_"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class RayTransformExecutionConfiguration(TransformExecutionConfiguration):
    """
    Ray flavour of the transform execution configuration. It declares the Ray
    orchestrator command line parameters, captures their values and reports
    them for the job metadata
    """

    def __init__(self, name: str):
        """
        Set up the configuration with its default values
        :param name: name handed over to the base execution configuration
        """
        super().__init__(name=name, print_params=False)
        self.worker_options = {}
        self.n_workers = 1
        self.creation_delay = 0

    def add_input_params(self, parser: argparse.ArgumentParser) -> None:
        """
        Register the Ray specific command line parameters (number of workers,
        worker options and actor creation delay) and then the parameters of
        the base execution configuration
        :param parser: parser to add the arguments to
        :return: whatever the base class implementation returns
        """
        parser.add_argument(f"--{cli_prefix}num_workers", type=int, default=1, help="number of workers")

        # NOTE(review): the continuation lines of the "resources" description are
        # reproduced without leading whitespace, exactly as they are visible here -
        # confirm against the original file.
        worker_options_examples = {
            "num_cpus": ["8", "Required number of CPUs."],
            "num_gpus": ["1", "Required number of GPUs"],
            "resources": [
                '{"special_hardware": 1, "custom_label": 1}',
                """The complete list can be found at
https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html#ray.remote_function.RemoteFunction.options
and contains accelerator_type, memory, name, num_cpus, num_gpus, object_store_memory, placement_group,
placement_group_bundle_index, placement_group_capture_child_tasks, resources, runtime_env,
scheduling_strategy, _metadata, concurrency_groups, lifetime, max_concurrency, max_restarts,
max_task_retries, max_pending_calls, namespace, get_if_exists""",
            ],
        }
        worker_options_help = "AST string defining worker resource requirements.\n" + ParamsUtils.get_ast_help_text(
            worker_options_examples
        )
        # the default is a string on purpose: argparse runs string defaults through "type"
        parser.add_argument(
            f"--{cli_prefix}worker_options",
            type=ast.literal_eval,
            default="{'num_cpus': 0.8}",
            help=worker_options_help,
        )
        parser.add_argument(f"--{cli_prefix}creation_delay", type=int, default=0, help="delay between actor' creation")
        return TransformExecutionConfiguration.add_input_params(self, parser=parser)

    def apply_input_params(self, args: argparse.Namespace) -> bool:
        """
        Capture and store the values of the Ray specific parameters
        :param args: user defined arguments
        :return: False if the base class rejects the arguments, True otherwise
        """
        base_accepted = TransformExecutionConfiguration.apply_input_params(self, args=args)
        if not base_accepted:
            return False

        runtime_args = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
        # keep the captured values on the instance
        self.worker_options = runtime_args["worker_options"]
        self.n_workers = runtime_args["num_workers"]
        self.creation_delay = runtime_args["creation_delay"]
        self.job_details = {
            "job category": "preprocessing",
            "job name": self.name,
            "job type": "ray",
            "job id": runtime_args["job_id"],
        }
        # an actor max_restarts value supplied by the user wins; otherwise it is
        # set to -1 here for fault tolerance
        if "max_restarts" not in self.worker_options:
            self.worker_options["max_restarts"] = -1

        # report what was captured
        summary = (
            f"number of workers {self.n_workers} worker options {self.worker_options}",
            f"actor creation delay {self.creation_delay}",
            f"job details {self.job_details}",
        )
        for line in summary:
            logger.info(line)
        return True

    def get_input_params(self) -> dict[str, Any]:
        """
        Build the Ray part of job_input_params for the job metadata
        :return: dictionary holding number of workers, worker options and actor creation delay
        """
        ray_params = {
            "number of workers": self.n_workers,
            "worker options": self.worker_options,
            "actor creation delay": self.creation_delay,
        }
        return ray_params
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import time
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import ray
|
|
18
|
+
from ray.experimental.state.api import list_actors
|
|
19
|
+
from data_processing.utils import GB, UnrecoverableException
|
|
20
|
+
from ray.actor import ActorHandle
|
|
21
|
+
from ray.exceptions import RayError
|
|
22
|
+
from ray.util.actor_pool import ActorPool
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RayUtils:
    """
    Class implementing support methods for Ray execution
    """

    from ray.util.metrics import Gauge

    @staticmethod
    def get_available_resources(
        available_cpus_gauge: Gauge = None,
        available_gpus_gauge: Gauge = None,
        available_memory_gauge: Gauge = None,
        object_memory_gauge: Gauge = None,
    ) -> dict[str, Any]:
        """
        Get currently available cluster resources and optionally publish them
        :param available_cpus_gauge: ray Gauge to report available CPU
        :param available_gpus_gauge: ray Gauge to report available GPU
        :param available_memory_gauge: ray Gauge to report available memory
        :param object_memory_gauge: ray Gauge to report available object memory
        :return: a dict of currently available resources with the keys
            "cpus", "gpus", "memory" and "object_store" (memory values are divided by GB)
        """
        resources = ray.available_resources()
        # compute every value once and use it for both the gauges and the result
        available = {
            "cpus": int(resources.get("CPU", 0.0)),
            "gpus": int(resources.get("GPU", 0.0)),
            "memory": resources.get("memory", 0.0) / GB,
            "object_store": resources.get("object_store_memory", 0.0) / GB,
        }
        if available_cpus_gauge is not None:
            available_cpus_gauge.set(available["cpus"])
        if available_gpus_gauge is not None:
            available_gpus_gauge.set(available["gpus"])
        if available_memory_gauge is not None:
            available_memory_gauge.set(available["memory"])
        if object_memory_gauge is not None:
            object_memory_gauge.set(available["object_store"])
        return available

    @staticmethod
    def get_cluster_resources() -> dict[str, Any]:
        """
        Get cluster resources
        :return: a dict of cluster resources with the keys
            "cpus", "gpus", "memory" and "object_store" (memory values are divided by GB)
        """
        resources = ray.cluster_resources()
        return {
            "cpus": int(resources.get("CPU", 0.0)),
            "gpus": int(resources.get("GPU", 0.0)),
            "memory": resources.get("memory", 0.0) / GB,
            "object_store": resources.get("object_store_memory", 0.0) / GB,
        }

    @staticmethod
    def get_available_nodes(available_nodes_gauge: Gauge = None) -> int:
        """
        Get the number of the alive Ray nodes and optionally expose it to prometheus
        :param available_nodes_gauge: the gauge used to publish number of available nodes
        :return: number of available nodes
        """
        # get nodes from Ray and count the alive ones
        nnodes = sum(1 for node in ray.nodes() if node["Alive"])
        # publish the value - the gauge used to be accepted but silently ignored
        if available_nodes_gauge is not None:
            available_nodes_gauge.set(nnodes)
        return nnodes

    @staticmethod
    def create_actors(
        clazz: type, params: dict[str, Any], actor_options: dict[str, Any], n_actors: int, creation_delay: int = 0
    ) -> list[ActorHandle]:
        """
        Create a set of actors and wait (up to 120 one second polls) until they are alive
        :param clazz: actor class, has to be annotated as remote
        :param params: actor init params
        :param actor_options: dictionary of actor options.
            see https://docs.ray.io/en/latest/ray-core/api/doc/ray.actor.ActorClass.options.html
        :param n_actors: number of actors
        :param creation_delay: delay between actor's creations
        :return: a list of actor handles
        :raises UnrecoverableException: if the number of alive actors never matches
            the number of created ones
        """
        if n_actors <= 0:
            # nothing to create, so there is nothing to wait for either
            return []

        def operator() -> ActorHandle:
            time.sleep(creation_delay)
            return clazz.options(**actor_options).remote(params)

        # the name of the wrapper class looks like "ActorClass(<name>)" - strip the wrapper
        cls_name = clazz.__class__.__name__.replace("ActorClass(", "").replace(")", "")
        actors = [operator() for _ in range(n_actors)]
        alive = []
        for _ in range(120):
            time.sleep(1)
            # list_actors returns at most "limit" entries (100 by default per the Ray
            # State API documentation), so without an explicit limit more than 100
            # actors could never be confirmed
            alive = list_actors(
                filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")],
                limit=n_actors,
            )
            if len(actors) == len(alive):
                return actors
        # failed - raise an exception
        print(f"created {actors}, alive {alive}")
        raise UnrecoverableException(f"out of {len(actors)} created actors only {len(alive)} alive")

    @staticmethod
    def process_files(
        executors: ActorPool,
        files: list[str],
        print_interval: int,
        files_in_progress_gauge: Gauge,
        files_completed_gauge: Gauge,
        available_cpus_gauge: Gauge,
        available_gpus_gauge: Gauge,
        available_memory_gauge: Gauge,
        object_memory_gauge: Gauge,
        logger: logging.Logger,
    ) -> int:
        """
        Process files: submit every file to the actor pool, then wait for all
        the outstanding requests
        :param executors: actor pool of executors
        :param files: list of files to process
        :param print_interval: print interval (progress is not printed if it is not positive)
        :param files_in_progress_gauge: ray Gauge to report files in process
        :param files_completed_gauge: ray Gauge to report completed files
        :param available_cpus_gauge: ray Gauge to report available CPU
        :param available_gpus_gauge: ray Gauge to report available GPU
        :param available_memory_gauge: ray Gauge to report available memory
        :param object_memory_gauge: ray Gauge to report available object memory
        :param logger: logger
        :return: number of actors failures
        :raises UnrecoverableException: if a Ray error is received while submitting files
        """

        def report_resources() -> None:
            # refresh the resource gauges
            RayUtils.get_available_resources(
                available_cpus_gauge=available_cpus_gauge,
                available_gpus_gauge=available_gpus_gauge,
                available_memory_gauge=available_memory_gauge,
                object_memory_gauge=object_memory_gauge,
            )

        logger.debug("Begin processing files")
        actor_failures = 0
        report_resources()
        running = 0
        t_start = time.time()
        completed = 0
        for path in files:
            if executors.has_free():  # still have room
                executors.submit(lambda a, v: a.process_file.remote(v), path)
                running += 1
                files_in_progress_gauge.set(running)
                continue
            # all the actors are busy - wait for one request to finish first
            try:
                executors.get_next_unordered()
            except RayError as e:
                # Ray exception - terminate
                logger.error(f"Got Ray worker exception {e}, terminating")
                raise UnrecoverableException from e
            except Exception as e:
                # the failed request is counted once, by the common code below
                # (it used to be counted both here and below)
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
            executors.submit(lambda a, v: a.process_file.remote(v), path)

            completed += 1
            files_completed_gauge.set(completed)
            report_resources()
            # guard against a zero print interval (modulo by zero)
            if print_interval > 0 and completed % print_interval == 0:
                logger.info(f"Completed {completed} files in {round((time.time() - t_start)/60., 3)} min")
        # Wait for completion
        files_completed_gauge.set(completed)
        # an empty list of files is reported as fully completed instead of dividing by zero
        n_files = len(files)
        percent = round(100 * completed / n_files, 3) if n_files > 0 else 100.0
        logger.info(
            f"Completed {completed} files ({percent}%) "
            f"in {round((time.time() - t_start)/60., 3)} min. Waiting for completion"
        )
        while executors.has_next():
            # Every pending request is taken exactly once, whether it succeeded or not.
            # Retrying inside an inner endless loop could spin forever: once the pool is
            # drained the error raised by get_next_unordered was caught again and again.
            try:
                executors.get_next_unordered()
            except Exception as e:
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
            running -= 1
            completed += 1
            files_in_progress_gauge.set(running)
            files_completed_gauge.set(completed)
            report_resources()

        logger.info(f"Completed processing {completed} files in {round((time.time() - t_start)/60, 3)} min")
        return actor_failures

    @staticmethod
    def wait_for_execution_completion(logger: logging.Logger, replies: list[ray.ObjectRef]) -> int:
        """
        Wait for all requests completed
        :param logger: logger to use
        :param replies: list of request futures
        :return: number of failures encountered while waiting
        """
        actor_failures = 0
        while replies:
            # Wait for replies
            try:
                _, not_ready = ray.wait(replies)
            except Exception as e:
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
                # Give up on one reply so that the loop is guaranteed to end.
                # The former "replies - 1" subtracted an int from a list (TypeError).
                not_ready = replies[1:]
            replies = not_ready
        return actor_failures
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
|
+
from data_processing.transform import TransformConfiguration
|
|
15
|
+
from data_processing_ray.runtime.ray import DefaultRayTransformRuntime
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RayTransformRuntimeConfiguration(TransformRuntimeConfiguration):
    """
    Runtime configuration that pairs a transform configuration with the class
    of the Ray transform runtime to instantiate for it
    """

    def __init__(
        self,
        transform_config: TransformConfiguration,
        runtime_class: type[DefaultRayTransformRuntime] = DefaultRayTransformRuntime,
    ):
        """
        Store the runtime class on top of what the base class initializes
        :param transform_config: base configuration class
        :param runtime_class: implementation of the transform runtime
        """
        super().__init__(transform_config=transform_config)
        self.runtime_class = runtime_class

    def create_transform_runtime(self) -> DefaultRayTransformRuntime:
        """
        Instantiate the configured runtime class with the transform parameters
        taken from the transform configuration
        :return: transform runtime object
        """
        runtime_factory = self.runtime_class
        transform_params = self.transform_config.get_transform_params()
        return runtime_factory(transform_params)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import ray
|
|
16
|
+
from data_processing.runtime import AbstractTransformFileProcessor
|
|
17
|
+
from data_processing.utils import UnrecoverableException
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@ray.remote(scheduling_strategy="SPREAD")
class RayTransformFileProcessor(AbstractTransformFileProcessor):
    """
    Ray actor doing the actual work: processing of a single file
    """

    def __init__(self, params: dict[str, Any]):
        """
        Build the actor from a dictionary of parameters
        :param params: dictionary with the following keys
            data_access_factory: data access factory
            transform_class: local transform class
            transform_params: dictionary of parameters for local transform creation
            statistics: object reference to statistics
        :raises UnrecoverableException: if statistics is missing or the transform
            can not be created
        """
        factory = params.get("data_access_factory", None)
        # work on a copy of the transform parameters
        transform_parameters = dict(params.get("transform_params", {}))
        super().__init__(data_access_factory=factory, transform_parameters=transform_parameters)

        # statistics is mandatory
        # (self.logger and self.transform_params are presumably set up by the base class - confirm)
        statistics = params.get("statistics", None)
        self.stats = statistics
        if statistics is None:
            self.logger.error("Transform file processor: statistics is not specified")
            raise UnrecoverableException("statistics is None")
        self.transform_params["statistics"] = statistics

        # instantiate the local transform; any failure here is fatal
        try:
            transform_class = params.get("transform_class", None)
            self.transform = transform_class(self.transform_params)
        except Exception as e:
            self.logger.error(f"Exception creating transform {e}")
            raise UnrecoverableException("failed creating transform")

    def _publish_stats(self, stats: dict[str, Any]) -> None:
        """
        Forward statistics to the statistics object through a remote call
        :param stats: dictionary of statistics to add
        """
        self.stats.add_stats.remote(stats)
|