data-prep-toolkit 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. data_prep_toolkit-0.2.2.dev1.dist-info/METADATA +33 -0
  2. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/RECORD +26 -5
  3. data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt +3 -0
  4. data_processing/runtime/pure_python/transform_orchestrator.py +24 -2
  5. data_processing_ray/runtime/ray/__init__.py +9 -0
  6. data_processing_ray/runtime/ray/execution_configuration.py +110 -0
  7. data_processing_ray/runtime/ray/ray_utils.py +246 -0
  8. data_processing_ray/runtime/ray/runtime_configuration.py +37 -0
  9. data_processing_ray/runtime/ray/transform_file_processor.py +53 -0
  10. data_processing_ray/runtime/ray/transform_invoker.py +103 -0
  11. data_processing_ray/runtime/ray/transform_launcher.py +117 -0
  12. data_processing_ray/runtime/ray/transform_orchestrator.py +159 -0
  13. data_processing_ray/runtime/ray/transform_runtime.py +53 -0
  14. data_processing_ray/runtime/ray/transform_statistics.py +76 -0
  15. data_processing_ray/test_support/transform/__init__.py +1 -0
  16. data_processing_ray/test_support/transform/noop_transform.py +45 -0
  17. data_processing_spark/runtime/spark/__init__.py +17 -0
  18. data_processing_spark/runtime/spark/execution_configuration.py +85 -0
  19. data_processing_spark/runtime/spark/runtime_configuration.py +37 -0
  20. data_processing_spark/runtime/spark/transform_file_processor.py +65 -0
  21. data_processing_spark/runtime/spark/transform_launcher.py +64 -0
  22. data_processing_spark/runtime/spark/transform_orchestrator.py +158 -0
  23. data_processing_spark/runtime/spark/transform_runtime.py +52 -0
  24. data_processing_spark/test_support/transform/__init__.py +13 -0
  25. data_processing_spark/test_support/transform/noop_transform.py +42 -0
  26. data_prep_toolkit-0.2.2.dev0.dist-info/METADATA +0 -56
  27. data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt +0 -1
  28. {data_prep_toolkit-0.2.2.dev0.dist-info → data_prep_toolkit-0.2.2.dev1.dist-info}/WHEEL +0 -0
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit
3
+ Version: 0.2.2.dev1
4
+ Summary: Data Preparation Toolkit Library for Ray and Python
5
+ Author-email: Maroun Touma <touma@us.ibm.com>
6
+ License: Apache-2.0
7
+ Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
8
+ Requires-Python: <3.13,>=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: numpy<1.29.0
11
+ Requires-Dist: pyarrow==16.1.0
12
+ Requires-Dist: boto3==1.34.69
13
+ Requires-Dist: argparse
14
+ Requires-Dist: mmh3
15
+ Requires-Dist: psutil
16
+ Provides-Extra: dev
17
+ Requires-Dist: twine; extra == "dev"
18
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
19
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
20
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
21
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
22
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
23
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
24
+ Requires-Dist: moto==5.0.5; extra == "dev"
25
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
26
+ Provides-Extra: ray
27
+ Requires-Dist: ray[default]==2.36.1; extra == "ray"
28
+ Requires-Dist: fastapi>=0.110.2; extra == "ray"
29
+ Requires-Dist: pillow>=10.3.0; extra == "ray"
30
+ Provides-Extra: spark
31
+ Requires-Dist: pyspark>=3.5.2; extra == "spark"
32
+ Requires-Dist: psutil>=6.0.0; extra == "spark"
33
+
@@ -18,7 +18,7 @@ data_processing/runtime/pure_python/runtime_configuration.py,sha256=a4vSY98HfRm2
18
18
  data_processing/runtime/pure_python/transform_file_processor.py,sha256=PYWNUSeb6i6q6Ov7nE0jXQfHIhp1u9adArEU3mQ7B24,4394
19
19
  data_processing/runtime/pure_python/transform_invoker.py,sha256=lAG7tfyJyNqtwRB15-db4HJOQsBhT6JahLmjUFQFCRk,5192
20
20
  data_processing/runtime/pure_python/transform_launcher.py,sha256=BDctJnYlR9OVzGCzMwg2cEuGdnV3E9fvhUgoyslvK8k,2447
21
- data_processing/runtime/pure_python/transform_orchestrator.py,sha256=1ChVBTdyM6SwWVyx1HuSqWXRU8UywAQWMiQWaF43sD8,8834
21
+ data_processing/runtime/pure_python/transform_orchestrator.py,sha256=OIQzOL0jT-3ahT7aDs6suySkoEhmvM_T4C_qMDt0JSQ,9468
22
22
  data_processing/runtime/pure_python/transform_runtime.py,sha256=pWvuGJGAB6M798LJU3FZBG6l35VQCuhsh-SyzSf9ok0,2558
23
23
  data_processing/test_support/__init__.py,sha256=O4lySih15vkOYUSa3uhTaoYw0RrV4rM_sUd691JEuVU,83
24
24
  data_processing/test_support/abstract_test.py,sha256=gZ51wnWITEAyb8BzA2WFCM0quJBxQrlD7WBwUfIsWEA,12875
@@ -45,7 +45,28 @@ data_processing/utils/transform_configuration.json,sha256=6YBw0Hk2mokY6JBn1kR6L9
45
45
  data_processing/utils/transform_configurator.py,sha256=9OHSCQ8rFSoDdMW6ZCHYdNe6thRwV9zOaRPnLkWNMYE,3601
46
46
  data_processing/utils/transform_utils.py,sha256=KGNioN35B1i1h-MIsfm3QvXLlU1aGXimheva7NbUhMM,8496
47
47
  data_processing/utils/unrecoverable.py,sha256=svNdVzQaArnf8GdLvB2nP9miv7kYe3bDfFRW--SWvbU,171
48
- data_prep_toolkit-0.2.2.dev0.dist-info/METADATA,sha256=6WIKrryg4jGjhscucT4P-Jhr8mxP0L8sQz6nywPPaRk,1900
49
- data_prep_toolkit-0.2.2.dev0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
50
- data_prep_toolkit-0.2.2.dev0.dist-info/top_level.txt,sha256=OZPYA4BEseUcpIjvbiPpE3Hdcz5cVWwO1Kji5oed2A4,16
51
- data_prep_toolkit-0.2.2.dev0.dist-info/RECORD,,
48
+ data_processing_ray/runtime/ray/__init__.py,sha256=vjQOvb_OJNq3c1F_tG3WjO-pciY77Z1lETO2Ha_GVbw,784
49
+ data_processing_ray/runtime/ray/execution_configuration.py,sha256=C9YFixlATr7PPpkVQ0WzjCCPTWFuP80W2rnzY1bbp5I,4628
50
+ data_processing_ray/runtime/ray/ray_utils.py,sha256=eDPm-pybPOELjKkvoz3l-qFU-k1Iwh-giGlXULiZjEk,10212
51
+ data_processing_ray/runtime/ray/runtime_configuration.py,sha256=js9dXwdxjYbSigMC49F07XmbLjmj9HiipPE6BDaIGfA,1691
52
+ data_processing_ray/runtime/ray/transform_file_processor.py,sha256=eR814VvfmPOlvyv_FU7eyt0HRqIzkkvACURxJCK-xrM,2335
53
+ data_processing_ray/runtime/ray/transform_invoker.py,sha256=apfH8uilpm9sJ4IpHgiNdIzcH_IHGQba5fui4cCfolk,4026
54
+ data_processing_ray/runtime/ray/transform_launcher.py,sha256=oxI3MFZI_-LzTwHbrHBIUqJ0htnliKBuALt86qijRwU,4304
55
+ data_processing_ray/runtime/ray/transform_orchestrator.py,sha256=FZl7NM0eU1SxOcavZm4lru3laCswACPB1rjk3KK3FtY,7102
56
+ data_processing_ray/runtime/ray/transform_runtime.py,sha256=0-b5syOW9zNnZxmMHDdwPo_pvoqDBiM5dHCgSakZhGQ,2531
57
+ data_processing_ray/runtime/ray/transform_statistics.py,sha256=cxrSQVnzRBCGS68IoiVGLoRBWBxPBSFFMDiT29FNt0g,3749
58
+ data_processing_ray/test_support/transform/__init__.py,sha256=CKk-J3aEwH7OgDardyUEbLjlWaZWLUBs93PdukT4Rbc,100
59
+ data_processing_ray/test_support/transform/noop_transform.py,sha256=ZTx09M9vNOaqrVzeuT2VmWM-IF4Upip0g0EtbHaOn-0,1588
60
+ data_processing_spark/runtime/spark/__init__.py,sha256=bhY1xI9lL0GR2v1APahlhC5sh5rdVcGhQbWN4yoXApw,1233
61
+ data_processing_spark/runtime/spark/execution_configuration.py,sha256=BqxUlpXFdHRK-csO2jaJJtktyKbcMtjIn3sjAPBfO58,3643
62
+ data_processing_spark/runtime/spark/runtime_configuration.py,sha256=uABzBvzzFZ5HA_lGYEBFRd1qViMj5sbpKtCSLy64riM,1705
63
+ data_processing_spark/runtime/spark/transform_file_processor.py,sha256=sdDBZZyqCqyKaJmEqZh1QzIqCDkLQzqV0dcAI5TRBjo,2611
64
+ data_processing_spark/runtime/spark/transform_launcher.py,sha256=1PZ-N4Wy2Qqiqr2z9S1xV88cNsAoHrmmuPadiOakJLM,2479
65
+ data_processing_spark/runtime/spark/transform_orchestrator.py,sha256=HadnLNx_icy2n7CXOwqLiUA7vjV-gOvajxE0AQU3_NM,6645
66
+ data_processing_spark/runtime/spark/transform_runtime.py,sha256=IKChGY1uGxFlAqZaL-XeSv_J3BMm3nev9MAs0NTT8og,2506
67
+ data_processing_spark/test_support/transform/__init__.py,sha256=v58HbP2x9KF8MG8SOGWjodrTjU57KXlL0aPPB7z8KQQ,755
68
+ data_processing_spark/test_support/transform/noop_transform.py,sha256=0FR3o-LnXf-UFS5gU0j-i4LVlw1mHDxGaPI40dkkIKY,1694
69
+ data_prep_toolkit-0.2.2.dev1.dist-info/METADATA,sha256=QiZEK2qc8or6csEZk_weYltxiDRFvDb0chVpwLMCMrU,1235
70
+ data_prep_toolkit-0.2.2.dev1.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
71
+ data_prep_toolkit-0.2.2.dev1.dist-info/top_level.txt,sha256=XGMDmY55_pe5KeRWvO0un9a640e2v99tzbBBtjNybPM,58
72
+ data_prep_toolkit-0.2.2.dev1.dist-info/RECORD,,
@@ -0,0 +1,3 @@
1
+ data_processing
2
+ data_processing_ray
3
+ data_processing_spark
@@ -9,9 +9,10 @@
9
9
  # See the License for the specific language governing permissions and
10
10
  # limitations under the License.
11
11
  ################################################################################
12
-
12
+ import os
13
13
  import time
14
14
  import traceback
15
+ import psutil
15
16
  from datetime import datetime
16
17
  from multiprocessing import Pool
17
18
  from typing import Any
@@ -24,12 +25,31 @@ from data_processing.runtime.pure_python import (
24
25
  PythonTransformRuntimeConfiguration,
25
26
  )
26
27
  from data_processing.transform import AbstractBinaryTransform, TransformStatistics
27
- from data_processing.utils import get_logger
28
+ from data_processing.utils import GB, get_logger
28
29
 
29
30
 
30
31
  logger = get_logger(__name__)
31
32
 
32
33
 
34
def _execution_resources() -> dict[str, Any]:
    """
    Get a snapshot of the resources used on the local machine
    :return: dictionary with the following keys:
        cpus: 15 minute load average expressed as a percentage of the CPU count
        gpus: always 0
        memory: used virtual memory divided by the GB constant
        object_store: always 0
    """
    # psutil.getloadavg() returns the 1, 5 and 15 minute load averages; only the last one is reported
    _, _, load15 = psutil.getloadavg()
    # os.cpu_count() returns None when the number of CPUs cannot be determined
    n_cpus = os.cpu_count() or 1
    # Getting memory used
    mused = round(psutil.virtual_memory().used / GB, 2)
    return {
        "cpus": round((load15 / n_cpus) * 100, 1),
        "gpus": 0,
        "memory": mused,
        "object_store": 0,
    }
50
+
51
+
52
+
33
53
  def orchestrate(
34
54
  data_access_factory: DataAccessFactoryBase,
35
55
  runtime_config: PythonTransformRuntimeConfiguration,
@@ -43,6 +63,7 @@ def orchestrate(
43
63
  :return: 0 - success or 1 - failure
44
64
  """
45
65
  start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
66
+ start_time = time.time()
46
67
  logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
47
68
  # create statistics
48
69
  statistics = TransformStatistics()
@@ -118,6 +139,7 @@ def orchestrate(
118
139
  "job_input_params": input_params
119
140
  | data_access_factory.get_input_params()
120
141
  | execution_config.get_input_params(),
142
+ "execution_stats": _execution_resources() | {"execution time, min": round((time.time() - start_time) / 60.0, 3)},
121
143
  "job_output_stats": stats,
122
144
  }
123
145
  logger.debug(f"Saving job metadata: {metadata}.")
@@ -0,0 +1,9 @@
1
+ from data_processing_ray.runtime.ray.ray_utils import RayUtils
2
+ from data_processing_ray.runtime.ray.transform_statistics import TransformStatisticsRay
3
+ from data_processing_ray.runtime.ray.transform_runtime import DefaultRayTransformRuntime
4
+ from data_processing_ray.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
5
+ from data_processing_ray.runtime.ray.transform_file_processor import RayTransformFileProcessor
6
+ from data_processing_ray.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
7
+ from data_processing_ray.runtime.ray.transform_orchestrator import orchestrate
8
+ from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher
9
+ from data_processing_ray.runtime.ray.transform_invoker import execute_ray_transform
@@ -0,0 +1,110 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import argparse
14
+ import ast
15
+ from typing import Any
16
+
17
+ from data_processing.runtime import TransformExecutionConfiguration
18
+ from data_processing.utils import CLIArgumentProvider, ParamsUtils, get_logger
19
+
20
+
21
+ logger = get_logger(__name__)
22
+
23
+
24
+ cli_prefix = "runtime_"
25
+
26
+
27
class RayTransformExecutionConfiguration(TransformExecutionConfiguration):
    """
    Definition and validation of the Ray orchestrator configuration
    (number of workers, worker options and actor creation delay)
    """

    def __init__(self, name: str):
        """
        Initialization
        :param name: name handed over to the base configuration class
        """
        super().__init__(name=name, print_params=False)
        # defaults, overwritten by apply_input_params()
        self.n_workers = 1
        self.worker_options = {}
        self.creation_delay = 0

    def add_input_params(self, parser: argparse.ArgumentParser) -> None:
        """
        Register the Ray execution parameters (all prefixed with cli_prefix) with the parser
        and then let the base class register its own parameters
        :param parser: parser
        :return: whatever the base class add_input_params returns
        """
        # examples rendered into the help text of the worker_options argument
        worker_options_examples = {
            "num_cpus": ["8", "Required number of CPUs."],
            "num_gpus": ["1", "Required number of GPUs"],
            "resources": [
                '{"special_hardware": 1, "custom_label": 1}',
                """The complete list can be found at
                https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html#ray.remote_function.RemoteFunction.options
                and contains accelerator_type, memory, name, num_cpus, num_gpus, object_store_memory, placement_group,
                placement_group_bundle_index, placement_group_capture_child_tasks, resources, runtime_env,
                scheduling_strategy, _metadata, concurrency_groups, lifetime, max_concurrency, max_restarts,
                max_task_retries, max_pending_calls, namespace, get_if_exists""",
            ],
        }

        parser.add_argument(f"--{cli_prefix}num_workers", type=int, default=1, help="number of workers")
        # the value (including the string default) is converted by ast.literal_eval
        parser.add_argument(
            f"--{cli_prefix}worker_options",
            type=ast.literal_eval,
            default="{'num_cpus': 0.8}",
            help="AST string defining worker resource requirements.\n"
            + ParamsUtils.get_ast_help_text(worker_options_examples),
        )
        parser.add_argument(f"--{cli_prefix}creation_delay", type=int, default=0, help="delay between actor' creation")
        return TransformExecutionConfiguration.add_input_params(self, parser=parser)

    def apply_input_params(self, args: argparse.Namespace) -> bool:
        """
        Validate the base class parameters and store the Ray specific ones on this object
        :param args: user defined arguments
        :return: False if the base class validation fails, True otherwise
        """
        base_ok = TransformExecutionConfiguration.apply_input_params(self, args=args)
        if not base_ok:
            return False
        cli_values = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
        # keep the captured values on the instance
        self.worker_options = cli_values["worker_options"]
        self.n_workers = cli_values["num_workers"]
        self.creation_delay = cli_values["creation_delay"]
        self.job_details = {
            "job category": "preprocessing",
            "job name": self.name,
            "job type": "ray",
            "job id": cli_values["job_id"],
        }
        # for fault tolerance, default the actor max_restarts when the user did not define it
        self.worker_options.setdefault("max_restarts", -1)

        # report the effective configuration
        logger.info(f"number of workers {self.n_workers} worker options {self.worker_options}")
        logger.info(f"actor creation delay {self.creation_delay}")
        logger.info(f"job details {self.job_details}")
        return True

    def get_input_params(self) -> dict[str, Any]:
        """
        Get the Ray execution parameters to be stored as job_input_params in metadata
        :return: dictionary of parameters
        """
        input_params = {
            "number of workers": self.n_workers,
            "worker options": self.worker_options,
            "actor creation delay": self.creation_delay,
        }
        return input_params
@@ -0,0 +1,246 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import logging
14
+ import time
15
+ from typing import Any
16
+
17
+ import ray
18
+ from ray.experimental.state.api import list_actors
19
+ from data_processing.utils import GB, UnrecoverableException
20
+ from ray.actor import ActorHandle
21
+ from ray.exceptions import RayError
22
+ from ray.util.actor_pool import ActorPool
23
+
24
+
25
class RayUtils:
    """
    Class implementing support methods for Ray execution
    """

    from ray.util.metrics import Gauge

    @staticmethod
    def get_available_resources(
        available_cpus_gauge: Gauge = None,
        available_gpus_gauge: Gauge = None,
        available_memory_gauge: Gauge = None,
        object_memory_gauge: Gauge = None,
    ) -> dict[str, Any]:
        """
        Get currently available cluster resources and optionally publish them to the given gauges
        :param available_cpus_gauge: ray Gauge to report available CPU
        :param available_gpus_gauge: ray Gauge to report available GPU
        :param available_memory_gauge: ray Gauge to report available memory
        :param object_memory_gauge: ray Gauge to report available object memory
        :return: a dict of currently available resources with the keys
            cpus, gpus, memory and object_store (the last two divided by the GB constant)
        """
        resources = ray.available_resources()
        available = {
            "cpus": int(resources.get("CPU", 0.0)),
            "gpus": int(resources.get("GPU", 0.0)),
            "memory": resources.get("memory", 0.0) / GB,
            "object_store": resources.get("object_store_memory", 0.0) / GB,
        }
        # publish every value for which a gauge was supplied
        gauges = (
            (available_cpus_gauge, "cpus"),
            (available_gpus_gauge, "gpus"),
            (available_memory_gauge, "memory"),
            (object_memory_gauge, "object_store"),
        )
        for gauge, key in gauges:
            if gauge is not None:
                gauge.set(available[key])
        return available

    @staticmethod
    def get_cluster_resources() -> dict[str, Any]:
        """
        Get cluster resources
        :return: a dict of cluster resources with the keys
            cpus, gpus, memory and object_store (the last two divided by the GB constant)
        """
        resources = ray.cluster_resources()
        return {
            "cpus": int(resources.get("CPU", 0.0)),
            "gpus": int(resources.get("GPU", 0.0)),
            "memory": resources.get("memory", 0.0) / GB,
            "object_store": resources.get("object_store_memory", 0.0) / GB,
        }

    @staticmethod
    def get_available_nodes(available_nodes_gauge: Gauge = None) -> int:
        """
        Count the alive Ray nodes and optionally publish the count to a gauge
        :param available_nodes_gauge: the gauge used to publish number of available nodes
        :return: number of available nodes
        """
        # get nodes from Ray and keep only the ones marked as alive
        nnodes = sum(1 for node in ray.nodes() if node["Alive"])
        if available_nodes_gauge is not None:
            available_nodes_gauge.set(nnodes)
        return nnodes

    @staticmethod
    def create_actors(
        clazz: type, params: dict[str, Any], actor_options: dict[str, Any], n_actors: int, creation_delay: int = 0
    ) -> list[ActorHandle]:
        """
        Create a set of actors and wait until all of them are reported alive
        :param clazz: actor class, has to be annotated as remote
        :param params: actor init params
        :param actor_options: dictionary of actor options.
            see https://docs.ray.io/en/latest/ray-core/api/doc/ray.actor.ActorClass.options.html
        :param n_actors: number of actors
        :param creation_delay: delay (in seconds) applied before each actor's creation
        :return: a list of actor handles
        :raises UnrecoverableException: if not all of the created actors are alive after 120 checks
        """

        def operator() -> ActorHandle:
            time.sleep(creation_delay)
            return clazz.options(**actor_options).remote(params)

        # the name of a remote class's type is wrapped as 'ActorClass(<name>)' - strip the wrapper
        cls_name = clazz.__class__.__name__.replace("ActorClass(", "").replace(")", "")
        actors = [operator() for _ in range(n_actors)]
        # poll once a second until the number of alive actors of this class matches the number created
        for _ in range(120):
            time.sleep(1)
            alive = list_actors(filters=[("class_name", "=", cls_name), ("state", "=", "ALIVE")])
            if len(actors) == len(alive):
                return actors
        # failed - raise an exception
        print(f"created {actors}, alive {alive}")
        raise UnrecoverableException(f"out of {len(actors)} created actors only {len(alive)} alive")

    @staticmethod
    def process_files(
        executors: ActorPool,
        files: list[str],
        print_interval: int,
        files_in_progress_gauge: Gauge,
        files_completed_gauge: Gauge,
        available_cpus_gauge: Gauge,
        available_gpus_gauge: Gauge,
        available_memory_gauge: Gauge,
        object_memory_gauge: Gauge,
        logger: logging.Logger,
    ) -> int:
        """
        Process files: submit every file to the actor pool and wait for all requests to complete
        :param executors: actor pool of executors
        :param files: list of files to process
        :param print_interval: print interval (number of completed files between progress messages);
            a value of 0 or less disables the progress messages
        :param files_in_progress_gauge: ray Gauge to report files in process
        :param files_completed_gauge: ray Gauge to report completed files
        :param available_cpus_gauge: ray Gauge to report available CPU
        :param available_gpus_gauge: ray Gauge to report available GPU
        :param available_memory_gauge: ray Gauge to report available memory
        :param object_memory_gauge: ray Gauge to report available object memory
        :param logger: logger
        :return: number of actors failures
        :raises UnrecoverableException: if a RayError is received while waiting for a free actor
        """

        def report_resources() -> None:
            # publish the currently available cluster resources to the gauges
            RayUtils.get_available_resources(
                available_cpus_gauge=available_cpus_gauge,
                available_gpus_gauge=available_gpus_gauge,
                available_memory_gauge=available_memory_gauge,
                object_memory_gauge=object_memory_gauge,
            )

        logger.debug("Begin processing files")
        actor_failures = 0
        running = 0
        completed = 0
        report_resources()
        t_start = time.time()
        for path in files:
            if executors.has_free():  # still have room
                executors.submit(lambda a, v: a.process_file.remote(v), path)
                running += 1
                files_in_progress_gauge.set(running)
                continue
            # all actors are busy - wait for one of the outstanding requests before submitting
            try:
                executors.get_next_unordered()
            except RayError as e:
                # Ray exception - terminate
                logger.error(f"Got Ray worker exception {e}, terminating")
                raise UnrecoverableException from e
            except Exception as e:
                # the failed request is still counted as completed (exactly once) below
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
            executors.submit(lambda a, v: a.process_file.remote(v), path)
            completed += 1
            files_completed_gauge.set(completed)
            report_resources()
            # guard against a zero interval, which would otherwise raise ZeroDivisionError
            if print_interval > 0 and completed % print_interval == 0:
                logger.info(f"Completed {completed} files in {round((time.time() - t_start)/60., 3)} min")
        files_completed_gauge.set(completed)
        # an empty file list must not fail on the percentage computation
        percent_done = round(100 * completed / len(files), 3) if len(files) > 0 else 0.0
        logger.info(
            f"Completed {completed} files ({percent_done}%) "
            f"in {round((time.time() - t_start)/60., 3)} min. Waiting for completion"
        )
        # Wait for completion. Every iteration consumes exactly one outstanding request, whether it
        # succeeded or failed, so the loop terminates even when the last request raises.
        while executors.has_next():
            try:
                executors.get_next_unordered()
            except Exception as e:
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
            running -= 1
            completed += 1
            files_in_progress_gauge.set(running)
            files_completed_gauge.set(completed)
            report_resources()

        logger.info(f"Completed processing {completed} files in {round((time.time() - t_start)/60, 3)} min")
        return actor_failures

    @staticmethod
    def wait_for_execution_completion(logger: logging.Logger, replies: list[ray.ObjectRef]) -> int:
        """
        Wait for all requests completed
        :param logger: logger to use
        :param replies: list of request futures
        :return: number of failures encountered while waiting
        """
        actor_failures = 0
        while replies:
            # Wait for replies
            try:
                _, not_ready = ray.wait(replies)
            except Exception as e:
                logger.error(f"Failed to process request worker exception {e}")
                actor_failures += 1
                # drop one request so that the loop always makes progress
                not_ready = replies[1:]
            replies = not_ready
        return actor_failures
@@ -0,0 +1,37 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.runtime import TransformRuntimeConfiguration
14
+ from data_processing.transform import TransformConfiguration
15
+ from data_processing_ray.runtime.ray import DefaultRayTransformRuntime
16
+
17
+
18
class RayTransformRuntimeConfiguration(TransformRuntimeConfiguration):
    """
    Runtime configuration for Ray: a transform configuration plus the class
    used to build the transform runtime
    """

    def __init__(
        self,
        transform_config: TransformConfiguration,
        runtime_class: type[DefaultRayTransformRuntime] = DefaultRayTransformRuntime,
    ):
        """
        Initialization
        :param transform_config: base configuration class
        :param runtime_class: implementation of the transform runtime
        """
        super().__init__(transform_config=transform_config)
        # remembered here, instantiated by create_transform_runtime()
        self.runtime_class = runtime_class

    def create_transform_runtime(self) -> DefaultRayTransformRuntime:
        """
        Instantiate the runtime class with the transform parameters of the transform configuration
        :return: transform runtime object
        """
        transform_params = self.transform_config.get_transform_params()
        return self.runtime_class(transform_params)
@@ -0,0 +1,53 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ import ray
16
+ from data_processing.runtime import AbstractTransformFileProcessor
17
+ from data_processing.utils import UnrecoverableException
18
+
19
+
20
@ray.remote(scheduling_strategy="SPREAD")
class RayTransformFileProcessor(AbstractTransformFileProcessor):
    """
    Ray actor doing the actual processing of a single file
    """

    def __init__(self, params: dict[str, Any]):
        """
        Init method
        :param params: dictionary that has the following key
            data_access_factory: data access factory
            transform_class: local transform class
            transform_params: dictionary of parameters for local transform creation
            statistics: object reference to statistics
        :raises UnrecoverableException: if statistics is missing or the transform can not be created
        """
        factory = params.get("data_access_factory", None)
        # copy, so that the caller's dictionary is not the one handed to the base class
        transform_parameters = dict(params.get("transform_params", {}))
        super().__init__(
            data_access_factory=factory,
            transform_parameters=transform_parameters,
        )
        # Statistics are mandatory
        self.stats = params.get("statistics", None)
        if self.stats is None:
            self.logger.error("Transform file processor: statistics is not specified")
            raise UnrecoverableException("statistics is None")
        # make statistics available to the transform
        self.transform_params["statistics"] = self.stats
        # Build the local transform; any failure here (including a missing class) is unrecoverable
        transform_class = params.get("transform_class", None)
        try:
            self.transform = transform_class(self.transform_params)
        except Exception as e:
            self.logger.error(f"Exception creating transform {e}")
            raise UnrecoverableException("failed creating transform")

    def _publish_stats(self, stats: dict[str, Any]) -> None:
        """
        Send statistics to the statistics object via a remote add_stats call
        :param stats: dictionary of statistics to publish
        """
        self.stats.add_stats.remote(stats)