data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
- data_processing/data_access/data_access.py +4 -1
- data_processing/data_access/data_access_local.py +0 -11
- data_processing/data_access/data_access_s3.py +0 -11
- data_processing/runtime/pure_python/transform_file_processor.py +9 -3
- data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
- data_processing/runtime/pure_python/transform_runtime.py +9 -1
- data_processing/runtime/transform_file_processor.py +53 -32
- data_processing/test_support/data_access/data_access_factory_test.py +12 -0
- data_processing/test_support/transform/__init__.py +9 -4
- data_processing/test_support/transform/noop_folder_transform.py +105 -0
- data_processing/test_support/transform/noop_transform.py +3 -3
- data_processing/transform/__init__.py +2 -0
- data_processing/transform/abstract_transform.py +16 -0
- data_processing/transform/binary_transform.py +3 -2
- data_processing/transform/folder_transform.py +40 -0
- data_processing/transform/transform_configuration.py +3 -3
- data_processing/utils/multilock.py +160 -0
- data_processing/utils/unrecoverable.py +13 -0
- data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
- data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
- data_processing_ray/runtime/ray/transform_runtime.py +9 -1
- data_processing_ray/test_support/transform/__init__.py +1 -0
- data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
- data_processing_ray/test_support/transform/noop_transform.py +1 -3
- data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
- data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
- data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
- data_processing_spark/runtime/spark/transform_runtime.py +24 -6
- data_processing_spark/test_support/transform/__init__.py +1 -0
- data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from data_processing.data_access import DataAccess
|
|
17
|
+
from data_processing.runtime.pure_python import (
|
|
18
|
+
PythonTransformLauncher,
|
|
19
|
+
PythonTransformRuntimeConfiguration,
|
|
20
|
+
DefaultPythonTransformRuntime)
|
|
21
|
+
from data_processing.transform import AbstractFolderTransform
|
|
22
|
+
from data_processing.utils import get_logger
|
|
23
|
+
from data_processing.test_support.transform import NOOPTransformConfiguration
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
logger = get_logger(__name__)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class NOOPFolderTransform(AbstractFolderTransform):
    """
    No-op folder transform used for testing: every file found in the input folder is
    returned unchanged, paired with the output location computed for it.
    """

    def __init__(self, config: dict[str, Any]):
        """
        Initialize based on the dictionary of configuration information.
        :param config: configuration dictionary. Two keys are read here:
            "sleep_sec" - delay applied after each file (1 when the key is absent);
            "data_access" - object used to list the folder and to build output locations.
        """
        # Make sure that the param name corresponds to the name used in apply_input_params method
        # of NOOPTransformConfiguration class
        super().__init__(config)
        self.sleep = config.get("sleep_sec", 1)
        self.data_access = config.get("data_access")

    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
        """
        Converts input folder into 0 or more output files.
        If there is an error, an exception must be raised - exit()ing is not generally allowed.
        :param folder_name: the name of the folder containing arbitrary amount of files.
        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
                to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
                holding the file name to use.
        """
        logger.debug(f"Transforming one folder {folder_name}")
        metadata = {}
        # get folder files
        files, retries = self.data_access.get_folder_files(path=folder_name)
        if retries > 0:
            metadata["data access retries"] = retries
        result = []
        for name, file in files.items():
            # The content is passed through untouched; only the destination is computed.
            result.append((file, self.data_access.get_output_location(name)))
            if self.sleep is not None:
                # The delay is applied once per file, not once per folder.
                logger.info(f"Sleep for {self.sleep} seconds")
                time.sleep(self.sleep)
                logger.info("Sleep completed - continue")
        # Add some sample metadata.
        metadata["nfiles"] = len(files)
        return result, metadata
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime):
    def get_folders(self, data_access: DataAccess) -> list[str]:
        """
        Report the folders that this run has to process.
        :param data_access: data access from which the input folder is read
        :return: a single-element list holding the input folder of data_access
        """
        input_folder = data_access.get_input_folder()
        return [input_folder]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
    """
    Runtime configuration that lets the PythonTransformLauncher run the NOOP folder transform.
    It reuses NOOPTransformConfiguration, pointed at NOOPFolderTransform, and selects
    NOOPFolderPythonRuntime as the runtime class.
    """

    def __init__(self):
        """
        Build the NOOP transform configuration for the folder transform and hand it,
        together with the folder runtime class, to the superclass.
        """
        folder_transform_config = NOOPTransformConfiguration(clazz=NOOPFolderTransform)
        super().__init__(
            transform_config=folder_transform_config,
            runtime_class=NOOPFolderPythonRuntime,
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
    # Run the NOOP folder transform through the pure python launcher.
    runtime_config = NOOPFolderPythonTransformConfiguration()
    launcher = PythonTransformLauncher(runtime_config)
    logger.info("Launching noop transform")
    launcher.launch()
|
|
@@ -19,7 +19,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher
|
|
|
19
19
|
from data_processing.runtime.pure_python.runtime_configuration import (
|
|
20
20
|
PythonTransformRuntimeConfiguration,
|
|
21
21
|
)
|
|
22
|
-
from data_processing.transform import AbstractTableTransform, TransformConfiguration
|
|
22
|
+
from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform
|
|
23
23
|
from data_processing.utils import CLIArgumentProvider, get_logger
|
|
24
24
|
|
|
25
25
|
|
|
@@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration):
|
|
|
75
75
|
configuration with CLI args.
|
|
76
76
|
"""
|
|
77
77
|
|
|
78
|
-
def __init__(self):
|
|
78
|
+
def __init__(self, clazz: type[AbstractTransform] = NOOPTransform):
|
|
79
79
|
super().__init__(
|
|
80
80
|
name=short_name,
|
|
81
|
-
transform_class=
|
|
81
|
+
transform_class=clazz,
|
|
82
82
|
remove_from_metadata=[pwd_key],
|
|
83
83
|
)
|
|
84
84
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from data_processing.transform.abstract_transform import AbstractTransform
|
|
2
|
+
from data_processing.transform.folder_transform import AbstractFolderTransform
|
|
1
3
|
from data_processing.transform.binary_transform import AbstractBinaryTransform
|
|
2
4
|
from data_processing.transform.table_transform import AbstractTableTransform
|
|
3
5
|
from data_processing.transform.transform_statistics import TransformStatistics
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
class AbstractTransform:
    """
    Base class for all transform types.
    It declares no attributes or methods of its own; it only gives the different kinds of
    transform a common type that annotations and subclass checks can refer to.
    """
|
|
@@ -10,10 +10,11 @@
|
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any
|
|
14
|
+
from data_processing.transform import AbstractTransform
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
class AbstractBinaryTransform:
|
|
17
|
+
class AbstractBinaryTransform(AbstractTransform):
|
|
17
18
|
"""
|
|
18
19
|
Converts input binary file to output file(s) (binary)
|
|
19
20
|
Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
from data_processing.transform import AbstractTransform
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AbstractFolderTransform(AbstractTransform):
    """
    Converts input folder to output file(s) (binary)
    Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or
    more new binary files and metadata.
    """

    def __init__(self, config: dict[str, Any]):
        """
        Initialize based on the dictionary of configuration information.
        This simply stores the given dictionary in this instance for later use.
        :param config: dictionary of configuration information
        """
        self.config = config

    def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
        """
        Converts input folder into 0 or more output files.
        If there is an error, an exception must be raised - exit()ing is not generally allowed.
        :param folder_name: the name of the folder containing arbitrary amount of files.
        :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
                to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
                holding the file name to use.
        :raises NotImplementedError: always in this base class; sub-classes must override this method.
        """
        # NotImplemented is a comparison sentinel and is not callable; the exception type
        # for an un-overridden method is NotImplementedError.
        raise NotImplementedError()
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
from argparse import ArgumentParser
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
|
-
from data_processing.transform import
|
|
16
|
+
from data_processing.transform import AbstractTransform
|
|
17
17
|
from data_processing.utils import CLIArgumentProvider
|
|
18
18
|
|
|
19
19
|
|
|
@@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider):
|
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
25
|
def __init__(
|
|
26
|
-
self, name: str, transform_class: type[
|
|
26
|
+
self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = []
|
|
27
27
|
):
|
|
28
28
|
"""
|
|
29
29
|
Initialization
|
|
@@ -36,7 +36,7 @@ class TransformConfiguration(CLIArgumentProvider):
|
|
|
36
36
|
self.remove_from_metadata = remove_from_metadata
|
|
37
37
|
self.params = {}
|
|
38
38
|
|
|
39
|
-
def get_transform_class(self) -> type[
|
|
39
|
+
def get_transform_class(self) -> type[AbstractTransform]:
|
|
40
40
|
"""
|
|
41
41
|
Get the class extending AbstractBinaryTransform which implements a specific transformation.
|
|
42
42
|
The class will generally be instantiated with a dictionary of configuration produced by
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import abc
|
|
14
|
+
import datetime
|
|
15
|
+
import fcntl
|
|
16
|
+
import os
|
|
17
|
+
import tempfile
|
|
18
|
+
import threading
|
|
19
|
+
import time
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
_tempdir = tempfile.gettempdir()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class MultiLock(abc.ABC):
    """
    Provides a process- and thread-locked lock.
    To use
        lock = MultiLock("mylock")
        ...
        lock.acquire(block=False, timeout=30)
        # do something critical
        ...
        lock.release()

    The cross-process part is an exclusive fcntl lock on a file in the temporary directory;
    the internal threading.Lock only serializes the acquire()/release() calls themselves.
    NOTE(review): acquire() returns True immediately when this instance already holds the
    file lock, so threads sharing one instance are not excluded from each other.
    """

    def __init__(self, name):
        """
        Create the lock with the given name.

        :param name: the global name associated with this lock.  All processes using the same
            name will be part of the same locking cohort.  It is up to the caller to define
            and coordinate lock names.
        :raises ValueError: if name is None or the empty string.
        """
        if name is None or len(name) == 0:
            raise ValueError("lock name must not be None or the empty string")
        self.lock_filename = os.path.join(tempfile.gettempdir(), name + ".multilock")
        # File descriptor of the locked file; None whenever the lock is not held.
        self.fd = None
        self.thread_lock = threading.Lock()

    def acquire(self, block=True, timeout=None):
        """
        With the block argument set to True (the default), the method call will block until the
        lock is in an unlocked state, then set it to locked and return True.

        With the block argument set to False, the method call does not block.  If the lock
        is currently in a locked state, return False; otherwise set the lock to a locked state and return True.

        When invoked with a positive, floating-point value for timeout, wait for at most the number
        of seconds specified by timeout as long as the lock can not be acquired.  Invocations with a
        negative value for timeout are equivalent to a timeout of zero.  Invocations with a timeout
        value of None (the default) set the timeout period to infinite.  The timeout argument has no practical
        implications if the block argument is set to False and is thus ignored.

        Returns True if the lock has been acquired or False if the timeout period has elapsed.

        """
        if self.fd is not None:  # Already locked.
            return True

        # None means "wait forever"; negative values are treated as zero, as documented.
        if timeout is not None:
            timeout = max(0, timeout)
        # A single deadline bounds the thread-lock wait and the file-lock wait together.
        deadline = None if timeout is None else time.monotonic() + timeout

        if block:
            # threading.Lock spells an unbounded wait as timeout=-1.
            thread_timeout = -1 if timeout is None else timeout
            got_thread_lock = self.thread_lock.acquire(blocking=True, timeout=thread_timeout)
        else:
            got_thread_lock = self.thread_lock.acquire(blocking=False)
        if not got_thread_lock:
            return False

        try:
            if self.fd is not None:
                # Another thread took the file lock while this one waited for the thread lock.
                return True
            # open a file and create a file descriptor
            fd = os.open(self.lock_filename, os.O_RDWR | os.O_CREAT)
            keep_fd = False
            try:
                locked = self._wait_for_file_lock(fd, block, deadline)
                if locked:
                    # Record the holder only once the lock is really held, replacing old content.
                    msg = f"MultiLock last held by process with pid={os.getpid()}\n"
                    os.ftruncate(fd, 0)
                    os.write(fd, str.encode(msg))
                    self.fd = fd
                    keep_fd = True
            finally:
                if not keep_fd:
                    # If we didn't get the lock (or failed on the way), then release the file.
                    os.close(fd)
            return locked
        finally:
            # Always hand the thread lock back, including when an exception propagates.
            self.thread_lock.release()

    def _wait_for_file_lock(self, fd, block, deadline):
        """
        Try to put an exclusive lock on the open file.
        :param fd: descriptor of the lock file
        :param block: when False a single attempt is made, without sleeping
        :param deadline: time.monotonic() value after which to give up, or None to retry forever
        :return: True if the file lock was obtained, False otherwise
        """
        sleep_seconds = 1
        while True:
            try:
                fcntl.lockf(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
                return True
            except (BlockingIOError, PermissionError):
                # EAGAIN/EACCES: the lock is held elsewhere. Other OSErrors are real
                # failures and are left to propagate.
                pass
            if not block:
                return False
            if deadline is None:
                time.sleep(sleep_seconds)
                continue
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            time.sleep(min(sleep_seconds, remaining))

    def release(self):
        """
        Release an acquired lock.  Do nothing if the lock is not acquired.
        :return: None
        """
        if self.fd is not None:
            with self.thread_lock:
                if self.fd is not None:  # Retest now that we have the thread lock.
                    # Closing the descriptor is what gives up the file lock.
                    fd, self.fd = self.fd, None
                    os.close(fd)

    def is_locked(self):
        """
        :return: True while this instance holds the lock file open, False otherwise
        """
        return self.fd is not None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def main(block, timeout, sleep):
    """
    Small manual demo: acquire the "foo" MultiLock, hold it for a while and release it,
    printing what happens along the way.
    :param block: passed to MultiLock.acquire() as block
    :param timeout: passed to MultiLock.acquire() as timeout
    :param sleep: number of seconds to hold the lock once acquired
    :return: None
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    lock = MultiLock("foo")
    if block:
        print(f"going to acquire the blocking lock with timeout={timeout}")
    else:
        print(f"going to acquire the non-blocking lock with timeout={timeout}")
    locked = lock.acquire(block=block, timeout=timeout)
    start = datetime.datetime.now().strftime(timestamp_format)
    if not locked:
        print(f"Could not get lock at {start}")
        return
    print(f"{start}: I got the lock")
    try:
        time.sleep(sleep)
    finally:
        # Release even if the sleep is interrupted, so the lock is never left held.
        lock.release()
    end = datetime.datetime.now().strftime(timestamp_format)
    print(f"{end}: lock released")
    print(f"lock held from {start} to {end}")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
if __name__ == "__main__":
    # Demo run: first a blocking acquire, then (one second later) a non-blocking one.
    hold_seconds = 10
    wait_seconds = 10
    main(True, wait_seconds, hold_seconds)
    time.sleep(1)
    main(False, wait_seconds, hold_seconds)
|
|
@@ -1,3 +1,16 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
|
|
1
14
|
class UnrecoverableException(Exception):
|
|
2
15
|
"""
|
|
3
16
|
Raised when a transform wants to cancel overall execution
|
|
@@ -35,6 +35,7 @@ class RayTransformFileProcessor(AbstractTransformFileProcessor):
|
|
|
35
35
|
super().__init__(
|
|
36
36
|
data_access_factory=params.get("data_access_factory", None),
|
|
37
37
|
transform_parameters=dict(params.get("transform_params", {})),
|
|
38
|
+
is_folder=params.get("is_folder", False)
|
|
38
39
|
)
|
|
39
40
|
# Create statistics
|
|
40
41
|
self.stats = params.get("statistics", None)
|
|
@@ -16,6 +16,7 @@ from datetime import datetime
|
|
|
16
16
|
|
|
17
17
|
import ray
|
|
18
18
|
from data_processing.data_access import DataAccessFactoryBase
|
|
19
|
+
from data_processing.transform import AbstractFolderTransform
|
|
19
20
|
from data_processing_ray.runtime.ray import (
|
|
20
21
|
RayTransformExecutionConfiguration,
|
|
21
22
|
RayTransformFileProcessor,
|
|
@@ -56,13 +57,21 @@ def orchestrate(
|
|
|
56
57
|
# create transformer runtime
|
|
57
58
|
runtime = runtime_config.create_transform_runtime()
|
|
58
59
|
resources = RayUtils.get_cluster_resources()
|
|
60
|
+
is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
|
|
59
61
|
try:
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
logger.
|
|
64
|
-
|
|
65
|
-
|
|
62
|
+
if is_folder:
|
|
63
|
+
# folder transform
|
|
64
|
+
files = runtime.get_folders(data_access=data_access)
|
|
65
|
+
logger.info(f"Number of folders is {len(files)}") # Get files to process
|
|
66
|
+
else:
|
|
67
|
+
files, profile, retries = data_access.get_files_to_process()
|
|
68
|
+
if len(files) == 0:
|
|
69
|
+
logger.error("No input files to process - exiting")
|
|
70
|
+
return 0
|
|
71
|
+
# log retries
|
|
72
|
+
if retries > 0:
|
|
73
|
+
statistics.add_stats.remote({"data access retries": retries})
|
|
74
|
+
logger.info(f"Number of files is {len(files)}, source profile {profile}")
|
|
66
75
|
# Print interval
|
|
67
76
|
print_interval = int(len(files) / 100)
|
|
68
77
|
if print_interval == 0:
|
|
@@ -73,9 +82,6 @@ def orchestrate(
|
|
|
73
82
|
logger.info(
|
|
74
83
|
f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each"
|
|
75
84
|
)
|
|
76
|
-
# log retries
|
|
77
|
-
if retries > 0:
|
|
78
|
-
statistics.add_stats.remote({"data access retries": retries})
|
|
79
85
|
# create executors
|
|
80
86
|
processor_params = {
|
|
81
87
|
"data_access_factory": data_access_factory,
|
|
@@ -84,6 +90,7 @@ def orchestrate(
|
|
|
84
90
|
data_access_factory=data_access_factory, statistics=statistics, files=files
|
|
85
91
|
),
|
|
86
92
|
"statistics": statistics,
|
|
93
|
+
"is_folder": is_folder,
|
|
87
94
|
}
|
|
88
95
|
logger.debug("Creating actors")
|
|
89
96
|
processors = RayUtils.create_actors(
|
|
@@ -135,7 +142,8 @@ def orchestrate(
|
|
|
135
142
|
# Compute execution statistics
|
|
136
143
|
logger.debug("Computing execution stats")
|
|
137
144
|
stats = runtime.compute_execution_stats(ray.get(statistics.get_execution_stats.remote()))
|
|
138
|
-
|
|
145
|
+
if "processing_time" in stats:
|
|
146
|
+
stats["processing_time"] = round(stats["processing_time"], 3)
|
|
139
147
|
|
|
140
148
|
# build and save metadata
|
|
141
149
|
logger.debug("Building job metadata")
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
|
|
13
13
|
from typing import Any
|
|
14
14
|
|
|
15
|
-
from data_processing.data_access import DataAccessFactoryBase
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase, DataAccess
|
|
16
16
|
from ray.actor import ActorHandle
|
|
17
17
|
|
|
18
18
|
|
|
@@ -28,6 +28,14 @@ class DefaultRayTransformRuntime:
|
|
|
28
28
|
"""
|
|
29
29
|
self.params = params
|
|
30
30
|
|
|
31
|
+
def get_folders(self, data_access: DataAccess) -> list[str]:
    """
    Get folders to process
    :param data_access: data access
    :return: list of folders to process
    :raises NotImplementedError: always in this default runtime; a runtime that is used
        with a folder transform has to override this method.
    """
    # NotImplemented is a comparison sentinel and is not callable; the exception type
    # for an un-overridden method is NotImplementedError.
    raise NotImplementedError()
|
|
38
|
+
|
|
31
39
|
def get_transform_config(
|
|
32
40
|
self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
|
|
33
41
|
) -> dict[str, Any]:
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration
|
|
15
|
+
from data_processing.utils import get_logger
|
|
16
|
+
from data_processing_ray.runtime.ray import (
|
|
17
|
+
RayTransformLauncher,
|
|
18
|
+
RayTransformRuntimeConfiguration,
|
|
19
|
+
DefaultRayTransformRuntime
|
|
20
|
+
)
|
|
21
|
+
from data_processing.data_access import DataAccess
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
logger = get_logger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class NOOPFolderRayRuntime(DefaultRayTransformRuntime):
    def get_folders(self, data_access: DataAccess) -> list[str]:
        """
        Report the folders that this run has to process.
        :param data_access: data access from which the input folder is read
        :return: a single-element list holding the input folder of data_access
        """
        input_folder = data_access.get_input_folder()
        return [input_folder]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class NOOPFolderRayTransformConfiguration(RayTransformRuntimeConfiguration):
    """
    Runtime configuration that lets the RayTransformLauncher run the NOOP folder transform.
    It reuses NOOPTransformConfiguration, pointed at NOOPFolderTransform, and selects
    NOOPFolderRayRuntime as the runtime class.
    """

    def __init__(self):
        """
        Build the NOOP transform configuration for the folder transform and hand it,
        together with the folder runtime class, to the superclass.
        """
        folder_transform_config = NOOPTransformConfiguration(clazz=NOOPFolderTransform)
        super().__init__(
            transform_config=folder_transform_config,
            runtime_class=NOOPFolderRayRuntime,
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
    # Run the NOOP folder transform through the Ray launcher.
    runtime_config = NOOPFolderRayTransformConfiguration()
    launcher = RayTransformLauncher(runtime_config)
    logger.info("Launching noop transform")
    launcher.launch()
|
|
@@ -11,9 +11,7 @@
|
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
from data_processing.test_support.transform
|
|
15
|
-
NOOPTransformConfiguration,
|
|
16
|
-
)
|
|
14
|
+
from data_processing.test_support.transform import NOOPTransformConfiguration
|
|
17
15
|
from data_processing.utils import get_logger
|
|
18
16
|
from data_processing_ray.runtime.ray import (
|
|
19
17
|
RayTransformLauncher,
|
|
@@ -10,6 +10,9 @@
|
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
13
16
|
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
17
|
from data_processing.transform import TransformConfiguration
|
|
15
18
|
from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime
|
|
@@ -29,6 +32,16 @@ class SparkTransformRuntimeConfiguration(TransformRuntimeConfiguration):
|
|
|
29
32
|
super().__init__(transform_config=transform_config)
|
|
30
33
|
self.runtime_class = runtime_class
|
|
31
34
|
|
|
35
|
+
def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
    """
    Hook for retrieving very large configuration parameters (for example the list of
    document IDs to remove for fuzzy dedup, or the list of blocked web domains for block
    listing) so that they can be broadcast to all the workers. The spark runtime calls it
    after spark initialization and before spark_context.parallelize().
    This default implementation has nothing to broadcast.
    :param data_access_factory: creates the data_access object used to download the large
        config parameter
    :return: dictionary of parameters to broadcast; empty here
    """
    nothing_to_broadcast: dict[str, Any] = {}
    return nothing_to_broadcast
|
|
44
|
+
|
|
32
45
|
def create_transform_runtime(self) -> DefaultSparkTransformRuntime:
|
|
33
46
|
"""
|
|
34
47
|
Create transform runtime with the parameters captured during apply_input_params()
|
|
@@ -29,12 +29,15 @@ class SparkTransformFileProcessor(AbstractTransformFileProcessor):
|
|
|
29
29
|
data_access_factory: DataAccessFactoryBase,
|
|
30
30
|
runtime_configuration: SparkTransformRuntimeConfiguration,
|
|
31
31
|
statistics: TransformStatistics,
|
|
32
|
+
is_folder: bool,
|
|
32
33
|
):
|
|
33
34
|
"""
|
|
34
35
|
Init method
|
|
35
36
|
"""
|
|
36
37
|
super().__init__(
|
|
37
|
-
data_access_factory=data_access_factory,
|
|
38
|
+
data_access_factory=data_access_factory,
|
|
39
|
+
transform_parameters=runtime_configuration.get_transform_params(),
|
|
40
|
+
is_folder=is_folder,
|
|
38
41
|
)
|
|
39
42
|
# Add data access ant statistics to the processor parameters
|
|
40
43
|
self.runtime_configuration = runtime_configuration
|