data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
  2. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
  3. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
  4. data_processing/data_access/data_access.py +4 -1
  5. data_processing/data_access/data_access_local.py +0 -11
  6. data_processing/data_access/data_access_s3.py +0 -11
  7. data_processing/runtime/pure_python/transform_file_processor.py +9 -3
  8. data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
  9. data_processing/runtime/pure_python/transform_runtime.py +9 -1
  10. data_processing/runtime/transform_file_processor.py +53 -32
  11. data_processing/test_support/data_access/data_access_factory_test.py +12 -0
  12. data_processing/test_support/transform/__init__.py +9 -4
  13. data_processing/test_support/transform/noop_folder_transform.py +105 -0
  14. data_processing/test_support/transform/noop_transform.py +3 -3
  15. data_processing/transform/__init__.py +2 -0
  16. data_processing/transform/abstract_transform.py +16 -0
  17. data_processing/transform/binary_transform.py +3 -2
  18. data_processing/transform/folder_transform.py +40 -0
  19. data_processing/transform/transform_configuration.py +3 -3
  20. data_processing/utils/multilock.py +160 -0
  21. data_processing/utils/unrecoverable.py +13 -0
  22. data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
  23. data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
  24. data_processing_ray/runtime/ray/transform_runtime.py +9 -1
  25. data_processing_ray/test_support/transform/__init__.py +1 -0
  26. data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
  27. data_processing_ray/test_support/transform/noop_transform.py +1 -3
  28. data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
  29. data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
  30. data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
  31. data_processing_spark/runtime/spark/transform_runtime.py +24 -6
  32. data_processing_spark/test_support/transform/__init__.py +1 -0
  33. data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
  34. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,105 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import time
14
+ from typing import Any
15
+
16
+ from data_processing.data_access import DataAccess
17
+ from data_processing.runtime.pure_python import (
18
+ PythonTransformLauncher,
19
+ PythonTransformRuntimeConfiguration,
20
+ DefaultPythonTransformRuntime)
21
+ from data_processing.transform import AbstractFolderTransform
22
+ from data_processing.utils import get_logger
23
+ from data_processing.test_support.transform import NOOPTransformConfiguration
24
+
25
+
26
+ logger = get_logger(__name__)
27
+
28
+
29
+ class NOOPFolderTransform(AbstractFolderTransform):
30
+ """
31
+ Implements a simple copy of a pyarrow Table.
32
+ """
33
+
34
+ def __init__(self, config: dict[str, Any]):
35
+ """
36
+ Initialize based on the dictionary of configuration information.
37
+ This is generally called with configuration parsed from the CLI arguments defined
38
+ by the companion runtime, NOOPTransformRuntime. If running inside the RayMutatingDriver,
39
+ these will be provided by that class with help from the RayMutatingDriver.
40
+ """
41
+ # Make sure that the param name corresponds to the name used in apply_input_params method
42
+ # of NOOPTransformConfiguration class
43
+ super().__init__(config)
44
+ self.sleep = config.get("sleep_sec", 1)
45
+ self.data_access = config.get("data_access")
46
+
47
+ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
48
+ """
49
+ Converts input folder into 0 or more output files.
50
+ If there is an error, an exception must be raised - exit()ing is not generally allowed.
51
+ :param folder_name: the name of the folder containing arbitrary amount of files.
52
+ :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
53
+ to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
54
+ holding the file name to use.
55
+ """
56
+ logger.debug(f"Transforming one folder {folder_name}")
57
+ metadata = {}
58
+ # get folder files
59
+ files, retries = self.data_access.get_folder_files(path=folder_name)
60
+ if retries > 0:
61
+ metadata |= {"data access retries": retries}
62
+ result = [()] * len(files)
63
+ index = 0
64
+ for name, file in files.items():
65
+ result[index] = (file, self.data_access.get_output_location(name))
66
+ if self.sleep is not None:
67
+ logger.info(f"Sleep for {self.sleep} seconds")
68
+ time.sleep(self.sleep)
69
+ logger.info("Sleep completed - continue")
70
+ index += 1
71
+ # Add some sample metadata.
72
+ metadata |= {"nfiles": len(files)}
73
+ return result, metadata
74
+
75
+
76
+ class NOOPFolderPythonRuntime(DefaultPythonTransformRuntime):
77
+ def get_folders(self, data_access: DataAccess) -> list[str]:
78
+ """
79
+ Get folders to process
80
+ :param data_access: data access
81
+ :return: list of folders to process
82
+ """
83
+ return [data_access.get_input_folder()]
84
+
85
+
86
+ class NOOPFolderPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
87
+ """
88
+ Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher.
89
+ NOOP does not use a RayRuntime class so the superclass only needs the base
90
+ python-only configuration.
91
+ """
92
+
93
+ def __init__(self):
94
+ """
95
+ Initialization
96
+ """
97
+ super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform),
98
+ runtime_class=NOOPFolderPythonRuntime)
99
+
100
+
101
+ if __name__ == "__main__":
102
+ # launcher = NOOPRayLauncher()
103
+ launcher = PythonTransformLauncher(NOOPFolderPythonTransformConfiguration())
104
+ logger.info("Launching noop transform")
105
+ launcher.launch()
@@ -19,7 +19,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher
19
19
  from data_processing.runtime.pure_python.runtime_configuration import (
20
20
  PythonTransformRuntimeConfiguration,
21
21
  )
22
- from data_processing.transform import AbstractTableTransform, TransformConfiguration
22
+ from data_processing.transform import AbstractTableTransform, TransformConfiguration, AbstractTransform
23
23
  from data_processing.utils import CLIArgumentProvider, get_logger
24
24
 
25
25
 
@@ -75,10 +75,10 @@ class NOOPTransformConfiguration(TransformConfiguration):
75
75
  configuration with CLI args.
76
76
  """
77
77
 
78
- def __init__(self):
78
+ def __init__(self, clazz: type[AbstractTransform] = NOOPTransform):
79
79
  super().__init__(
80
80
  name=short_name,
81
- transform_class=NOOPTransform,
81
+ transform_class=clazz,
82
82
  remove_from_metadata=[pwd_key],
83
83
  )
84
84
 
@@ -1,3 +1,5 @@
1
+ from data_processing.transform.abstract_transform import AbstractTransform
2
+ from data_processing.transform.folder_transform import AbstractFolderTransform
1
3
  from data_processing.transform.binary_transform import AbstractBinaryTransform
2
4
  from data_processing.transform.table_transform import AbstractTableTransform
3
5
  from data_processing.transform.transform_statistics import TransformStatistics
@@ -0,0 +1,16 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ class AbstractTransform:
14
+ """
15
+ Base class for all transform types
16
+ """
@@ -10,10 +10,11 @@
10
10
  # limitations under the License.
11
11
  ################################################################################
12
12
 
13
- from typing import Any, TypeVar
13
+ from typing import Any
14
+ from data_processing.transform import AbstractTransform
14
15
 
15
16
 
16
- class AbstractBinaryTransform:
17
+ class AbstractBinaryTransform(AbstractTransform):
17
18
  """
18
19
  Converts input binary file to output file(s) (binary)
19
20
  Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or
@@ -0,0 +1,40 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+ from data_processing.transform import AbstractTransform
15
+
16
+
17
+ class AbstractFolderTransform(AbstractTransform):
18
+ """
19
+ Converts input folder to output file(s) (binary)
20
+ Sub-classes must provide the transform() method to provide the conversion of a folder to 0 or
21
+ more new binary files and metadata.
22
+ """
23
+
24
+ def __init__(self, config: dict[str, Any]):
25
+ """
26
+ Initialize based on the dictionary of configuration information.
27
+ This simply stores the given instance in this instance for later use.
28
+ """
29
+ self.config = config
30
+
31
+ def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
32
+ """
33
+ Converts input folder into 0 or more output files.
34
+ If there is an error, an exception must be raised - exit()ing is not generally allowed.
35
+ :param folder_name: the name of the folder containing arbitrary amount of files.
36
+ :return: a tuple of a list of 0 or more tuples and a dictionary of statistics that will be propagated
37
+ to metadata. Each element of the return list, is a tuple of the transformed bytes and a string
38
+ holding the file name to use.
39
+ """
40
+ raise NotImplemented()
@@ -13,7 +13,7 @@
13
13
  from argparse import ArgumentParser
14
14
  from typing import Any
15
15
 
16
- from data_processing.transform import AbstractBinaryTransform
16
+ from data_processing.transform import AbstractTransform
17
17
  from data_processing.utils import CLIArgumentProvider
18
18
 
19
19
 
@@ -23,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider):
23
23
  """
24
24
 
25
25
  def __init__(
26
- self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = []
26
+ self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = []
27
27
  ):
28
28
  """
29
29
  Initialization
@@ -36,7 +36,7 @@ class TransformConfiguration(CLIArgumentProvider):
36
36
  self.remove_from_metadata = remove_from_metadata
37
37
  self.params = {}
38
38
 
39
- def get_transform_class(self) -> type[AbstractBinaryTransform]:
39
+ def get_transform_class(self) -> type[AbstractTransform]:
40
40
  """
41
41
  Get the class extending AbstractBinaryTransform which implements a specific transformation.
42
42
  The class will generally be instantiated with a dictionary of configuration produced by
@@ -0,0 +1,160 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import abc
14
+ import datetime
15
+ import fcntl
16
+ import os
17
+ import tempfile
18
+ import threading
19
+ import time
20
+
21
+
22
+ _tempdir = tempfile.gettempdir()
23
+
24
+
25
+ class MultiLock(abc.ABC):
26
+ """
27
+ Provides a process- and thread-locked lock.
28
+ To use
29
+ lock = MultiLock("mylock")
30
+ ...
31
+ lock.acquire(block=false, timeout=30)
32
+ # do something critical
33
+ ...
34
+ lock.release()
35
+ """
36
+
37
+ def __init__(self, name):
38
+ """
39
+ Create the lock with the given name.
40
+
41
+ :param name: the global name associated with this lock. All processes using the same
42
+ name will be part of the same locking cohort. It is up to the caller to define
43
+ and coordinate lock names.
44
+ """
45
+ if name is None or len(name) == 0:
46
+ raise ValueError("lock name must not be None or the empty string")
47
+ self.lock_filename = os.path.join(_tempdir, name + ".multilock")
48
+ # print(f"lock file name is {self.lock_filename}")
49
+ self.fd = None
50
+ self.thread_lock = threading.Lock()
51
+
52
+ def acquire(self, block=True, timeout=None):
53
+ """
54
+ With the block argument set to True (the default), the method call will block until the
55
+ lock is in an unlocked state, then set it to locked and return True.
56
+
57
+ With the block argument set to False, the method call does not block. If the lock
58
+ is currently in a locked state, return False; otherwise set the lock to a locked state and return True.
59
+
60
+ When invoked with a positive, floating-point value for timeout, wait for at most the number
61
+ of seconds specified by timeout as long as the lock can not be acquired. Invocations with a
62
+ negative value for timeout are equivalent to a timeout of zero. Invocations with a timeout
63
+ value of None (the default) set the timeout period to infinite. The timeout argument has no practical
64
+ implications if the block argument is set to False and is thus ignored.
65
+
66
+ Returns True if the lock has been acquired or False if the timeout period has elapsed.
67
+
68
+ """
69
+ if self.fd is not None: # Already locked.
70
+ return True
71
+
72
+ start = time.time()
73
+ if block:
74
+ thread_timeout = timeout if timeout is not None and timeout >= 0 else -1
75
+ locked = self.thread_lock.acquire(blocking=True, timeout=thread_timeout)
76
+ else:
77
+ locked = self.thread_lock.acquire(blocking=False)
78
+ if not locked:
79
+ return False
80
+ end = time.time()
81
+ if not block and timeout > 0:
82
+ timeout -= end - start
83
+ if timeout <= 0:
84
+ self.thread_lock.release()
85
+ return False
86
+
87
+ # open a file and create a file descriptor
88
+ self.fd = os.open(self.lock_filename, os.O_RDWR | os.O_CREAT)
89
+
90
+ msg = f"MultiLock last held by process with pid={os.getpid()}\n"
91
+ os.write(self.fd, str.encode(msg))
92
+
93
+ # put a lock on an open file
94
+ locked = False
95
+ waited = 0
96
+ sleep_seconds = 1
97
+ if timeout is not None:
98
+ timeout = max(0, timeout)
99
+ while not locked and (timeout is None or waited <= timeout):
100
+ try:
101
+ fcntl.lockf(self.fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
102
+ locked = True
103
+ except Exception as exc:
104
+ # Only get here if lock could not be acquired
105
+ # print(f"sleeping {exc=}")
106
+ time.sleep(sleep_seconds)
107
+ if not block:
108
+ break
109
+ waited += sleep_seconds
110
+ if not locked:
111
+ # If we didn't get the lock, then release the file.
112
+ os.close(self.fd)
113
+ self.fd = None
114
+
115
+ self.thread_lock.release()
116
+ return locked
117
+
118
+ def release(self):
119
+ """
120
+ Release an acquired lock. Do nothing if the lock is not acquired.
121
+ :return:
122
+ """
123
+ if self.fd is not None:
124
+ self.thread_lock.acquire()
125
+ if self.fd is not None: # Retest now that we have the thread lock.
126
+ os.close(self.fd)
127
+ self.fd = None
128
+ self.thread_lock.release()
129
+
130
+ def is_locked(self):
131
+ return self.fd is not None
132
+
133
+
134
+ def main(block, timeout, sleep):
135
+ lock = MultiLock("foo")
136
+ if block:
137
+ print(f"going to acquire the blocking lock with timeout={timeout}")
138
+ else:
139
+ print(f"going to acquire the non-blocking lock with timemout={timeout}")
140
+ locked = lock.acquire(block=block, timeout=timeout)
141
+ start = datetime.datetime.now()
142
+ start = start.strftime("%Y-%m-%d %H:%M:%S")
143
+ if not locked:
144
+ print(f"Could not get lock at {start}")
145
+ return
146
+ print(f"{start}: I got the lock")
147
+ time.sleep(sleep)
148
+ lock.release()
149
+ end = datetime.datetime.now()
150
+ end = end.strftime("%Y-%m-%d %H:%M:%S")
151
+ print(f"{end}: lock released")
152
+ print(f"lock held from {start} to {end}")
153
+
154
+
155
+ if __name__ == "__main__":
156
+ sleep = 10
157
+ timeout = 10
158
+ main(True, timeout, sleep)
159
+ time.sleep(1)
160
+ main(False, timeout, sleep)
@@ -1,3 +1,16 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+
1
14
  class UnrecoverableException(Exception):
2
15
  """
3
16
  Raised when a transform wants to cancel overall execution
@@ -35,6 +35,7 @@ class RayTransformFileProcessor(AbstractTransformFileProcessor):
35
35
  super().__init__(
36
36
  data_access_factory=params.get("data_access_factory", None),
37
37
  transform_parameters=dict(params.get("transform_params", {})),
38
+ is_folder=params.get("is_folder", False)
38
39
  )
39
40
  # Create statistics
40
41
  self.stats = params.get("statistics", None)
@@ -16,6 +16,7 @@ from datetime import datetime
16
16
 
17
17
  import ray
18
18
  from data_processing.data_access import DataAccessFactoryBase
19
+ from data_processing.transform import AbstractFolderTransform
19
20
  from data_processing_ray.runtime.ray import (
20
21
  RayTransformExecutionConfiguration,
21
22
  RayTransformFileProcessor,
@@ -56,13 +57,21 @@ def orchestrate(
56
57
  # create transformer runtime
57
58
  runtime = runtime_config.create_transform_runtime()
58
59
  resources = RayUtils.get_cluster_resources()
60
+ is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
59
61
  try:
60
- # Get files to process
61
- files, profile, retries = data_access.get_files_to_process()
62
- if len(files) == 0:
63
- logger.error("No input files to process - exiting")
64
- return 0
65
- logger.info(f"Number of files is {len(files)}, source profile {profile}")
62
+ if is_folder:
63
+ # folder transform
64
+ files = runtime.get_folders(data_access=data_access)
65
+ logger.info(f"Number of folders is {len(files)}") # Get files to process
66
+ else:
67
+ files, profile, retries = data_access.get_files_to_process()
68
+ if len(files) == 0:
69
+ logger.error("No input files to process - exiting")
70
+ return 0
71
+ # log retries
72
+ if retries > 0:
73
+ statistics.add_stats.remote({"data access retries": retries})
74
+ logger.info(f"Number of files is {len(files)}, source profile {profile}")
66
75
  # Print interval
67
76
  print_interval = int(len(files) / 100)
68
77
  if print_interval == 0:
@@ -73,9 +82,6 @@ def orchestrate(
73
82
  logger.info(
74
83
  f"Number of workers - {preprocessing_params.n_workers} " f"with {preprocessing_params.worker_options} each"
75
84
  )
76
- # log retries
77
- if retries > 0:
78
- statistics.add_stats.remote({"data access retries": retries})
79
85
  # create executors
80
86
  processor_params = {
81
87
  "data_access_factory": data_access_factory,
@@ -84,6 +90,7 @@ def orchestrate(
84
90
  data_access_factory=data_access_factory, statistics=statistics, files=files
85
91
  ),
86
92
  "statistics": statistics,
93
+ "is_folder": is_folder,
87
94
  }
88
95
  logger.debug("Creating actors")
89
96
  processors = RayUtils.create_actors(
@@ -135,7 +142,8 @@ def orchestrate(
135
142
  # Compute execution statistics
136
143
  logger.debug("Computing execution stats")
137
144
  stats = runtime.compute_execution_stats(ray.get(statistics.get_execution_stats.remote()))
138
- stats["processing_time"] = round(stats["processing_time"], 3)
145
+ if "processing_time" in stats:
146
+ stats["processing_time"] = round(stats["processing_time"], 3)
139
147
 
140
148
  # build and save metadata
141
149
  logger.debug("Building job metadata")
@@ -12,7 +12,7 @@
12
12
 
13
13
  from typing import Any
14
14
 
15
- from data_processing.data_access import DataAccessFactoryBase
15
+ from data_processing.data_access import DataAccessFactoryBase, DataAccess
16
16
  from ray.actor import ActorHandle
17
17
 
18
18
 
@@ -28,6 +28,14 @@ class DefaultRayTransformRuntime:
28
28
  """
29
29
  self.params = params
30
30
 
31
+ def get_folders(self, data_access: DataAccess) -> list[str]:
32
+ """
33
+ Get folders to process
34
+ :param data_access: data access
35
+ :return: list of folders to process
36
+ """
37
+ raise NotImplemented()
38
+
31
39
  def get_transform_config(
32
40
  self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str]
33
41
  ) -> dict[str, Any]:
@@ -1 +1,2 @@
1
1
  from data_processing_ray.test_support.transform.noop_transform import NOOPRayTransformConfiguration
2
+ from data_processing_ray.test_support.transform.noop_folder_transform import NOOPFolderRayTransformConfiguration
@@ -0,0 +1,56 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+
14
+ from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration
15
+ from data_processing.utils import get_logger
16
+ from data_processing_ray.runtime.ray import (
17
+ RayTransformLauncher,
18
+ RayTransformRuntimeConfiguration,
19
+ DefaultRayTransformRuntime
20
+ )
21
+ from data_processing.data_access import DataAccess
22
+
23
+
24
+ logger = get_logger(__name__)
25
+
26
+
27
+ class NOOPFolderRayRuntime(DefaultRayTransformRuntime):
28
+ def get_folders(self, data_access: DataAccess) -> list[str]:
29
+ """
30
+ Get folders to process
31
+ :param data_access: data access
32
+ :return: list of folders to process
33
+ """
34
+ return [data_access.get_input_folder()]
35
+
36
+
37
+ class NOOPFolderRayTransformConfiguration(RayTransformRuntimeConfiguration):
38
+ """
39
+ Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher.
40
+ NOOP does not use a RayRuntime class so the superclass only needs the base
41
+ python-only configuration.
42
+ """
43
+
44
+ def __init__(self):
45
+ """
46
+ Initialization
47
+ """
48
+ super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform),
49
+ runtime_class=NOOPFolderRayRuntime)
50
+
51
+
52
+ if __name__ == "__main__":
53
+ # launcher = NOOPRayLauncher()
54
+ launcher = RayTransformLauncher(NOOPFolderRayTransformConfiguration())
55
+ logger.info("Launching noop transform")
56
+ launcher.launch()
@@ -11,9 +11,7 @@
11
11
  ################################################################################
12
12
 
13
13
 
14
- from data_processing.test_support.transform.noop_transform import (
15
- NOOPTransformConfiguration,
16
- )
14
+ from data_processing.test_support.transform import NOOPTransformConfiguration
17
15
  from data_processing.utils import get_logger
18
16
  from data_processing_ray.runtime.ray import (
19
17
  RayTransformLauncher,
@@ -10,6 +10,9 @@
10
10
  # limitations under the License.
11
11
  ################################################################################
12
12
 
13
+ from typing import Any
14
+
15
+ from data_processing.data_access import DataAccessFactoryBase
13
16
  from data_processing.runtime import TransformRuntimeConfiguration
14
17
  from data_processing.transform import TransformConfiguration
15
18
  from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime
@@ -29,6 +32,16 @@ class SparkTransformRuntimeConfiguration(TransformRuntimeConfiguration):
29
32
  super().__init__(transform_config=transform_config)
30
33
  self.runtime_class = runtime_class
31
34
 
35
+ def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
36
+ """Allows retrieving and broadcasting to all the workers very large
37
+ configuration parameters, like the list of document IDs to remove for
38
+ fuzzy dedup, or the list of blocked web domains for block listing. This
39
+ function is called by the spark runtime after spark initialization, and
40
+ before spark_context.parallelize()
41
+ :param data_access_factory - creates data_access object to download the large config parameter
42
+ """
43
+ return {}
44
+
32
45
  def create_transform_runtime(self) -> DefaultSparkTransformRuntime:
33
46
  """
34
47
  Create transform runtime with the parameters captured during apply_input_params()
@@ -29,12 +29,15 @@ class SparkTransformFileProcessor(AbstractTransformFileProcessor):
29
29
  data_access_factory: DataAccessFactoryBase,
30
30
  runtime_configuration: SparkTransformRuntimeConfiguration,
31
31
  statistics: TransformStatistics,
32
+ is_folder: bool,
32
33
  ):
33
34
  """
34
35
  Init method
35
36
  """
36
37
  super().__init__(
37
- data_access_factory=data_access_factory, transform_parameters=runtime_configuration.get_transform_params()
38
+ data_access_factory=data_access_factory,
39
+ transform_parameters=runtime_configuration.get_transform_params(),
40
+ is_folder=is_folder,
38
41
  )
39
42
  # Add data access and statistics to the processor parameters
40
43
  self.runtime_configuration = runtime_configuration