data-prep-toolkit 0.0.1.dev12__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/Makefile +6 -5
- {data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.1.0}/PKG-INFO +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/advanced-transform-tutorial.md +31 -14
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/architecture.md +8 -4
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/python-launcher-options.md +3 -2
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/ray-runtime.md +3 -3
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/simplest-transform-tutorial.md +47 -37
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/testing-e2e-transform.md +2 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-external-resources.md +1 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-standalone-testing.md +1 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-tutorials.md +4 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transformer-utilities.md +3 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/pyproject.toml +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/SOURCES.txt +7 -4
- data_prep_toolkit-0.1.0/src/data_processing/runtime/__init__.py +4 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/__init__.py +1 -1
- data_prep_toolkit-0.1.0/src/data_processing/runtime/pure_python/runtime_configuration.py +24 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/transform_launcher.py +11 -11
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/transform_orchestrator.py +11 -10
- data_prep_toolkit-0.1.0/src/data_processing/runtime/pure_python/transform_table_processor.py +53 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/__init__.py +4 -3
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_orchestrator_configuration.py → data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/execution_configuration.py +1 -1
- data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/runtime_configuration.py +38 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_launcher.py +13 -21
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_orchestrator.py +8 -8
- data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/transform_table_processor.py +46 -0
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/python_launcher_configuration.py → data_prep_toolkit-0.1.0/src/data_processing/runtime/runtime_configuration.py +14 -47
- data_prep_toolkit-0.1.0/src/data_processing/runtime/transform_launcher.py +79 -0
- data_prep_toolkit-0.1.0/src/data_processing/runtime/transform_table_processor.py +176 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/__init__.py +2 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/noop_transform.py +34 -29
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/__init__.py +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/transform_configuration.py +34 -19
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/transform_utils.py +3 -7
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/pure_python/launcher_test.py +3 -12
- data_prep_toolkit-0.1.0/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +78 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +7 -9
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/launcher_test.py +16 -23
- data_prep_toolkit-0.1.0/test/data_processing_tests/launch/ray/multi_launcher_test.py +80 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -1
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/__init__.py +0 -2
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/transform_table_processor.py +0 -191
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_configuration.py +0 -33
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_launch_configuration.py +0 -44
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_table_processor.py +0 -191
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/transform_launcher.py +0 -25
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/.gitignore +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/README.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/overview.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/processing-architecture.jpg +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/python-runtime.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/ray-launcher-options.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-runtimes.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-s3-testing.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-testing.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-tutorial-examples.md +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/setup.cfg +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/arrow_s3.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_factory.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_local.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_s3.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/execution_configuration.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/ray_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_runtime.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_statistics.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/abstract_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/launch/transform_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/transform_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/table_transform.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/transform_statistics.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/log.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/params_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/ray_util_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
|
@@ -53,10 +53,11 @@ venv:: pyproject.toml
|
|
|
53
53
|
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
|
|
54
54
|
test::
|
|
55
55
|
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
56
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
57
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
|
58
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python;
|
|
56
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
57
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
|
58
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
|
|
59
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
|
|
59
60
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
|
|
60
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
|
|
61
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
|
|
61
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
|
|
62
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
|
|
62
63
|
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/advanced-transform-tutorial.md
RENAMED
|
@@ -13,6 +13,7 @@ removes duplicate documents across all files. In this tutorial, we will show the
|
|
|
13
13
|
the operation of our _noop_ transform.
|
|
14
14
|
|
|
15
15
|
The complete task involves the following:
|
|
16
|
+
|
|
16
17
|
* EdedupTransform - class that implements the specific transformation
|
|
17
18
|
* EdedupRuntime - class that implements custom TransformRuntime to create supporting Ray objects and enhance job output
|
|
18
19
|
statistics
|
|
@@ -39,6 +40,7 @@ First, let's define the transform class. To do this we extend
|
|
|
39
40
|
the base abstract/interface class
|
|
40
41
|
[AbstractTableTransform](../src/data_processing/transform/table_transform.py),
|
|
41
42
|
which requires definition of the following:
|
|
43
|
+
|
|
42
44
|
* an initializer (i.e. `init()`) that accepts a dictionary of configuration
|
|
43
45
|
data. For this example, the configuration data will only be defined by
|
|
44
46
|
command line arguments (defined below).
|
|
@@ -56,15 +58,17 @@ from typing import Any
|
|
|
56
58
|
|
|
57
59
|
import pyarrow as pa
|
|
58
60
|
import ray
|
|
59
|
-
from data_processing.data_access import
|
|
61
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
60
62
|
from data_processing.runtime.ray import (
|
|
61
|
-
RayLauncherConfiguration,
|
|
62
63
|
DefaultTableTransformRuntimeRay,
|
|
63
|
-
RayUtils,
|
|
64
64
|
RayTransformLauncher,
|
|
65
|
+
RayUtils,
|
|
66
|
+
)
|
|
67
|
+
from data_processing.runtime.ray.runtime_configuration import (
|
|
68
|
+
RayTransformRuntimeConfiguration,
|
|
65
69
|
)
|
|
66
|
-
from data_processing.transform import AbstractTableTransform
|
|
67
|
-
from data_processing.utils import GB, TransformUtils
|
|
70
|
+
from data_processing.transform import AbstractTableTransform, TransformConfiguration
|
|
71
|
+
from data_processing.utils import GB, CLIArgumentProvider, TransformUtils, get_logger
|
|
68
72
|
from ray.actor import ActorHandle
|
|
69
73
|
|
|
70
74
|
|
|
@@ -138,6 +142,7 @@ First, let's define the transform runtime class. To do this we extend
|
|
|
138
142
|
the base abstract/interface class
|
|
139
143
|
[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py),
|
|
140
144
|
which requires definition of the following:
|
|
145
|
+
|
|
141
146
|
* an initializer (i.e. `init()`) that accepts a dictionary of configuration
|
|
142
147
|
data. For this example, the configuration data will only be defined by
|
|
143
148
|
command line arguments (defined below).
|
|
@@ -202,8 +207,10 @@ collected by hash actors and custom computations based on statistics data.
|
|
|
202
207
|
|
|
203
208
|
## EdedupTableTransformConfiguration
|
|
204
209
|
|
|
205
|
-
The final class we need to implement is `
|
|
206
|
-
|
|
210
|
+
The final class we need to implement is `EdedupRayTransformConfiguration` class that provides configuration for
|
|
211
|
+
running our transform. Although we provide only Ray-based implementation, Ray-based configuration relies on Python-based
|
|
212
|
+
configuration that we need to define first. So we first need to define `EdedupTableTransformConfiguration` class,
|
|
213
|
+
defining the following:
|
|
207
214
|
|
|
208
215
|
* The short name for the transform
|
|
209
216
|
* The class implementing the transform - in our case EdedupTransform
|
|
@@ -216,10 +223,12 @@ First we define the class and its initializer,
|
|
|
216
223
|
short_name = "ededup"
|
|
217
224
|
cli_prefix = f"{short_name}_"
|
|
218
225
|
|
|
219
|
-
class EdedupTableTransformConfiguration(
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
226
|
+
class EdedupTableTransformConfiguration(TransformConfiguration):
|
|
227
|
+
def __init__(self):
|
|
228
|
+
super().__init__(
|
|
229
|
+
name=short_name,
|
|
230
|
+
transform_class=EdedupTransform,
|
|
231
|
+
)
|
|
223
232
|
```
|
|
224
233
|
|
|
225
234
|
The initializer extends the DefaultTableTransformConfiguration which provides simple
|
|
@@ -253,6 +262,13 @@ and which allows us to capture the `EdedupTransform`-specific arguments and opti
|
|
|
253
262
|
logger.info(f"exact dedup params are {self.params}")
|
|
254
263
|
return True
|
|
255
264
|
```
|
|
265
|
+
Now we can implement `EdedupRayTransformConfiguration` with the following code:
|
|
266
|
+
```python
|
|
267
|
+
class EdedupRayTransformConfiguration(RayTransformConfiguration):
|
|
268
|
+
def __init__(self):
|
|
269
|
+
super().__init__(transform_config=EdedupTableTransformConfiguration(), runtime_class=EdedupRuntime)
|
|
270
|
+
|
|
271
|
+
```
|
|
256
272
|
|
|
257
273
|
## main()
|
|
258
274
|
|
|
@@ -261,11 +277,12 @@ framework's `TransformLauncher` class.
|
|
|
261
277
|
|
|
262
278
|
```python
|
|
263
279
|
if __name__ == "__main__":
|
|
264
|
-
|
|
265
|
-
|
|
280
|
+
launcher = RayTransformLauncher(EdedupRayTransformConfiguration())
|
|
281
|
+
launcher.launch()
|
|
266
282
|
```
|
|
283
|
+
|
|
267
284
|
The launcher requires only an instance of DefaultTableTransformConfiguration
|
|
268
|
-
(our `
|
|
285
|
+
(our `EdedupRayTransformConfiguration` class).
|
|
269
286
|
A single method `launch()` is then invoked to run the transform in a Ray cluster.
|
|
270
287
|
|
|
271
288
|
## Running
|
|
@@ -16,7 +16,7 @@ The architecture includes the following core components:
|
|
|
16
16
|
* [RayLauncher](../src/data_processing/runtime/ray/transform_launcher.py) accepts and validates
|
|
17
17
|
CLI parameters to establish the Ray Orchestrator with the proper configuration.
|
|
18
18
|
It uses the following components, all of which can/do define CLI configuration parameters:
|
|
19
|
-
* [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/
|
|
19
|
+
* [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/execution_configuration.py) is responsible
|
|
20
20
|
for defining and validating infrastructure parameters
|
|
21
21
|
(e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
|
|
22
22
|
(several dictionaries) and is fully pickleable. As a result, the framework uses its instance as a
|
|
@@ -32,20 +32,22 @@ It uses the following components, all of which can/do define CLI configuration p
|
|
|
32
32
|
After all parameters are validated, the ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguration
|
|
33
33
|
and TransformConfiguration are given to the Ray Orchestrator, via Ray remote() method invocation.
|
|
34
34
|
The Launcher waits for the Ray Orchestrator to complete.
|
|
35
|
-
|
|
35
|
+
|
|
36
|
+
* [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
|
|
36
37
|
the data processing job. It creates the actors, determines the set of input data and distributes the
|
|
37
38
|
references to the data files to be processed by the workers. More specifically, it performs the following:
|
|
39
|
+
|
|
38
40
|
1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files
|
|
39
41
|
to be processed.
|
|
40
42
|
2. Uses the TransformConfiguration to create the TransformRuntime instance
|
|
41
43
|
3. Uses the TransformRuntime to optionally apply additional configuration (ray object storage, etc) for the configuration
|
|
42
44
|
and operation of the Transform.
|
|
43
|
-
|
|
45
|
+
4. uses the TransformOrchestratorConfiguration to determine the set of RayWorkers to create
|
|
44
46
|
to execute transformers in parallel, providing the following to each worker:
|
|
45
47
|
* Ray worker configuration
|
|
46
48
|
* DataAccessFactory
|
|
47
49
|
* Transform class and its TransformConfiguration containing the CLI parameters and any TransformRuntime additions.
|
|
48
|
-
|
|
50
|
+
5. in a load-balanced, round-robin fashion, distributes the names of the input files to the workers for them to transform/process.
|
|
49
51
|
|
|
50
52
|
Additionally, to provide monitoring of long-running transforms, the orchestrator is instrumented with
|
|
51
53
|
[custom metrics](https://docs.ray.io/en/latest/ray-observability/user-guides/add-app-metrics.html), that are exported to localhost:8080 (this is the endpoint that
|
|
@@ -53,11 +55,13 @@ It uses the following components, all of which can/do define CLI configuration p
|
|
|
53
55
|
Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor)
|
|
54
56
|
and build and save it in the form of execution metadata (`metadata.json`). Finally, it will return the execution
|
|
55
57
|
result to the Launcher.
|
|
58
|
+
|
|
56
59
|
* [Ray worker](../src/data_processing/runtime/ray/transform_table_processor.py) is responsible for
|
|
57
60
|
reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
|
|
58
61
|
assigned by the orchestrator, applying the transform to the input table and writing out the
|
|
59
62
|
resulting table(s). Metadata produced by each table transformation is aggregated into
|
|
60
63
|
Transform Statistics (below).
|
|
64
|
+
|
|
61
65
|
* [Transform Statistics](../src/data_processing/runtime/ray/transform_statistics.py) is a general
|
|
62
66
|
purpose data collector actor aggregating the numeric metadata from different places of
|
|
63
67
|
the framework (especially metadata produced by the transform).
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
#
|
|
2
|
-
|
|
1
|
+
# Pure Python Launcher Command Line Options
|
|
2
|
+
|
|
3
|
+
A number of command line options are available when launching a transform as a Python class.
|
|
3
4
|
|
|
4
5
|
The following is a current --help output (a work in progress) for
|
|
5
6
|
the `NOOPTransform` (note the --noop_sleep_sec option):
|
|
@@ -5,7 +5,7 @@ The Ray runtime includes the following set of components:
|
|
|
5
5
|
class generally used to implement `main()` that makes use of a `TransformConfiguration` to
|
|
6
6
|
start the Ray runtime and execute the transform over the specified set of input files.
|
|
7
7
|
The RayTransformLauncher is created using a `RayTransformConfiguration` instance.
|
|
8
|
-
* [RayTransformConfiguration](../src/data_processing/runtime/ray/
|
|
8
|
+
* [RayTransformConfiguration](../src/data_processing/runtime/ray/runtime_configuration.py) - this
|
|
9
9
|
class extends transform's base TransformConfiguration implementation to add an optional
|
|
10
10
|
`TransformRuntime` (see next) class to be used by the transform implementation.
|
|
11
11
|
* [TransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py) -
|
|
@@ -40,7 +40,7 @@ launcher = RayTransformLauncher(YourTransformConfiguration())
|
|
|
40
40
|
launcher.launch()
|
|
41
41
|
```
|
|
42
42
|
Note that the launcher defines some additional CLI parameters that are used to control the operation of the
|
|
43
|
-
[orchestrator and workers](../src/data_processing/runtime/ray/
|
|
43
|
+
[orchestrator and workers](../src/data_processing/runtime/ray/execution_configuration.py) and
|
|
44
44
|
[data access](../src/data_processing/data_access/data_access_factory.py). Things such as data access configuration,
|
|
45
45
|
number of workers, worker resources, etc.
|
|
46
46
|
Discussion of these options is beyond the scope of this document
|
|
@@ -51,7 +51,7 @@ In general, a transform should be able to run in both the python and Ray runtime
|
|
|
51
51
|
As such we first define the python-only transform configuration, which will then
|
|
52
52
|
be used by the Ray-runtime-specific transform configuration.
|
|
53
53
|
The python transform configuration implements
|
|
54
|
-
[TransformConfiguration](../src/data_processing/
|
|
54
|
+
[TransformConfiguration](../src/data_processing/runtime/runtime_configuration.py)
|
|
55
55
|
and defines the transform-specific name, and implementation
|
|
56
56
|
and class. In addition, it is responsible for providing transform-specific
|
|
57
57
|
methods to define and capture optional command line arguments.
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/simplest-transform-tutorial.md
RENAMED
|
@@ -15,33 +15,38 @@ one table to another. That said, we will show the following:
|
|
|
15
15
|
the operation of our _noop_ transform.
|
|
16
16
|
|
|
17
17
|
We will **not** be showing the following:
|
|
18
|
-
* The creation of a custom TransformRuntime that would enable more global
|
|
19
|
-
state and/or coordination among the transforms running in other
|
|
18
|
+
* The creation of a custom `TransformRuntime` that would enable more global
|
|
19
|
+
state and/or coordination among the transforms running in other Ray actors.
|
|
20
20
|
This will be covered in an advanced tutorial.
|
|
21
21
|
|
|
22
22
|
The complete task involves the following:
|
|
23
|
-
|
|
24
|
-
*
|
|
25
|
-
|
|
26
|
-
|
|
23
|
+
|
|
24
|
+
* `NOOPTransform` - class that implements the specific transformation
|
|
25
|
+
* `NOOPTableTransformConfiguration` - class that provides configuration for the
|
|
26
|
+
`NOOPTransform`, specifically the command line arguments used to configure it.
|
|
27
|
+
* `main()` - simple creation and use of the `TransformLauncher`.
|
|
27
28
|
|
|
28
29
|
(Currently, the complete code for the noop transform used for this
|
|
29
30
|
tutorial can be found in the
|
|
30
31
|
[noop transform](../../transforms/universal/noop) directory.
|
|
31
32
|
|
|
32
|
-
Finally, we show to use the command line to run the transform in a local ray cluster
|
|
33
|
+
Finally, we show how to use the command line to run the transform in a local ray cluster.
|
|
34
|
+
|
|
35
|
+
> **Note:** You will need to run the setup commands in the [`../README`](..) before running the following examples.
|
|
36
|
+
|
|
33
37
|
|
|
34
|
-
## NOOPTransform
|
|
38
|
+
## `NOOPTransform`
|
|
35
39
|
|
|
36
40
|
First, let's define the transform class. To do this we extend
|
|
37
41
|
the base abstract/interface class
|
|
38
|
-
[AbstractTableTransform](../src/data_processing_ibm/transform/table_transform.py),
|
|
42
|
+
[`AbstractTableTransform`](../src/data_processing_ibm/transform/table_transform.py),
|
|
39
43
|
which requires definition of the following:
|
|
44
|
+
|
|
40
45
|
* an initializer (i.e. `init()`) that accepts a dictionary of configuration
|
|
41
46
|
data. For this example, the configuration data will only be defined by
|
|
42
47
|
command line arguments (defined below).
|
|
43
|
-
* the `transform()` method itself that takes an input table produces an output
|
|
44
|
-
table
|
|
48
|
+
* the `transform()` method itself that takes an input table and produces an output
|
|
49
|
+
table with any associated metadata for that table transformation.
|
|
45
50
|
|
|
46
51
|
Other methods such as `flush()` need not be overridden/redefined for this simple example.
|
|
47
52
|
|
|
@@ -54,18 +59,18 @@ from argparse import ArgumentParser, Namespace
|
|
|
54
59
|
from typing import Any
|
|
55
60
|
|
|
56
61
|
import pyarrow as pa
|
|
57
|
-
from data_processing.runtime.ray import
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
RayTransformLauncher,
|
|
62
|
+
from data_processing.runtime.ray import RayTransformLauncher
|
|
63
|
+
from data_processing.runtime.ray.runtime_configuration import (
|
|
64
|
+
RayTransformRuntimeConfiguration,
|
|
61
65
|
)
|
|
62
|
-
from data_processing.transform import AbstractTableTransform
|
|
66
|
+
from data_processing.transform import AbstractTableTransform, TransformConfiguration
|
|
67
|
+
from data_processing.utils import CLIArgumentProvider, get_logger
|
|
63
68
|
|
|
64
69
|
|
|
65
70
|
class NOOPTransform(AbstractTableTransform):
|
|
66
71
|
|
|
67
|
-
|
|
68
|
-
|
|
72
|
+
def __init__(self, config: dict[str, Any]):
|
|
73
|
+
self.sleep = config.get("sleep", 1)
|
|
69
74
|
```
|
|
70
75
|
The `NOOPTransform` class extends the `AbstractTableTransform`, which defines the required methods.
|
|
71
76
|
|
|
@@ -75,7 +80,7 @@ with an amount of seconds to sleep/delay during the call to `transform()`.
|
|
|
75
80
|
Configuration is provided by the framework in a dictionary provided to the initializer.
|
|
76
81
|
Below we will cover how this `sleep` argument is made available to the initializer.
|
|
77
82
|
|
|
78
|
-
Note that in more complex transforms that might, for example, load a
|
|
83
|
+
Note that in more complex transforms that might, for example, load a Hugging Face or other model,
|
|
79
84
|
or perform other deep initializations, these can be done in the initializer.
|
|
80
85
|
|
|
81
86
|
Next we define the `transform()` method itself, which includes the addition of some
|
|
@@ -90,19 +95,18 @@ almost trivial metadata.
|
|
|
90
95
|
return [table], metadata
|
|
91
96
|
```
|
|
92
97
|
The single input to this method is the in-memory pyarrow table to be transformed.
|
|
93
|
-
The return of this
|
|
94
|
-
case
|
|
98
|
+
The return value of this method is a list of tables and optional metadata. In this
|
|
99
|
+
case, we are doing a simple 1:1 table conversion, so the list will contain a single table, the input table.
|
|
95
100
|
The metadata is a free-form dictionary of keys with numeric values that will be aggregated
|
|
96
101
|
by the framework and reported as aggregated job statistics metadata.
|
|
97
102
|
If there is no metadata then simply return an empty dictionary.
|
|
98
103
|
|
|
99
|
-
## NOOPTransformConfiguration
|
|
104
|
+
## `NOOPTransformConfiguration`
|
|
100
105
|
|
|
101
|
-
Next we define the `NOOPTransformConfiguration` and
|
|
102
|
-
classes and there initializer that define the following:
|
|
106
|
+
Next we define the `NOOPTransformConfiguration` class and its initializer that defines the following:
|
|
103
107
|
|
|
104
108
|
* The short name for the transform
|
|
105
|
-
* The class implementing the transform - in our case NOOPTransform
|
|
109
|
+
* The class implementing the transform - in our case `NOOPTransform`
|
|
106
110
|
* Command line argument support.
|
|
107
111
|
|
|
108
112
|
We also define the `NOOPRayTransformationConfiguration` so we can run the transform
|
|
@@ -125,18 +129,20 @@ class NOOPTransformConfiguration(TransformConfiguration):
|
|
|
125
129
|
remove_from_metadata=[pwd_key],
|
|
126
130
|
)
|
|
127
131
|
```
|
|
128
|
-
|
|
129
|
-
|
|
132
|
+
|
|
133
|
+
The initializer extends the `TransformConfiguration` that provides simple
|
|
134
|
+
capture of our configuration data and allows it to be pickled for transfer over the network.
|
|
130
135
|
It also adds a `params` field that will be used below to hold the transform's
|
|
131
136
|
configuration data (used in `NOOPTransform.init()` above).
|
|
132
137
|
|
|
133
138
|
Next, we provide two methods that define and capture the command line configuration that
|
|
134
|
-
is specific to the `NOOPTransform`, in this case the number of seconds to sleep during transformation
|
|
135
|
-
and an example command line, `pwd
|
|
136
|
-
in the job metadata produced by the
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
It is
|
|
139
|
+
is specific to the `NOOPTransform`, in this case the parameters are the number of seconds to sleep during transformation
|
|
140
|
+
and an example command line option, `pwd` ("password"), holding sensitive data that we don't want reported
|
|
141
|
+
in the job metadata produced by the Ray orchestrator.
|
|
142
|
+
|
|
143
|
+
The first method establishes the command line arguments.
|
|
144
|
+
It is given a global argument parser to which the `NOOPTransform` arguments are added.
|
|
145
|
+
It is good practice to include a common prefix on all transform-specific options (e.g. pii, lang, etc.).
|
|
140
146
|
In our case we will use `noop_`.
|
|
141
147
|
|
|
142
148
|
```python
|
|
@@ -159,6 +165,7 @@ In our case we will use `noop_`.
|
|
|
159
165
|
```
|
|
160
166
|
Next we implement a method that is called after the CLI args are parsed (usually by one
|
|
161
167
|
of the runtimes) and which allows us to capture the `NOOPTransform`-specific arguments.
|
|
168
|
+
|
|
162
169
|
|
|
163
170
|
```python
|
|
164
171
|
|
|
@@ -176,13 +183,16 @@ To run the transform on a set of input data, we use one of the runtimes, each de
|
|
|
176
183
|
### Python Runtime
|
|
177
184
|
To run in the python runtime, we need to create an instance of `PythonTransformLauncher`
|
|
178
185
|
using the `NOOPTransformConfiguration`, and launch it as follows:
|
|
186
|
+
|
|
179
187
|
```python
|
|
180
188
|
if __name__ == "__main__":
|
|
181
189
|
launcher = PythonTransformLauncher(transform_config=NOOPTransformConfiguration())
|
|
182
190
|
launcher.launch()
|
|
183
191
|
```
|
|
184
192
|
|
|
185
|
-
|
|
193
|
+
## Running
|
|
194
|
+
|
|
195
|
+
Assuming the above `main` code is placed in `noop_main.py` we can run the transform on some test data. We'll use data in the repo for the noop transform
|
|
186
196
|
and create a temporary directory to hold the output:
|
|
187
197
|
```shell
|
|
188
198
|
export DPK_REPOROOT=...
|
|
@@ -191,9 +201,9 @@ export NOOP_INPUT=$DPK_REPOROOT/transforms/universal/noop/test-data/input
|
|
|
191
201
|
To run
|
|
192
202
|
```shell
|
|
193
203
|
python noop_main.py --noop_sleep_msec 2 \
|
|
194
|
-
--data_local_config "{'input_folder': '"$NOOP_INPUT"', 'output_folder': '/tmp/noop-output'}"
|
|
204
|
+
--data_local_config "{'input_folder': '"$NOOP_INPUT"', 'output_folder': '/tmp/noop-output'}"
|
|
195
205
|
```
|
|
196
|
-
See the [python launcher options](python-launcher-options) for a complete list of
|
|
206
|
+
See the [python launcher options](python-launcher-options.md) for a complete list of
|
|
197
207
|
transform-independent command line options.
|
|
198
208
|
|
|
199
209
|
### Ray Runtime
|
|
@@ -207,5 +217,5 @@ if __name__ == "__main__":
|
|
|
207
217
|
```
|
|
208
218
|
We can run this with the same command as for the python runtime but to run in local Ray
|
|
209
219
|
add the `--run_locally True` option.
|
|
210
|
-
See the [ray launcher options](ray-launcher-options) for a complete list of
|
|
220
|
+
See the [ray launcher options](ray-launcher-options.md) for a complete list of
|
|
211
221
|
transform-independent command line options.
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
# Testing End-to-End Transform operation
|
|
2
2
|
WIP - Points to discuss
|
|
3
|
+
|
|
3
4
|
1. Reading input files and writing output files.
|
|
4
|
-
2. Testing of the transform runtime and use of ray components in the transform
|
|
5
|
+
2. Testing of the transform runtime and use of ray components in the transform
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-external-resources.md
RENAMED
|
@@ -8,6 +8,7 @@ In addition to actually loading the resource(s), the transform needs to define t
|
|
|
8
8
|
defines the location of the domain list.
|
|
9
9
|
|
|
10
10
|
In the next sections we cover the following:
|
|
11
|
+
|
|
11
12
|
1. How to define the transform-specific resource location(s) as command line arguments
|
|
12
13
|
2. How to load the transform-specific resources, either or both of:
|
|
13
14
|
1. During transform initialization - this is useful for testing outside of ray, and optionally
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-standalone-testing.md
RENAMED
|
@@ -15,6 +15,7 @@ transform implementation tests will easily leverage.
|
|
|
15
15
|
|
|
16
16
|
The first (and currently only) test is the `test_transform()` method that takes the
|
|
17
17
|
following inputs:
|
|
18
|
+
|
|
18
19
|
* the transform implementation being tested, properly configured with the configuration
|
|
19
20
|
dictionary for the associated test data.
|
|
20
21
|
* a list of N (1 or more) input tables to be processed with the transform's `transform(Table)` method.
|
|
@@ -41,19 +41,22 @@ The return values are handled the same waa as the return values for `transform()
|
|
|
41
41
|
not need this feature, a default implementation is provided to return an empty list and empty dictionary.
|
|
42
42
|
|
|
43
43
|
#### TransformConfiguration class
|
|
44
|
-
The [TransformConfiguration](../src/data_processing/
|
|
44
|
+
The [TransformConfiguration](../src/data_processing/runtime/runtime_configuration.py)
|
|
45
45
|
serves as an interface and must be implemented by any `AbstractTableTransform`
|
|
46
46
|
implementation to provide the following configuration:
|
|
47
|
+
|
|
47
48
|
* the transform class to be used,
|
|
48
49
|
* command line arguments used to initialize the Transform Runtime and generally, the Transform.
|
|
49
50
|
* Transform Runtime class to use
|
|
50
51
|
* transform short name
|
|
52
|
+
|
|
51
53
|
It is expected that transforms are initialized with a fixed name, the class of its corresponding
|
|
52
54
|
`AbstractTableTransform` implementation and optionally the configuration keys that should not
|
|
53
55
|
be exposed as metadata for a run.
|
|
54
56
|
To support command line configuration, the `TransformConfiguration` extends the
|
|
55
57
|
[CLIArgumentProvider](../src/data_processing/utils/cli_utils.py) class.
|
|
56
58
|
The set of methods of interest are
|
|
59
|
+
|
|
57
60
|
* ```__init__(self, name:str, transform_class:type[AbstractTableTransform], remove_from_metadata:list[str])``` - sets the required fields
|
|
58
61
|
* ```add_input_params(self, parser:ArgumentParser)``` - adds transform-specific command line options that will
|
|
59
62
|
be made available in the dictionary provided to the transform's initializer.
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
A class [TransformUtils](../src/data_processing/utils/transform_utils.py) provides several methods that simplify
|
|
4
4
|
transformer's implementation. Currently it includes the following methods:
|
|
5
|
+
|
|
5
6
|
* `deep_get_size` is the method to get the complete size of the Python object based on
|
|
6
7
|
https://www.askpython.com/python/built-in-methods/variables-memory-size-in-python
|
|
7
8
|
It supports Python structures: list, tuple and set
|
|
@@ -17,8 +18,9 @@ be removed before it is added
|
|
|
17
18
|
removes URL encoding
|
|
18
19
|
|
|
19
20
|
It also contains two variables:
|
|
21
|
+
|
|
20
22
|
* `RANDOM_SEED` number that is used for methods that require seed
|
|
21
23
|
* `LOCAL_TO_DISK` rough local size to size on disk/S3
|
|
22
24
|
|
|
23
25
|
This class should be extended with additional methods, generally useful across multiple transformers and documentation
|
|
24
|
-
should be added here
|
|
26
|
+
should be added here
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/SOURCES.txt
RENAMED
|
@@ -35,19 +35,20 @@ src/data_processing/data_access/data_access_local.py
|
|
|
35
35
|
src/data_processing/data_access/data_access_s3.py
|
|
36
36
|
src/data_processing/runtime/__init__.py
|
|
37
37
|
src/data_processing/runtime/execution_configuration.py
|
|
38
|
+
src/data_processing/runtime/runtime_configuration.py
|
|
38
39
|
src/data_processing/runtime/transform_launcher.py
|
|
40
|
+
src/data_processing/runtime/transform_table_processor.py
|
|
39
41
|
src/data_processing/runtime/pure_python/__init__.py
|
|
40
|
-
src/data_processing/runtime/pure_python/
|
|
42
|
+
src/data_processing/runtime/pure_python/runtime_configuration.py
|
|
41
43
|
src/data_processing/runtime/pure_python/transform_launcher.py
|
|
42
44
|
src/data_processing/runtime/pure_python/transform_orchestrator.py
|
|
43
45
|
src/data_processing/runtime/pure_python/transform_table_processor.py
|
|
44
46
|
src/data_processing/runtime/ray/__init__.py
|
|
47
|
+
src/data_processing/runtime/ray/execution_configuration.py
|
|
45
48
|
src/data_processing/runtime/ray/ray_utils.py
|
|
46
|
-
src/data_processing/runtime/ray/
|
|
47
|
-
src/data_processing/runtime/ray/transform_launch_configuration.py
|
|
49
|
+
src/data_processing/runtime/ray/runtime_configuration.py
|
|
48
50
|
src/data_processing/runtime/ray/transform_launcher.py
|
|
49
51
|
src/data_processing/runtime/ray/transform_orchestrator.py
|
|
50
|
-
src/data_processing/runtime/ray/transform_orchestrator_configuration.py
|
|
51
52
|
src/data_processing/runtime/ray/transform_runtime.py
|
|
52
53
|
src/data_processing/runtime/ray/transform_statistics.py
|
|
53
54
|
src/data_processing/runtime/ray/transform_table_processor.py
|
|
@@ -88,8 +89,10 @@ test/data_processing_tests/data_access/data_access_local_test.py
|
|
|
88
89
|
test/data_processing_tests/data_access/data_access_s3_test.py
|
|
89
90
|
test/data_processing_tests/data_access/sample_input_data_test.py
|
|
90
91
|
test/data_processing_tests/launch/pure_python/launcher_test.py
|
|
92
|
+
test/data_processing_tests/launch/pure_python/multi_launcher_test.py
|
|
91
93
|
test/data_processing_tests/launch/pure_python/test_noop_launch.py
|
|
92
94
|
test/data_processing_tests/launch/ray/launcher_test.py
|
|
95
|
+
test/data_processing_tests/launch/ray/multi_launcher_test.py
|
|
93
96
|
test/data_processing_tests/launch/ray/ray_util_test.py
|
|
94
97
|
test/data_processing_tests/launch/ray/test_noop_launch.py
|
|
95
98
|
test/data_processing_tests/transform/test_noop.py
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
from data_processing.runtime.execution_configuration import TransformExecutionConfiguration
|
|
2
|
+
from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
|
|
3
|
+
from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_luncher
|
|
4
|
+
from data_processing.runtime.transform_table_processor import AbstractTransformTableProcessor
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from data_processing.runtime.pure_python.
|
|
1
|
+
from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
|
|
2
2
|
from data_processing.runtime.pure_python.transform_table_processor import TransformTableProcessor
|
|
3
3
|
from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
|
|
4
4
|
from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
|
+
from data_processing.transform import TransformConfiguration
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
|
|
18
|
+
def __init__(self, transform_config: TransformConfiguration):
|
|
19
|
+
"""
|
|
20
|
+
Initialization
|
|
21
|
+
:param transform_config - base configuration class
|
|
22
|
+
"""
|
|
23
|
+
self.transform_config = transform_config
|
|
24
|
+
super().__init__(transform_config=transform_config)
|
|
@@ -15,9 +15,11 @@ import time
|
|
|
15
15
|
|
|
16
16
|
from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
|
|
17
17
|
from data_processing.runtime import TransformExecutionConfiguration
|
|
18
|
-
from data_processing.runtime.pure_python import
|
|
18
|
+
from data_processing.runtime.pure_python import (
|
|
19
|
+
PythonTransformRuntimeConfiguration,
|
|
20
|
+
orchestrate,
|
|
21
|
+
)
|
|
19
22
|
from data_processing.runtime.transform_launcher import AbstractTransformLauncher
|
|
20
|
-
from data_processing.transform import TransformConfiguration
|
|
21
23
|
from data_processing.utils import get_logger
|
|
22
24
|
|
|
23
25
|
|
|
@@ -31,18 +33,16 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
31
33
|
|
|
32
34
|
def __init__(
|
|
33
35
|
self,
|
|
34
|
-
|
|
35
|
-
transform_config: TransformConfiguration,
|
|
36
|
+
runtime_config: PythonTransformRuntimeConfiguration,
|
|
36
37
|
data_access_factory: DataAccessFactoryBase = DataAccessFactory(),
|
|
37
38
|
):
|
|
38
39
|
"""
|
|
39
40
|
Creates driver
|
|
40
|
-
:param
|
|
41
|
+
:param runtime_config: transform runtime factory
|
|
41
42
|
:param data_access_factory: the factory to create DataAccess instances.
|
|
42
43
|
"""
|
|
43
|
-
super().__init__(
|
|
44
|
-
self.
|
|
45
|
-
self.execution_config = TransformExecutionConfiguration(name=self.transform_runtime_config.get_name())
|
|
44
|
+
super().__init__(runtime_config, data_access_factory)
|
|
45
|
+
self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
|
|
46
46
|
|
|
47
47
|
def __get_parameters(self) -> bool:
|
|
48
48
|
"""
|
|
@@ -57,12 +57,12 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
57
57
|
formatter_class=argparse.RawTextHelpFormatter,
|
|
58
58
|
)
|
|
59
59
|
# add additional arguments
|
|
60
|
-
self.
|
|
60
|
+
self.runtime_config.add_input_params(parser=parser)
|
|
61
61
|
self.data_access_factory.add_input_params(parser=parser)
|
|
62
62
|
self.execution_config.add_input_params(parser=parser)
|
|
63
63
|
args = parser.parse_args()
|
|
64
64
|
return (
|
|
65
|
-
self.
|
|
65
|
+
self.runtime_config.apply_input_params(args=args)
|
|
66
66
|
and self.execution_config.apply_input_params(args=args)
|
|
67
67
|
and self.data_access_factory.apply_input_params(args=args)
|
|
68
68
|
)
|
|
@@ -78,7 +78,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
78
78
|
logger.debug("Starting orchestrator")
|
|
79
79
|
res = orchestrate(
|
|
80
80
|
data_access_factory=self.data_access_factory,
|
|
81
|
-
|
|
81
|
+
runtime_config=self.runtime_config,
|
|
82
82
|
execution_config=self.execution_config,
|
|
83
83
|
)
|
|
84
84
|
logger.debug("Completed orchestrator")
|