data-prep-toolkit 0.0.1.dev12__tar.gz → 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107) hide show
  1. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/Makefile +6 -5
  2. {data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.1.0}/PKG-INFO +1 -1
  3. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/advanced-transform-tutorial.md +31 -14
  4. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/architecture.md +8 -4
  5. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/python-launcher-options.md +3 -2
  6. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/ray-runtime.md +3 -3
  7. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/simplest-transform-tutorial.md +47 -37
  8. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/testing-e2e-transform.md +2 -1
  9. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-external-resources.md +1 -0
  10. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-standalone-testing.md +1 -0
  11. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-tutorials.md +4 -1
  12. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transformer-utilities.md +3 -1
  13. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/pyproject.toml +1 -1
  14. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
  15. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/SOURCES.txt +7 -4
  16. data_prep_toolkit-0.1.0/src/data_processing/runtime/__init__.py +4 -0
  17. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/__init__.py +1 -1
  18. data_prep_toolkit-0.1.0/src/data_processing/runtime/pure_python/runtime_configuration.py +24 -0
  19. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/transform_launcher.py +11 -11
  20. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/pure_python/transform_orchestrator.py +11 -10
  21. data_prep_toolkit-0.1.0/src/data_processing/runtime/pure_python/transform_table_processor.py +53 -0
  22. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/__init__.py +4 -3
  23. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_orchestrator_configuration.py → data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/execution_configuration.py +1 -1
  24. data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/runtime_configuration.py +38 -0
  25. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_launcher.py +13 -21
  26. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_orchestrator.py +8 -8
  27. data_prep_toolkit-0.1.0/src/data_processing/runtime/ray/transform_table_processor.py +46 -0
  28. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/python_launcher_configuration.py → data_prep_toolkit-0.1.0/src/data_processing/runtime/runtime_configuration.py +14 -47
  29. data_prep_toolkit-0.1.0/src/data_processing/runtime/transform_launcher.py +79 -0
  30. data_prep_toolkit-0.1.0/src/data_processing/runtime/transform_table_processor.py +176 -0
  31. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/__init__.py +2 -1
  32. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/noop_transform.py +34 -29
  33. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/__init__.py +1 -1
  34. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/transform_configuration.py +34 -19
  35. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/transform_utils.py +3 -7
  36. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/pure_python/launcher_test.py +3 -12
  37. data_prep_toolkit-0.1.0/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +78 -0
  38. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +7 -9
  39. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/launcher_test.py +16 -23
  40. data_prep_toolkit-0.1.0/test/data_processing_tests/launch/ray/multi_launcher_test.py +80 -0
  41. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -1
  42. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/__init__.py +0 -2
  43. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/transform_table_processor.py +0 -191
  44. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_configuration.py +0 -33
  45. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_launch_configuration.py +0 -44
  46. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_table_processor.py +0 -191
  47. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/transform_launcher.py +0 -25
  48. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/.gitignore +0 -0
  49. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/README.md +0 -0
  50. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/overview.md +0 -0
  51. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/processing-architecture.jpg +0 -0
  52. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/python-runtime.md +0 -0
  53. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/ray-launcher-options.md +0 -0
  54. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-runtimes.md +0 -0
  55. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-s3-testing.md +0 -0
  56. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-testing.md +0 -0
  57. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/doc/transform-tutorial-examples.md +0 -0
  58. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/setup.cfg +0 -0
  59. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  60. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  61. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  62. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/__init__.py +0 -0
  63. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/__init__.py +0 -0
  64. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/arrow_s3.py +0 -0
  65. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access.py +0 -0
  66. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_factory.py +0 -0
  67. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  68. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_local.py +0 -0
  69. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/data_access/data_access_s3.py +0 -0
  70. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/execution_configuration.py +0 -0
  71. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/ray_utils.py +0 -0
  72. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_runtime.py +0 -0
  73. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/runtime/ray/transform_statistics.py +0 -0
  74. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/__init__.py +0 -0
  75. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/abstract_test.py +0 -0
  76. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/data_access/__init__.py +0 -0
  77. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  78. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/launch/__init__.py +0 -0
  79. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/launch/transform_test.py +0 -0
  80. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/test_support/transform/transform_test.py +0 -0
  81. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/table_transform.py +0 -0
  82. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/transform/transform_statistics.py +0 -0
  83. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/__init__.py +0 -0
  84. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/cli_utils.py +0 -0
  85. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/config.py +0 -0
  86. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/log.py +0 -0
  87. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/src/data_processing/utils/params_utils.py +0 -0
  88. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  89. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  90. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  91. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  92. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/launch/ray/ray_util_test.py +0 -0
  93. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/transform/test_noop.py +0 -0
  94. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  95. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  96. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  97. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  98. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  99. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input/sample1.parquet +0 -0
  100. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  101. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  102. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  103. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/metadata.json +0 -0
  104. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
  105. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
  106. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
  107. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.0}/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
@@ -53,10 +53,11 @@ venv:: pyproject.toml
53
53
  # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
54
54
  test::
55
55
  @# Help: Use the already-built virtual environment to run pytest on the test directory.
56
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
57
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
58
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python;
56
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
57
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
58
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
59
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
59
60
  source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
60
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
61
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
61
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
62
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
62
63
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.0.1.dev12
3
+ Version: 0.1.0
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -13,6 +13,7 @@ removes duplicate documents across all files. In this tutorial, we will show the
13
13
  the operation of our _noop_ transform.
14
14
 
15
15
  The complete task involves the following:
16
+
16
17
  * EdedupTransform - class that implements the specific transformation
17
18
  * EdedupRuntime - class that implements custom TransformRuntime to create supporting Ray objects and enhance job output
18
19
  statistics
@@ -39,6 +40,7 @@ First, let's define the transform class. To do this we extend
39
40
  the base abstract/interface class
40
41
  [AbstractTableTransform](../src/data_processing/transform/table_transform.py),
41
42
  which requires definition of the following:
43
+
42
44
  * an initializer (i.e. `init()`) that accepts a dictionary of configuration
43
45
  data. For this example, the configuration data will only be defined by
44
46
  command line arguments (defined below).
@@ -56,15 +58,17 @@ from typing import Any
56
58
 
57
59
  import pyarrow as pa
58
60
  import ray
59
- from data_processing.data_access import DataAccessFactory
61
+ from data_processing.data_access import DataAccessFactoryBase
60
62
  from data_processing.runtime.ray import (
61
- RayLauncherConfiguration,
62
63
  DefaultTableTransformRuntimeRay,
63
- RayUtils,
64
64
  RayTransformLauncher,
65
+ RayUtils,
66
+ )
67
+ from data_processing.runtime.ray.runtime_configuration import (
68
+ RayTransformRuntimeConfiguration,
65
69
  )
66
- from data_processing.transform import AbstractTableTransform
67
- from data_processing.utils import GB, TransformUtils
70
+ from data_processing.transform import AbstractTableTransform, TransformConfiguration
71
+ from data_processing.utils import GB, CLIArgumentProvider, TransformUtils, get_logger
68
72
  from ray.actor import ActorHandle
69
73
 
70
74
 
@@ -138,6 +142,7 @@ First, let's define the transform runtime class. To do this we extend
138
142
  the base abstract/interface class
139
143
  [DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py),
140
144
  which requires definition of the following:
145
+
141
146
  * an initializer (i.e. `init()`) that accepts a dictionary of configuration
142
147
  data. For this example, the configuration data will only be defined by
143
148
  command line arguments (defined below).
@@ -202,8 +207,10 @@ collected by hash actors and custom computations based on statistics data.
202
207
 
203
208
  ## EdedupTableTransformConfiguration
204
209
 
205
- The final class we need to implement is `EdedupTableTransformConfiguration` class and its initializer that
206
- define the following:
210
+ The final class we need to implement is `EdedupRayTransformConfiguration` class that provides configuration for
211
+ running our transform. Although we provide only Ray-based implementation, Ray-based configuration relies on Python-based
212
+ configuration that we need to define first. So we first need to define `EdedupTableTransformConfiguration` class,
213
+ defining the following:
207
214
 
208
215
  * The short name for the transform
209
216
  * The class implementing the transform - in our case EdedupTransform
@@ -216,10 +223,12 @@ First we define the class and its initializer,
216
223
  short_name = "ededup"
217
224
  cli_prefix = f"{short_name}_"
218
225
 
219
- class EdedupTableTransformConfiguration(DefaultTableTransformConfiguration):
220
- def __init__(self):
221
- super().__init__(name=short_name, runtime_class=EdedupRuntime, transform_class=EdedupTransform)
222
- self.params = {}
226
+ class EdedupTableTransformConfiguration(TransformConfiguration):
227
+ def __init__(self):
228
+ super().__init__(
229
+ name=short_name,
230
+ transform_class=EdedupTransform,
231
+ )
223
232
  ```
224
233
 
225
234
  The initializer extends the DefaultTableTransformConfiguration which provides simple
@@ -253,6 +262,13 @@ and which allows us to capture the `EdedupTransform`-specific arguments and opti
253
262
  logger.info(f"exact dedup params are {self.params}")
254
263
  return True
255
264
  ```
265
+ Now we can implement `EdedupRayTransformConfiguration` with the following code:
266
+ ```python
267
+ class EdedupRayTransformConfiguration(RayTransformConfiguration):
268
+ def __init__(self):
269
+ super().__init__(transform_config=EdedupTableTransformConfiguration(), runtime_class=EdedupRuntime)
270
+
271
+ ```
256
272
 
257
273
  ## main()
258
274
 
@@ -261,11 +277,12 @@ framework's `TransformLauncher` class.
261
277
 
262
278
  ```python
263
279
  if __name__ == "__main__":
264
- launcher = TransformLauncher(transform_runtime_config=EdedupTransformConfiguration())
265
- launcher.launch()
280
+ launcher = RayTransformLauncher(EdedupRayTransformConfiguration())
281
+ launcher.launch()
266
282
  ```
283
+
267
284
  The launcher requires only an instance of DefaultTableTransformConfiguration
268
- (our `EdedupTransformConfiguration` class).
285
+ (our `EdedupRayTransformConfiguration` class).
269
286
  A single method `launch()` is then invoked to run the transform in a Ray cluster.
270
287
 
271
288
  ## Running
@@ -16,7 +16,7 @@ The architecture includes the following core components:
16
16
  * [RayLauncher](../src/data_processing/runtime/ray/transform_launcher.py) accepts and validates
17
17
  CLI parameters to establish the Ray Orchestrator with the proper configuration.
18
18
  It uses the following components, all of which can/do define CLI configuration parameters.:
19
- * [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) is responsible
19
+ * [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/execution_configuration.py) is responsible
20
20
  for defining and validating infrastructure parameters
21
21
  (e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
22
22
  (several dictionaries) and is fully pickleable. As a result framework uses its instance as a
@@ -32,20 +32,22 @@ It uses the following components, all of which can/do define CLI configuration p
32
32
  After all parameters are validated, the ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguration
33
33
  and TransformConfiguration are given to the Ray Orchestrator, via Ray remote() method invocation.
34
34
  The Launcher waits for the Ray Orchestrator to complete.
35
- * [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
35
+
36
+ * [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
36
37
  the data processing job. It creates the actors, determines the set of input data and distributes the
37
38
  references to the data files to be processed by the workers. More specifically, it performs the following:
39
+
38
40
  1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files
39
41
  to be processed.
40
42
  2. uses the TransformConfiguration to create the TransformRuntime instance
41
43
  3. Uses the TransformRuntime to optionally apply additional configuration (ray object storage, etc) for the configuration
42
44
  and operation of the Transform.
43
- 3. uses the TransformOrchestratorConfiguration to determine the set of RayWorkers to create
45
+ 4. uses the TransformOrchestratorConfiguration to determine the set of RayWorkers to create
44
46
  to execute transformers in parallel, providing the following to each worker:
45
47
  * Ray worker configuration
46
48
  * DataAccessFactory
47
49
  * Transform class and its TransformConfiguration containing the CLI parameters and any TransformRuntime additions.
48
- 4. in a load-balanced, round-robin fashion, distributes the names of the input files to the workers for them to transform/process.
50
+ 5. in a load-balanced, round-robin fashion, distributes the names of the input files to the workers for them to transform/process.
49
51
 
50
52
  Additionally, to provide monitoring of long-running transforms, the orchestrator is instrumented with
51
53
  [custom metrics](https://docs.ray.io/en/latest/ray-observability/user-guides/add-app-metrics.html), that are exported to localhost:8080 (this is the endpoint that
@@ -53,11 +55,13 @@ It uses the following components, all of which can/do define CLI configuration p
53
55
  Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor)
54
56
  and build and save it in the form of execution metadata (`metadata.json`). Finally, it will return the execution
55
57
  result to the Launcher.
58
+
56
59
  * [Ray worker](../src/data_processing/runtime/ray/transform_table_processor.py) is responsible for
57
60
  reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
58
61
  assigned by the orchestrator, applying the transform to the input table and writing out the
59
62
  resulting table(s). Metadata produced by each table transformation is aggregated into
60
63
  Transform Statistics (below).
64
+
61
65
  * [Transform Statistics](../src/data_processing/runtime/ray/transform_statistics.py) is a general
62
66
  purpose data collector actor aggregating the numeric metadata from different places of
63
67
  the framework (especially metadata produced by the transform).
@@ -1,5 +1,6 @@
1
- # Ray Launcher Command Line Options
2
- A number of command line options are available when launching a transform.
1
+ # Pure Python Launcher Command Line Options
2
+
3
+ A number of command line options are available when launching a transform as a Python class.
3
4
 
4
5
  The following is a current --help output (a work in progress) for
5
6
  the `NOOPTransform` (note the --noop_sleep_sec option):
@@ -5,7 +5,7 @@ The Ray runtime includes the following set of components:
5
5
  class generally used to implement `main()` that makes use of a `TransformConfiguration` to
6
6
  start the Ray runtime and execute the transform over the specified set of input files.
7
7
  The RayTransformLauncher is created using a `RayTransformConfiguration` instance.
8
- * [RayTransformConfiguration](../src/data_processing/runtime/ray/transform_configuration.py) - this
8
+ * [RayTransformConfiguration](../src/data_processing/runtime/ray/runtime_configuration.py) - this
9
9
  class extends transform's base TransformConfiguration implementation to add an optional
10
10
  `TransformRuntime` (see next) class to be used by the transform implementation.
11
11
  * [TransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py) -
@@ -40,7 +40,7 @@ launcher = RayTransformLauncher(YourTransformConfiguration())
40
40
  launcher.launch()
41
41
  ```
42
42
  Note that the launcher defines some additional CLI parameters that are used to control the operation of the
43
- [orchestrator and workers](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) and
43
+ [orchestrator and workers](../src/data_processing/runtime/ray/execution_configuration.py) and
44
44
  [data access](../src/data_processing/data_access/data_access_factory.py). Things such as data access configuration,
45
45
  number of workers, worker resources, etc.
46
46
  Discussion of these options is beyond the scope of this document
@@ -51,7 +51,7 @@ In general, a transform should be able to run in both the python and Ray runtime
51
51
  As such we first define the python-only transform configuration, which will then
52
52
  be used by the Ray-runtime-specific transform configuration.
53
53
  The python transform configuration implements
54
- [TransformConfiguration](../src/data_processing/transform/transform_configuration.py)
54
+ [TransformConfiguration](../src/data_processing/runtime/runtime_configuration.py)
55
55
  and defines the transform-specific name, and implementation
56
56
  and class. In addition, it is responsible for providing transform-specific
57
57
  methods to define and capture optional command line arguments.
@@ -15,33 +15,38 @@ one table to another. That said, we will show the following:
15
15
  the operation of our _noop_ transform.
16
16
 
17
17
  We will **not** be showing the following:
18
- * The creation of a custom TransformRuntime that would enable more global
19
- state and/or coordination among the transforms running in other ray actors.
18
+ * The creation of a custom `TransformRuntime` that would enable more global
19
+ state and/or coordination among the transforms running in other Ray actors.
20
20
  This will be covered in an advanced tutorial.
21
21
 
22
22
  The complete task involves the following:
23
- * NOOPTransform - class that implements the specific transformation
24
- * NOOPTableTransformConfiguration - class that provides configuration for the
25
- NOOPTransform, specifically the command line arguments used to configure it.
26
- * main() - simple creation and use of the TransformLauncher.
23
+
24
+ * `NOOPTransform` - class that implements the specific transformation
25
+ * `NOOPTableTransformConfiguration` - class that provides configuration for the
26
+ `NOOPTransform`, specifically the command line arguments used to configure it.
27
+ * `main()` - simple creation and use of the `TransformLauncher`.
27
28
 
28
29
  (Currently, the complete code for the noop transform used for this
29
30
  tutorial can be found in the
30
31
  [noop transform](../../transforms/universal/noop) directory.)
31
32
 
32
- Finally, we show to use the command line to run the transform in a local ray cluster
33
+ Finally, we show how to use the command line to run the transform in a local ray cluster.
34
+
35
+ > **Note:** You will need to run the setup commands in the [`../README`](..) before running the following examples.
36
+
33
37
 
34
- ## NOOPTransform
38
+ ## `NOOPTransform`
35
39
 
36
40
  First, let's define the transform class. To do this we extend
37
41
  the base abstract/interface class
38
- [AbstractTableTransform](../src/data_processing_ibm/transform/table_transform.py),
42
+ [`AbstractTableTransform`](../src/data_processing_ibm/transform/table_transform.py),
39
43
  which requires definition of the following:
44
+
40
45
  * an initializer (i.e. `init()`) that accepts a dictionary of configuration
41
46
  data. For this example, the configuration data will only be defined by
42
47
  command line arguments (defined below).
43
- * the `transform()` method itself that takes an input table produces an output
44
- table and any associated metadata for that table transformation.
48
+ * the `transform()` method itself that takes an input table and produces an output
49
+ table with any associated metadata for that table transformation.
45
50
 
46
51
  Other methods such as `flush()` need not be overridden/redefined for this simple example.
47
52
 
@@ -54,18 +59,18 @@ from argparse import ArgumentParser, Namespace
54
59
  from typing import Any
55
60
 
56
61
  import pyarrow as pa
57
- from data_processing.runtime.ray import (
58
- RayLauncherConfiguration,
59
- DefaultTableTransformRuntimeRay,
60
- RayTransformLauncher,
62
+ from data_processing.runtime.ray import RayTransformLauncher
63
+ from data_processing.runtime.ray.runtime_configuration import (
64
+ RayTransformRuntimeConfiguration,
61
65
  )
62
- from data_processing.transform import AbstractTableTransform
66
+ from data_processing.transform import AbstractTableTransform, TransformConfiguration
67
+ from data_processing.utils import CLIArgumentProvider, get_logger
63
68
 
64
69
 
65
70
  class NOOPTransform(AbstractTableTransform):
66
71
 
67
- def __init__(self, config: dict[str, Any]):
68
- self.sleep = config.get("sleep", 1)
72
+ def __init__(self, config: dict[str, Any]):
73
+ self.sleep = config.get("sleep", 1)
69
74
  ```
70
75
  The `NOOPTransform` class extends the `AbstractTableTransform`, which defines the required methods.
71
76
 
@@ -75,7 +80,7 @@ with an amount of seconds to sleep/delay during the call to `transform()`.
75
80
  Configuration is provided by the framework in a dictionary provided to the initializer.
76
81
  Below we will cover how this `sleep` argument is made available to the initializer.
77
82
 
78
- Note that in more complex transforms that might, for example, load a hugging face or other model,
83
+ Note that in more complex transforms that might, for example, load a Hugging Face or other model,
79
84
  or perform other deep initializations, these can be done in the initializer.
80
85
 
81
86
  Next we define the `transform()` method itself, which includes the addition of some
@@ -90,19 +95,18 @@ almost trivial metadata.
90
95
  return [table], metadata
91
96
  ```
92
97
  The single input to this method is the in-memory pyarrow table to be transformed.
93
- The return of this function is a list of tables and optional metadata. In this
94
- case of simple 1:1 table conversion the list will contain a single table, the input.
98
+ The return value of this method is a list of tables and optional metadata. In this
99
+ case, we are doing a simple 1:1 table conversion, so the list will contain a single table, the input table.
95
100
  The metadata is a free-form dictionary of keys with numeric values that will be aggregated
96
101
  by the framework and reported as aggregated job statistics metadata.
97
102
  If there is no metadata then simply return an empty dictionary.
98
103
 
99
- ## NOOPTransformConfiguration
104
+ ## `NOOPTransformConfiguration`
100
105
 
101
- Next we define the `NOOPTransformConfiguration` and
102
- classes and there initializer that define the following:
106
+ Next we define the `NOOPTransformConfiguration` class and its initializer that defines the following:
103
107
 
104
108
  * The short name for the transform
105
- * The class implementing the transform - in our case NOOPTransform
109
+ * The class implementing the transform - in our case `NOOPTransform`
106
110
  * Command line argument support.
107
111
 
108
112
  We also define the `NOOPRayTransformationConfiguration` so we can run the transform
@@ -125,18 +129,20 @@ class NOOPTransformConfiguration(TransformConfiguration):
125
129
  remove_from_metadata=[pwd_key],
126
130
  )
127
131
  ```
128
- The initializer extends the TransformConfiguration which provides simple
129
- capture of our configuration data and enables picklability through the network.
132
+
133
+ The initializer extends the `TransformConfiguration` that provides simple
134
+ capture of our configuration data and enables the ability to pickle through the network.
130
135
  It also adds a `params` field that will be used below to hold the transform's
131
136
  configuration data (used in `NOOPTransform.__init__()` above).
132
137
 
133
138
  Next, we provide two methods that define and capture the command line configuration that
134
- is specific to the `NOOPTransform`, in this case the number of seconds to sleep during transformation
135
- and an example command line, `pwd`, option holding sensitive data that we don't want reported
136
- in the job metadata produced by the ray orchestrator.
137
- First we define the method establishes the command line arguments.
138
- This method is given a global argument parser to which the `NOOPTransform` arguments are added.
139
- It is good practice to include a common prefix to all transform-specific options (i.e. pii, lang, etc).
139
+ is specific to the `NOOPTransform`, in this case the parameters are the number of seconds to sleep during transformation
140
+ and an example command line parameter, `pwd` ("password"), option holding sensitive data that we don't want reported
141
+ in the job metadata produced by the Ray orchestrator.
142
+
143
+ The first method establishes the command line arguments.
144
+ It is given a global argument parser to which the `NOOPTransform` arguments are added.
145
+ It is a good practice to include a common prefix to all transform-specific options (e.g. pii, lang, etc.).
140
146
  In our case we will use `noop_`.
141
147
 
142
148
  ```python
@@ -159,6 +165,7 @@ In our case we will use `noop_`.
159
165
  ```
160
166
  Next we implement a method that is called after the CLI args are parsed (usually by one
161
167
  of the runtimes) and which allows us to capture the `NOOPTransform`-specific arguments.
168
+
162
169
 
163
170
  ```python
164
171
 
@@ -176,13 +183,16 @@ To run the transform on a set of input data, we use one of the runtimes, each de
176
183
  ### Python Runtime
177
184
  To run in the python runtime, we need to create the instance of `PythonTransformLauncher`
178
185
  using the `NOOPTransformConfiguration`, and launch it as follows:
186
+
179
187
  ```python
180
188
  if __name__ == "__main__":
181
189
  launcher = PythonTransformLauncher(transform_config=NOOPTransformConfiguration())
182
190
  launcher.launch()
183
191
  ```
184
192
 
185
- To run this on some test data, we'll use data in the repo for the noop transform
193
+ ## Running
194
+
195
+ Assuming the above `main` code is placed in `noop_main.py` we can run the transform on some test data. We'll use data in the repo for the noop transform
186
196
  and create a temporary directory to hold the output:
187
197
  ```shell
188
198
  export DPK_REPOROOT=...
@@ -191,9 +201,9 @@ export NOOP_INPUT=$DPK_REPOROOT/transforms/universal/noop/test-data/input
191
201
  To run
192
202
  ```shell
193
203
  python noop_main.py --noop_sleep_msec 2 \
194
- --data_local_config "{'input_folder': '"$NOOP_INPUT"', 'output_folder': '/tmp/noop-output'}"
204
+ --data_local_config "{'input_folder': '"$NOOP_INPUT"', 'output_folder': '/tmp/noop-output'}"
195
205
  ```
196
- See the [python launcher options](python-launcher-options) for a complete list of
206
+ See the [python launcher options](python-launcher-options.md) for a complete list of
197
207
  transform-independent command line options.
198
208
 
199
209
  ### Ray Runtime
@@ -207,5 +217,5 @@ if __name__ == "__main__":
207
217
  ```
208
218
  We can run this with the same command as for the python runtime but to run in local Ray
209
219
  add the `--run_locally True` option.
210
- See the [ray launcher options](ray-launcher-options) for a complete list of
220
+ See the [ray launcher options](ray-launcher-options.md) for a complete list of
211
221
  transform-independent command line options.
@@ -1,4 +1,5 @@
1
1
  # Testing End-to-End Transform operation
2
2
  WIP - Points to discuss
3
+
3
4
  1. Reading input files and writing output files.
4
- 2. Testing of the transform runtime and use of ray components in the transform
5
+ 2. Testing of the transform runtime and use of ray components in the transform
@@ -8,6 +8,7 @@ In addition to actually loading the resource(s), the transform needs to define t
8
8
  defines the location of the domain list.
9
9
 
10
10
  In the next sections we cover the following:
11
+
11
12
  1. How to define the transform-specific resource location(s) as command line arguments
12
13
  2. How to load the transform-specific resources, either or both of:
13
14
  1. During transform initialization - this is useful for testing outside of ray, and optionally
@@ -15,6 +15,7 @@ transform implementation tests will easily leverage.
15
15
 
16
16
  The first (and currently only) test is the `test_transform()` method that takes the
17
17
  following inputs:
18
+
18
19
  * the transform implementation being tested, properly configured with the configuration
19
20
  dictionary for the associated test data.
20
21
  * a list of N (1 or more) input tables to be processed with the transform's `transform(Table)` method.
@@ -41,19 +41,22 @@ The return values are handled the same waa as the return values for `transform()
41
41
  not need this feature, a default implementation is provided to return an empty list and empty dictionary.
42
42
 
43
43
  #### TransformConfiguration class
44
- The [TransformConfiguration](../src/data_processing/transform/transform_configuration.py)
44
+ The [TransformConfiguration](../src/data_processing/runtime/runtime_configuration.py)
45
45
  serves as an interface and must be implemented by any `AbstractTableTransform`
46
46
  implementation to provide the following configuration:
47
+
47
48
  * the transform class to be used,
48
49
  * command line arguments used to initialize the Transform Runtime and generally, the Transform.
49
50
  * Transform Runtime class to use
50
51
  * transform short name
52
+
51
53
  It is expected that transforms are initialized with a fixed name, the class of its corresponding
52
54
  `AbstractTableTransform` implementation and optionally the configuration keys that should not
53
55
  be exposed as metadata for a run.
54
56
  To support command line configuration, the `TransformConfiguration` extends the
55
57
  [CLIArgumentProvider](../src/data_processing/utils/cli_utils.py) class.
56
58
  The set of methods of interest are
59
+
57
60
  * ```__init__(self, name:str, transform_class:type[AbstractTableTransform], list[str]:remove_from_metadata )``` - sets the required fields
58
61
  * ```add_input_params(self, parser:ArgumentParser)``` - adds transform-specific command line options that will
59
62
  be made available in the dictionary provided to the transform's initializer.
@@ -2,6 +2,7 @@
2
2
 
3
3
  A class [TransformUtils](../src/data_processing/utils/transform_utils.py) provides several methods that simplify
4
4
  transformer's implementation. Currently it includes the following methods:
5
+
5
6
  * `deep_get_size` is the method to get the complete size of the Python object based on
6
7
  https://www.askpython.com/python/built-in-methods/variables-memory-size-in-python
7
8
  It supports Python structures: list, tuple and set
@@ -17,8 +18,9 @@ be removed before it is added
17
18
  removes URL encoding
18
19
 
19
20
  It also contains two variables:
21
+
20
22
  * `RANDOM_SEED` number that is used for methods that require seed
21
23
  * `LOCAL_TO_DISK` rough local size to size on disk/S3
22
24
 
23
25
  This class should be extended with additional methods, generally useful across multiple transformers and documentation
24
- should be added here
26
+ should be added here
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit"
3
- version = "0.0.1-dev12"
3
+ version = "0.1.0"
4
4
  requires-python = ">=3.10"
5
5
  description = "Data Preparation Toolkit Library"
6
6
  license = {text = "Apache-2.0"}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.0.1.dev12
3
+ Version: 0.1.0
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -35,19 +35,20 @@ src/data_processing/data_access/data_access_local.py
35
35
  src/data_processing/data_access/data_access_s3.py
36
36
  src/data_processing/runtime/__init__.py
37
37
  src/data_processing/runtime/execution_configuration.py
38
+ src/data_processing/runtime/runtime_configuration.py
38
39
  src/data_processing/runtime/transform_launcher.py
40
+ src/data_processing/runtime/transform_table_processor.py
39
41
  src/data_processing/runtime/pure_python/__init__.py
40
- src/data_processing/runtime/pure_python/python_launcher_configuration.py
42
+ src/data_processing/runtime/pure_python/runtime_configuration.py
41
43
  src/data_processing/runtime/pure_python/transform_launcher.py
42
44
  src/data_processing/runtime/pure_python/transform_orchestrator.py
43
45
  src/data_processing/runtime/pure_python/transform_table_processor.py
44
46
  src/data_processing/runtime/ray/__init__.py
47
+ src/data_processing/runtime/ray/execution_configuration.py
45
48
  src/data_processing/runtime/ray/ray_utils.py
46
- src/data_processing/runtime/ray/transform_configuration.py
47
- src/data_processing/runtime/ray/transform_launch_configuration.py
49
+ src/data_processing/runtime/ray/runtime_configuration.py
48
50
  src/data_processing/runtime/ray/transform_launcher.py
49
51
  src/data_processing/runtime/ray/transform_orchestrator.py
50
- src/data_processing/runtime/ray/transform_orchestrator_configuration.py
51
52
  src/data_processing/runtime/ray/transform_runtime.py
52
53
  src/data_processing/runtime/ray/transform_statistics.py
53
54
  src/data_processing/runtime/ray/transform_table_processor.py
@@ -88,8 +89,10 @@ test/data_processing_tests/data_access/data_access_local_test.py
88
89
  test/data_processing_tests/data_access/data_access_s3_test.py
89
90
  test/data_processing_tests/data_access/sample_input_data_test.py
90
91
  test/data_processing_tests/launch/pure_python/launcher_test.py
92
+ test/data_processing_tests/launch/pure_python/multi_launcher_test.py
91
93
  test/data_processing_tests/launch/pure_python/test_noop_launch.py
92
94
  test/data_processing_tests/launch/ray/launcher_test.py
95
+ test/data_processing_tests/launch/ray/multi_launcher_test.py
93
96
  test/data_processing_tests/launch/ray/ray_util_test.py
94
97
  test/data_processing_tests/launch/ray/test_noop_launch.py
95
98
  test/data_processing_tests/transform/test_noop.py
@@ -0,0 +1,4 @@
1
+ from data_processing.runtime.execution_configuration import TransformExecutionConfiguration
2
+ from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
3
+ from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_luncher
4
+ from data_processing.runtime.transform_table_processor import AbstractTransformTableProcessor
@@ -1,4 +1,4 @@
1
- from data_processing.runtime.pure_python.python_launcher_configuration import PythonLauncherConfiguration
1
+ from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
2
2
  from data_processing.runtime.pure_python.transform_table_processor import TransformTableProcessor
3
3
  from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
4
4
  from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
@@ -0,0 +1,24 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.runtime import TransformRuntimeConfiguration
14
+ from data_processing.transform import TransformConfiguration
15
+
16
+
17
+ class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
18
+ def __init__(self, transform_config: TransformConfiguration):
19
+ """
20
+ Initialization
21
+ :param transform_config - base configuration class
22
+ """
23
+ self.transform_config = transform_config
24
+ super().__init__(transform_config=transform_config)
@@ -15,9 +15,11 @@ import time
15
15
 
16
16
  from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
17
17
  from data_processing.runtime import TransformExecutionConfiguration
18
- from data_processing.runtime.pure_python import PythonLauncherConfiguration, orchestrate
18
+ from data_processing.runtime.pure_python import (
19
+ PythonTransformRuntimeConfiguration,
20
+ orchestrate,
21
+ )
19
22
  from data_processing.runtime.transform_launcher import AbstractTransformLauncher
20
- from data_processing.transform import TransformConfiguration
21
23
  from data_processing.utils import get_logger
22
24
 
23
25
 
@@ -31,18 +33,16 @@ class PythonTransformLauncher(AbstractTransformLauncher):
31
33
 
32
34
  def __init__(
33
35
  self,
34
- # transform_runtime_config: PythonLauncherConfiguration,
35
- transform_config: TransformConfiguration,
36
+ runtime_config: PythonTransformRuntimeConfiguration,
36
37
  data_access_factory: DataAccessFactoryBase = DataAccessFactory(),
37
38
  ):
38
39
  """
39
40
  Creates driver
40
- :param transform_runtime_config: transform runtime factory
41
+ :param runtime_config: transform runtime factory
41
42
  :param data_access_factory: the factory to create DataAccess instances.
42
43
  """
43
- super().__init__(transform_config, data_access_factory)
44
- self.transform_runtime_config = PythonLauncherConfiguration(transform_config)
45
- self.execution_config = TransformExecutionConfiguration(name=self.transform_runtime_config.get_name())
44
+ super().__init__(runtime_config, data_access_factory)
45
+ self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
46
46
 
47
47
  def __get_parameters(self) -> bool:
48
48
  """
@@ -57,12 +57,12 @@ class PythonTransformLauncher(AbstractTransformLauncher):
57
57
  formatter_class=argparse.RawTextHelpFormatter,
58
58
  )
59
59
  # add additional arguments
60
- self.transform_runtime_config.add_input_params(parser=parser)
60
+ self.runtime_config.add_input_params(parser=parser)
61
61
  self.data_access_factory.add_input_params(parser=parser)
62
62
  self.execution_config.add_input_params(parser=parser)
63
63
  args = parser.parse_args()
64
64
  return (
65
- self.transform_runtime_config.apply_input_params(args=args)
65
+ self.runtime_config.apply_input_params(args=args)
66
66
  and self.execution_config.apply_input_params(args=args)
67
67
  and self.data_access_factory.apply_input_params(args=args)
68
68
  )
@@ -78,7 +78,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
78
78
  logger.debug("Starting orchestrator")
79
79
  res = orchestrate(
80
80
  data_access_factory=self.data_access_factory,
81
- transform_config=self.transform_runtime_config,
81
+ runtime_config=self.runtime_config,
82
82
  execution_config=self.execution_config,
83
83
  )
84
84
  logger.debug("Completed orchestrator")