data-prep-toolkit 0.0.1.dev3__tar.gz → 0.0.1.dev12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/Makefile +4 -4
- {data_prep_toolkit-0.0.1.dev3/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.0.1.dev12}/PKG-INFO +1 -1
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/advanced-transform-tutorial.md +3 -3
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/architecture.md +8 -8
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/overview.md +7 -6
- data_prep_toolkit-0.0.1.dev12/doc/python-launcher-options.md +60 -0
- data_prep_toolkit-0.0.1.dev12/doc/python-runtime.md +12 -0
- data_prep_toolkit-0.0.1.dev3/doc/launcher-options.md → data_prep_toolkit-0.0.1.dev12/doc/ray-launcher-options.md +37 -42
- data_prep_toolkit-0.0.1.dev12/doc/ray-runtime.md +143 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/simplest-transform-tutorial.md +64 -51
- data_prep_toolkit-0.0.1.dev12/doc/transform-runtimes.md +9 -0
- data_prep_toolkit-0.0.1.dev12/doc/transform-testing.md +6 -0
- data_prep_toolkit-0.0.1.dev12/doc/transform-tutorial-examples.md +15 -0
- data_prep_toolkit-0.0.1.dev12/doc/transform-tutorials.md +67 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/pyproject.toml +2 -2
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_prep_toolkit.egg-info/SOURCES.txt +35 -26
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/arrow_s3.py +5 -17
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/__init__.py +2 -0
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/__init__.py +4 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/pure_python/python_launcher_configuration.py +12 -14
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/pure_python/transform_launcher.py +10 -11
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/pure_python/transform_orchestrator.py +5 -5
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/pure_python/transform_table_processor.py +2 -1
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/__init__.py +8 -0
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_configuration.py +33 -0
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_launch_configuration.py +44 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_launcher.py +18 -17
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_orchestrator.py +2 -2
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_orchestrator_configuration.py +1 -1
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_runtime.py +0 -52
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/transform_launcher.py +25 -0
- data_prep_toolkit-0.0.1.dev12/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing/test_support/ray → data_prep_toolkit-0.0.1.dev12/src/data_processing/test_support/launch}/transform_test.py +13 -11
- data_prep_toolkit-0.0.1.dev12/src/data_processing/test_support/transform/__init__.py +5 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/transform/noop_transform.py +37 -29
- data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/__init__.py +3 -0
- data_prep_toolkit-0.0.1.dev3/src/data_processing/transform/launcher_configuration.py → data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/transform_configuration.py +31 -4
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/transform_utils.py +33 -0
- {data_prep_toolkit-0.0.1.dev3/test/data_processing_tests → data_prep_toolkit-0.0.1.dev12/test/data_processing_tests/launch}/pure_python/launcher_test.py +26 -74
- data_prep_toolkit-0.0.1.dev12/test/data_processing_tests/launch/pure_python/test_noop_launch.py +41 -0
- {data_prep_toolkit-0.0.1.dev3/test/data_processing_tests → data_prep_toolkit-0.0.1.dev12/test/data_processing_tests/launch}/ray/launcher_test.py +26 -117
- {data_prep_toolkit-0.0.1.dev3/test/data_processing_tests → data_prep_toolkit-0.0.1.dev12/test/data_processing_tests/launch}/ray/ray_util_test.py +1 -1
- {data_prep_toolkit-0.0.1.dev3/test/data_processing_tests → data_prep_toolkit-0.0.1.dev12/test/data_processing_tests/launch}/ray/test_noop_launch.py +11 -4
- data_prep_toolkit-0.0.1.dev3/doc/logo-ibm-dark.png +0 -0
- data_prep_toolkit-0.0.1.dev3/doc/logo-ibm.png +0 -0
- data_prep_toolkit-0.0.1.dev3/doc/transform-tutorials.md +0 -194
- data_prep_toolkit-0.0.1.dev3/src/data_processing/pure_python/__init__.py +0 -4
- data_prep_toolkit-0.0.1.dev3/src/data_processing/ray/__init__.py +0 -10
- data_prep_toolkit-0.0.1.dev3/src/data_processing/test_support/ray/__init__.py +0 -1
- data_prep_toolkit-0.0.1.dev3/src/data_processing/test_support/transform/__init__.py +0 -7
- data_prep_toolkit-0.0.1.dev3/src/data_processing/transform/__init__.py +0 -7
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/.gitignore +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/README.md +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/processing-architecture.jpg +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/testing-e2e-transform.md +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/transform-external-resources.md +0 -0
- /data_prep_toolkit-0.0.1.dev3/doc/using_s3_transformers.md → /data_prep_toolkit-0.0.1.dev12/doc/transform-s3-testing.md +0 -0
- /data_prep_toolkit-0.0.1.dev3/doc/testing-transforms.md → /data_prep_toolkit-0.0.1.dev12/doc/transform-standalone-testing.md +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/transformer-utilities.md +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/setup.cfg +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/data_access_factory.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/data_access_local.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/data_access/data_access_s3.py +0 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing/transform → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/execution_configuration.py +0 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/ray_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_statistics.py +0 -0
- {data_prep_toolkit-0.0.1.dev3/src/data_processing → data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime}/ray/transform_table_processor.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/abstract_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/test_support/transform/transform_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/transform/table_transform.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/transform/transform_statistics.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/log.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/src/data_processing/utils/params_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/ray/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
|
@@ -55,8 +55,8 @@ test::
|
|
|
55
55
|
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
56
56
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
57
57
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
|
58
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/pure_python;
|
|
59
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/ray_util_test.py;
|
|
60
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/launcher_test.py;
|
|
61
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/test_noop_launch.py;
|
|
58
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python;
|
|
59
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
|
|
60
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
|
|
61
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
|
|
62
62
|
|
{data_prep_toolkit-0.0.1.dev3 → data_prep_toolkit-0.0.1.dev12}/doc/advanced-transform-tutorial.md
RENAMED
|
@@ -57,7 +57,7 @@ from typing import Any
|
|
|
57
57
|
import pyarrow as pa
|
|
58
58
|
import ray
|
|
59
59
|
from data_processing.data_access import DataAccessFactory
|
|
60
|
-
from data_processing.ray import (
|
|
60
|
+
from data_processing.runtime.ray import (
|
|
61
61
|
RayLauncherConfiguration,
|
|
62
62
|
DefaultTableTransformRuntimeRay,
|
|
63
63
|
RayUtils,
|
|
@@ -136,7 +136,7 @@ If there is no metadata then simply return an empty dictionary.
|
|
|
136
136
|
|
|
137
137
|
First, let's define the transform runtime class. To do this we extend
|
|
138
138
|
the base abstract/interface class
|
|
139
|
-
[DefaultTableTransformRuntime](../src/data_processing/ray/transform_runtime.py),
|
|
139
|
+
[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py),
|
|
140
140
|
which requires definition of the following:
|
|
141
141
|
* an initializer (i.e. `init()`) that accepts a dictionary of configuration
|
|
142
142
|
data. For this example, the configuration data will only be defined by
|
|
@@ -280,5 +280,5 @@ python ededup_transform.py --hash_cpu 0.5 --num_hashes 2 --doc_column "contents"
|
|
|
280
280
|
--s3_config "{'input_folder': 'cos-optimal-llm-pile/test/david/input/', 'output_folder': 'cos-optimal-llm-pile/test/david/output/'}"
|
|
281
281
|
```
|
|
282
282
|
This is a minimal set of options to run locally.
|
|
283
|
-
See the [launcher options](launcher-options
|
|
283
|
+
See the [launcher options](ray-launcher-options) for a complete list of
|
|
284
284
|
transform-independent command line options.
|
|
@@ -13,10 +13,10 @@ process many input files in parallel using a distribute network of RayWorkers.
|
|
|
13
13
|
|
|
14
14
|
The architecture includes the following core components:
|
|
15
15
|
|
|
16
|
-
* [RayLauncher](../src/data_processing/ray/transform_launcher.py) accepts and validates
|
|
16
|
+
* [RayLauncher](../src/data_processing/runtime/ray/transform_launcher.py) accepts and validates
|
|
17
17
|
CLI parameters to establish the Ray Orchestrator with the proper configuration.
|
|
18
18
|
It uses the following components, all of which can/do define CLI configuration parameters.:
|
|
19
|
-
* [Transform Orchestrator Configuration](../src/data_processing/ray/transform_orchestrator_configuration.py) is responsible
|
|
19
|
+
* [Transform Orchestrator Configuration](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) is responsible
|
|
20
20
|
for defining and validating infrastructure parameters
|
|
21
21
|
(e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
|
|
22
22
|
(several dictionaries) and is fully pickleable. As a result framework uses its instance as a
|
|
@@ -25,14 +25,14 @@ It uses the following components, all of which can/do define CLI configuration p
|
|
|
25
25
|
configuration for the type of DataAccess to use when reading/writing the input/output data for
|
|
26
26
|
the transforms. Similar to Transform Orchestrator Configuration, this is a pickleable
|
|
27
27
|
instance that is passed between Launcher, Orchestrator and Workers.
|
|
28
|
-
* [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - defines specifics
|
|
28
|
+
* [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - defines specifics
|
|
29
29
|
of the transform implementation including transform implementation class, its short name, any transform-
|
|
30
30
|
specific CLI parameters, and an optional TransformRuntime class, discussed below.
|
|
31
31
|
|
|
32
32
|
After all parameters are validated, the ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguraiton
|
|
33
33
|
and TransformConfiguration are given to the Ray Orchestrator, via Ray remote() method invocation.
|
|
34
34
|
The Launcher waits for the Ray Orchestrator to complete.
|
|
35
|
-
* [Ray Orchestrator](../src/data_processing/ray/transform_orchestrator.py) is responsible for overall management of
|
|
35
|
+
* [Ray Orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py) is responsible for overall management of
|
|
36
36
|
the data processing job. It creates the actors, determines the set of input data and distributes the
|
|
37
37
|
references to the data files to be processed by the workers. More specifically, it performs the following:
|
|
38
38
|
1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files
|
|
@@ -53,12 +53,12 @@ It uses the following components, all of which can/do define CLI configuration p
|
|
|
53
53
|
Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor)
|
|
54
54
|
and build and save it in the form of execution metadata (`metadata.json`). Finally, it will return the execution
|
|
55
55
|
result to the Launcher.
|
|
56
|
-
* [Ray worker](../src/data_processing/ray/transform_table_processor.py) is responsible for
|
|
56
|
+
* [Ray worker](../src/data_processing/runtime/ray/transform_table_processor.py) is responsible for
|
|
57
57
|
reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
|
|
58
58
|
assigned by the orchestrator, applying the transform to the input table and writing out the
|
|
59
59
|
resulting table(s). Metadata produced by each table transformation is aggregated into
|
|
60
60
|
Transform Statistics (below).
|
|
61
|
-
* [Transform Statistics](../src/data_processing/ray/transform_statistics.py) is a general
|
|
61
|
+
* [Transform Statistics](../src/data_processing/runtime/ray/transform_statistics.py) is a general
|
|
62
62
|
purpose data collector actor aggregating the numeric metadata from different places of
|
|
63
63
|
the framework (especially metadata produced by the transform).
|
|
64
64
|
These statistics are reported as metadata (`metadata.json`) by the orchestrator upon completion.
|
|
@@ -92,10 +92,10 @@ For a more complete discussion, see the [tutorials](transform-tutorials.md).
|
|
|
92
92
|
of any transform implementation - `transform()` and `flush()` - and provides the bulk of any transform implementation
|
|
93
93
|
convert one Table to 0 or more new Tables. In general, this is not tied to the above Ray infrastructure
|
|
94
94
|
and so can usually be used independent of Ray.
|
|
95
|
-
* [TransformRuntime ](../src/data_processing/ray/transform_runtime.py) - this class only needs to be
|
|
95
|
+
* [TransformRuntime ](../src/data_processing/runtime/ray/transform_runtime.py) - this class only needs to be
|
|
96
96
|
extended/implemented when additional Ray components (actors, shared memory objects, etc.) are used
|
|
97
97
|
by the transform. The main method `get_transform_config()` is used to enable these extensions.
|
|
98
|
-
* [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - this is the bootstrap
|
|
98
|
+
* [TransformConfiguration](../src/data_processing/runtime/ray/transform_runtime.py) - this is the bootstrap
|
|
99
99
|
class provided to the Launcher that enables the instantiation of the Transform and the TransformRuntime within
|
|
100
100
|
the architecture. It is a CLIProvider, which allows it to define transform-specific CLI configuration
|
|
101
101
|
that is made available to the Transform's initializer.
|
|
@@ -12,16 +12,17 @@ developers of data transformation are:
|
|
|
12
12
|
|
|
13
13
|
* [Transformation](../src/data_processing/transform/table_transform.py) - a simple, easily-implemented interface defines
|
|
14
14
|
the specifics of a given data transformation.
|
|
15
|
-
* [Transform Configuration](../src/data_processing/ray/transform_runtime.py) - defines
|
|
16
|
-
the transform implementation and
|
|
17
|
-
|
|
18
|
-
* [Transformation Runtime](../src/data_processing/ray/transform_runtime.py) - allows for customization of the Ray environment for the transformer.
|
|
19
|
-
This might include provisioning of shared memory objects or creation of additional actors.
|
|
15
|
+
* [Transform Configuration](../src/data_processing/runtime/ray/transform_runtime.py) - defines
|
|
16
|
+
the transform short name, its implementation class, and command line configuration
|
|
17
|
+
parameters.
|
|
20
18
|
|
|
21
19
|
To learn more consider the following:
|
|
22
20
|
|
|
23
21
|
* [Transform Tutorials](transform-tutorials.md)
|
|
24
|
-
* [
|
|
22
|
+
* [Transform Runtimes](transform-runtimes.md)
|
|
23
|
+
* [Transform Examples](transform-tutorial-examples.md)
|
|
24
|
+
* [Testing Transforms](transform-testing.md)
|
|
25
|
+
* [Utilities](transformer-utilities.md)
|
|
25
26
|
* [Architecture Deep Dive](architecture.md)
|
|
26
27
|
* [Transform project root readme](../../transforms/README.md)
|
|
27
28
|
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Ray Launcher Command Line Options
|
|
2
|
+
A number of command line options are available when launching a transform.
|
|
3
|
+
|
|
4
|
+
The following is a current --help output (a work in progress) for
|
|
5
|
+
the `NOOPTransform` (note the --noop_sleep_sec option):
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
usage: noop_python_runtime.py [-h] [--noop_sleep_sec NOOP_SLEEP_SEC] [--noop_pwd NOOP_PWD] [--data_s3_cred DATA_S3_CRED] [--data_s3_config DATA_S3_CONFIG] [--data_local_config DATA_LOCAL_CONFIG] [--data_max_files DATA_MAX_FILES]
|
|
9
|
+
[--data_checkpointing DATA_CHECKPOINTING] [--data_data_sets DATA_DATA_SETS] [--data_files_to_use DATA_FILES_TO_USE] [--data_num_samples DATA_NUM_SAMPLES] [--runtime_pipeline_id RUNTIME_PIPELINE_ID]
|
|
10
|
+
[--runtime_job_id RUNTIME_JOB_ID] [--runtime_code_location RUNTIME_CODE_LOCATION]
|
|
11
|
+
|
|
12
|
+
Driver for noop processing
|
|
13
|
+
|
|
14
|
+
options:
|
|
15
|
+
-h, --help show this help message and exit
|
|
16
|
+
--noop_sleep_sec NOOP_SLEEP_SEC
|
|
17
|
+
Sleep actor for a number of seconds while processing the data frame, before writing the file to COS
|
|
18
|
+
--noop_pwd NOOP_PWD A dummy password which should be filtered out of the metadata
|
|
19
|
+
--data_s3_cred DATA_S3_CRED
|
|
20
|
+
AST string of options for s3 credentials. Only required for S3 data access.
|
|
21
|
+
access_key: access key help text
|
|
22
|
+
secret_key: secret key help text
|
|
23
|
+
url: optional s3 url
|
|
24
|
+
region: optional s3 region
|
|
25
|
+
Example: { 'access_key': 'access', 'secret_key': 'secret',
|
|
26
|
+
'url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud',
|
|
27
|
+
'region': 'us-east-1' }
|
|
28
|
+
--data_s3_config DATA_S3_CONFIG
|
|
29
|
+
AST string containing input/output paths.
|
|
30
|
+
input_folder: Path to input folder of files to be processed
|
|
31
|
+
output_folder: Path to output folder of processed files
|
|
32
|
+
Example: { 'input_folder': 's3-path/your-input-bucket',
|
|
33
|
+
'output_folder': 's3-path/your-output-bucket' }
|
|
34
|
+
--data_local_config DATA_LOCAL_CONFIG
|
|
35
|
+
ast string containing input/output folders using local fs.
|
|
36
|
+
input_folder: Path to input folder of files to be processed
|
|
37
|
+
output_folder: Path to output folder of processed files
|
|
38
|
+
Example: { 'input_folder': './input', 'output_folder': '/tmp/output' }
|
|
39
|
+
--data_max_files DATA_MAX_FILES
|
|
40
|
+
Max amount of files to process
|
|
41
|
+
--data_checkpointing DATA_CHECKPOINTING
|
|
42
|
+
checkpointing flag
|
|
43
|
+
--data_data_sets DATA_DATA_SETS
|
|
44
|
+
List of sub-directories of input directory to use for input. For example, ['dir1', 'dir2']
|
|
45
|
+
--data_files_to_use DATA_FILES_TO_USE
|
|
46
|
+
list of file extensions to choose for input.
|
|
47
|
+
--data_num_samples DATA_NUM_SAMPLES
|
|
48
|
+
number of random input files to process
|
|
49
|
+
--runtime_pipeline_id RUNTIME_PIPELINE_ID
|
|
50
|
+
pipeline id
|
|
51
|
+
--runtime_job_id RUNTIME_JOB_ID
|
|
52
|
+
job id
|
|
53
|
+
--runtime_code_location RUNTIME_CODE_LOCATION
|
|
54
|
+
AST string containing code location
|
|
55
|
+
github: Github repository URL.
|
|
56
|
+
commit_hash: github commit hash
|
|
57
|
+
path: Path within the repository
|
|
58
|
+
Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '1324',
|
|
59
|
+
'path': 'transforms/universal/code' }
|
|
60
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
## Python Runtime
|
|
2
|
+
The python runtime provides a simple mechanism to run a transform on a set of input data to produce
|
|
3
|
+
a set of output data, all within the python execution environment.
|
|
4
|
+
|
|
5
|
+
A `PythonTransformLauncher` class is provided that enables the running of the transform. For example,
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
launcher = PythonTransformLauncher(YourTransformConfiguration())
|
|
9
|
+
launcher.launch()
|
|
10
|
+
```
|
|
11
|
+
The `YourTransformConfiguration` class configures your transform.
|
|
12
|
+
More details can be found in the [transform tutorial](transform-tutorials.md).
|
|
@@ -1,29 +1,16 @@
|
|
|
1
|
-
# Launcher Command Line Options
|
|
1
|
+
# Ray Launcher Command Line Options
|
|
2
2
|
A number of command line options are available when launching a transform.
|
|
3
3
|
|
|
4
4
|
The following is a current --help output (a work in progress) for
|
|
5
|
-
the `NOOPTransform` (note the --noop_sleep_sec
|
|
5
|
+
the `NOOPTransform` (note the --noop_sleep_sec and --noop_pwd options):
|
|
6
6
|
|
|
7
7
|
```
|
|
8
|
-
usage: noop_transform.py [-h]
|
|
9
|
-
[--
|
|
10
|
-
[--
|
|
11
|
-
[--
|
|
12
|
-
[--data_s3_config DATA_S3_CONFIG]
|
|
13
|
-
[--data_local_config DATA_LOCAL_CONFIG]
|
|
14
|
-
[--data_max_files DATA_MAX_FILES]
|
|
15
|
-
[--data_checkpointing DATA_CHECKPOINTING]
|
|
16
|
-
[--data_data_sets DATA_DATA_SETS]
|
|
17
|
-
[--data_max_files MAX_FILES]
|
|
18
|
-
[--data_files_to_use DATA_FILES_TO_USE]
|
|
19
|
-
[--data_num_samples DATA_NUM_SAMPLES]
|
|
20
|
-
[--runtime_num_workers NUM_WORKERS]
|
|
21
|
-
[--runtime_worker_options WORKER_OPTIONS]
|
|
22
|
-
[--runtime_pipeline_id PIPELINE_ID] [--job_id JOB_ID]
|
|
23
|
-
[--runtime_creation_delay CREATION_DELAY]
|
|
24
|
-
[--runtime_code_location CODE_LOCATION]
|
|
8
|
+
usage: noop_transform.py [-h] [--run_locally RUN_LOCALLY] [--noop_sleep_sec NOOP_SLEEP_SEC] [--noop_pwd NOOP_PWD] [--data_s3_cred DATA_S3_CRED] [--data_s3_config DATA_S3_CONFIG] [--data_local_config DATA_LOCAL_CONFIG]
|
|
9
|
+
[--data_max_files DATA_MAX_FILES] [--data_checkpointing DATA_CHECKPOINTING] [--data_data_sets DATA_DATA_SETS] [--data_files_to_use DATA_FILES_TO_USE] [--data_num_samples DATA_NUM_SAMPLES]
|
|
10
|
+
[--runtime_num_workers RUNTIME_NUM_WORKERS] [--runtime_worker_options RUNTIME_WORKER_OPTIONS] [--runtime_creation_delay RUNTIME_CREATION_DELAY] [--runtime_pipeline_id RUNTIME_PIPELINE_ID]
|
|
11
|
+
[--runtime_job_id RUNTIME_JOB_ID] [--runtime_code_location RUNTIME_CODE_LOCATION]
|
|
25
12
|
|
|
26
|
-
Driver for
|
|
13
|
+
Driver for noop processing
|
|
27
14
|
|
|
28
15
|
options:
|
|
29
16
|
-h, --help show this help message and exit
|
|
@@ -31,35 +18,40 @@ options:
|
|
|
31
18
|
running ray local flag
|
|
32
19
|
--noop_sleep_sec NOOP_SLEEP_SEC
|
|
33
20
|
Sleep actor for a number of seconds while processing the data frame, before writing the file to COS
|
|
34
|
-
--
|
|
35
|
-
|
|
21
|
+
--noop_pwd NOOP_PWD A dummy password which should be filtered out of the metadata
|
|
22
|
+
--data_s3_cred DATA_S3_CRED
|
|
23
|
+
AST string of options for s3 credentials. Only required for S3 data access.
|
|
36
24
|
access_key: access key help text
|
|
37
25
|
secret_key: secret key help text
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
26
|
+
url: optional s3 url
|
|
27
|
+
region: optional s3 region
|
|
28
|
+
Example: { 'access_key': 'access', 'secret_key': 'secret',
|
|
29
|
+
'url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud',
|
|
30
|
+
'region': 'us-east-1' }
|
|
31
|
+
--data_s3_config DATA_S3_CONFIG
|
|
41
32
|
AST string containing input/output paths.
|
|
42
33
|
input_folder: Path to input folder of files to be processed
|
|
43
34
|
output_folder: Path to output folder of processed files
|
|
44
|
-
Example: { 'input_folder': 'your
|
|
45
|
-
|
|
35
|
+
Example: { 'input_folder': 's3-path/your-input-bucket',
|
|
36
|
+
'output_folder': 's3-path/your-output-bucket' }
|
|
37
|
+
--data_local_config DATA_LOCAL_CONFIG
|
|
46
38
|
ast string containing input/output folders using local fs.
|
|
47
39
|
input_folder: Path to input folder of files to be processed
|
|
48
40
|
output_folder: Path to output folder of processed files
|
|
49
41
|
Example: { 'input_folder': './input', 'output_folder': '/tmp/output' }
|
|
50
|
-
--data_max_files
|
|
42
|
+
--data_max_files DATA_MAX_FILES
|
|
51
43
|
Max amount of files to process
|
|
52
|
-
--data_checkpointing
|
|
44
|
+
--data_checkpointing DATA_CHECKPOINTING
|
|
53
45
|
checkpointing flag
|
|
54
|
-
--data_data_sets
|
|
55
|
-
List of
|
|
46
|
+
--data_data_sets DATA_DATA_SETS
|
|
47
|
+
List of sub-directories of input directory to use for input. For example, ['dir1', 'dir2']
|
|
56
48
|
--data_files_to_use DATA_FILES_TO_USE
|
|
57
|
-
|
|
49
|
+
list of file extensions to choose for input.
|
|
58
50
|
--data_num_samples DATA_NUM_SAMPLES
|
|
59
|
-
number of
|
|
60
|
-
--runtime_num_workers
|
|
51
|
+
number of random input files to process
|
|
52
|
+
--runtime_num_workers RUNTIME_NUM_WORKERS
|
|
61
53
|
number of workers
|
|
62
|
-
--runtime_worker_options
|
|
54
|
+
--runtime_worker_options RUNTIME_WORKER_OPTIONS
|
|
63
55
|
AST string defining worker resource requirements.
|
|
64
56
|
num_cpus: Required number of CPUs.
|
|
65
57
|
num_gpus: Required number of GPUs
|
|
@@ -69,16 +61,19 @@ options:
|
|
|
69
61
|
placement_group_bundle_index, placement_group_capture_child_tasks, resources, runtime_env,
|
|
70
62
|
scheduling_strategy, _metadata, concurrency_groups, lifetime, max_concurrency, max_restarts,
|
|
71
63
|
max_task_retries, max_pending_calls, namespace, get_if_exists
|
|
72
|
-
Example: { 'num_cpus': '8', 'num_gpus': '1',
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
--runtime_job_id JOB_ID job id
|
|
76
|
-
--runtime_creation_delay CREATION_DELAY
|
|
64
|
+
Example: { 'num_cpus': '8', 'num_gpus': '1',
|
|
65
|
+
'resources': '{"special_hardware": 1, "custom_label": 1}' }
|
|
66
|
+
--runtime_creation_delay RUNTIME_CREATION_DELAY
|
|
77
67
|
delay between actor' creation
|
|
78
|
-
--
|
|
68
|
+
--runtime_pipeline_id RUNTIME_PIPELINE_ID
|
|
69
|
+
pipeline id
|
|
70
|
+
--runtime_job_id RUNTIME_JOB_ID
|
|
71
|
+
job id
|
|
72
|
+
--runtime_code_location RUNTIME_CODE_LOCATION
|
|
79
73
|
AST string containing code location
|
|
80
74
|
github: Github repository URL.
|
|
81
75
|
commit_hash: github commit hash
|
|
82
76
|
path: Path within the repository
|
|
83
|
-
Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '
|
|
77
|
+
Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '1324',
|
|
78
|
+
'path': 'transforms/universal/code' }
|
|
84
79
|
```
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Ray Runtime
|
|
2
|
+
The Ray runtime includes the following set of components:
|
|
3
|
+
|
|
4
|
+
* [RayTransformLauncher](../src/data_processing/runtime/ray/transform_launcher.py) - this is a
|
|
5
|
+
class generally used to implement `main()` that makes use of a `TransformConfiguration` to
|
|
6
|
+
start the Ray runtime and execute the transform over the specified set of input files.
|
|
7
|
+
The RayTransformLauncher is created using a `RayTransformConfiguration` instance.
|
|
8
|
+
* [RayTransformConfiguration](../src/data_processing/runtime/ray/transform_configuration.py) - this
|
|
9
|
+
class extends transform's base TransformConfiguration implementation to add an optional
|
|
10
|
+
`TransformRuntime` (see next) class to be used by the transform implementation.
|
|
11
|
+
* [TransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py) -
|
|
12
|
+
this provides the ability for the transform implementor to create additional Ray resources
|
|
13
|
+
and include them in the configuration used to create a transform
|
|
14
|
+
(see, for example, [exact dedup](../../transforms/universal/ededup/src/ededup_transform.py)).
|
|
15
|
+
This also provides the ability to supplement the statistics collected by
|
|
16
|
+
[Statistics](../src/data_processing/runtime/ray/transform_statistics.py) (see below).
|
|
17
|
+
|
|
18
|
+
Roughly speaking the following steps are completed to establish transforms in the RayWorkers
|
|
19
|
+
|
|
20
|
+
1. Launcher parses the CLI parameters using an ArgumentParser configured with its own CLI parameters
|
|
21
|
+
along with those of the Transform Configuration,
|
|
22
|
+
2. Launcher passes the Transform Configuration and CLI parameters to the [RayOrchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py)
|
|
23
|
+
3. RayOrchestrator creates the Transform Runtime using the Transform Configuration and its CLI parameter values
|
|
24
|
+
4. Transform Runtime creates transform initialization/configuration including the CLI parameters,
|
|
25
|
+
and any Ray components needed by the transform.
|
|
26
|
+
5. [RayWorker](../src/data_processing/runtime/ray/transform_table_processor.py) is started with configuration from the Transform Runtime.
|
|
27
|
+
6. RayWorker creates the Transform using the configuration provided by the Transform Runtime.
|
|
28
|
+
7. Statistics is used to collect the statistics submitted by the individual transform, that
|
|
29
|
+
is used for building execution metadata.
|
|
30
|
+
|
|
31
|
+

|
|
32
|
+
|
|
33
|
+
## Ray Transform Launcher
|
|
34
|
+
The [RayTransformLauncher](../src/data_processing/runtime/ray/transform_launcher.py) uses the Transform Configuration
|
|
35
|
+
and provides a single method, `launch()`, that kicks off the Ray environment and transform execution coordinated
|
|
36
|
+
by [orchestrator](../src/data_processing/runtime/ray/transform_orchestrator.py).
|
|
37
|
+
For example,
|
|
38
|
+
```python
|
|
39
|
+
launcher = RayTransformLauncher(YourTransformConfiguration())
|
|
40
|
+
launcher.launch()
|
|
41
|
+
```
|
|
42
|
+
Note that the launcher defines some additional CLI parameters that are used to control the operation of the
|
|
43
|
+
[orchestrator and workers](../src/data_processing/runtime/ray/transform_orchestrator_configuration.py) and
|
|
44
|
+
[data access](../src/data_processing/data_access/data_access_factory.py). Things such as data access configuration,
|
|
45
|
+
number of workers, worker resources, etc.
|
|
46
|
+
Discussion of these options is beyond the scope of this document
|
|
47
|
+
(see [Launcher Options](ray-launcher-options.md) for a list of available options.)
|
|
48
|
+
|
|
49
|
+
## Ray Transform Configuration
|
|
50
|
+
In general, a transform should be able to run in both the python and Ray runtimes.
|
|
51
|
+
As such we first define the python-only transform configuration, which will then
|
|
52
|
+
be used by the Ray-runtime-specific transform configuration.
|
|
53
|
+
The python transform configuration implements
|
|
54
|
+
[TransformConfiguration](../src/data_processing/transform/transform_configuration.py)
|
|
55
|
+
and defines the transform-specific name and implementation
|
|
56
|
+
and class. In addition, it is responsible for providing transform-specific
|
|
57
|
+
methods to define and capture optional command line arguments.
|
|
58
|
+
```python
|
|
59
|
+
|
|
60
|
+
class YourTransformConfiguration(TransformConfiguration):
|
|
61
|
+
|
|
62
|
+
def __init__(self):
|
|
63
|
+
super().__init__(name="YourTransform", transform_class=YourTransform)
|
|
64
|
+
self.params = {}
|
|
65
|
+
|
|
66
|
+
def add_input_params(self, parser: ArgumentParser) -> None:
|
|
67
|
+
...
|
|
68
|
+
def apply_input_params(self, args: Namespace) -> bool:
|
|
69
|
+
...
|
|
70
|
+
```
|
|
71
|
+
Next we define the Ray-runtime specific transform configuration as an extension of
|
|
72
|
+
the RayTransformConfiguration and uses the `YourTransformConfiguration` above.
|
|
73
|
+
```python
|
|
74
|
+
|
|
75
|
+
class YourTransformConfiguration(RayTransformConfiguration):
|
|
76
|
+
def __init__(self):
|
|
77
|
+
super().__init__(YourTransformConfiguration(),
|
|
78
|
+
runtime_class=YourTransformRuntime)
|
|
79
|
+
```
|
|
80
|
+
This class provides the ability to create the instance of `YourTransformRuntime` class (see below)
|
|
81
|
+
as needed by the Ray runtime. Note, that not all transforms will require a `runtime_class`
|
|
82
|
+
and can omit this parameter to default to an acceptable runtime class.
|
|
83
|
+
Details are covered in the [advanced transform tutorial](advanced-transform-tutorial.md).
|
|
84
|
+
|
|
85
|
+
## Transform Runtime
|
|
86
|
+
The
|
|
87
|
+
[DefaultTableTransformRuntime](../src/data_processing/runtime/ray/transform_runtime.py)
|
|
88
|
+
class is provided and will be
|
|
89
|
+
sufficient for many use cases, especially 1:1 table transformation.
|
|
90
|
+
However, some transforms will require use of the Ray environment, for example,
|
|
91
|
+
to create additional workers, establish a shared memory object, etc.
|
|
92
|
+
Of course, these transforms will generally not run outside of Ray environment.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
class DefaultTableTransformRuntime:
|
|
96
|
+
|
|
97
|
+
def __init__(self, params: dict[str, Any]):
|
|
98
|
+
...
|
|
99
|
+
|
|
100
|
+
def get_transform_config(
|
|
101
|
+
self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
|
|
102
|
+
) -> dict[str, Any]:
|
|
103
|
+
...
|
|
104
|
+
|
|
105
|
+
def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
|
|
106
|
+
...
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The RayOrchestrator initializes the instance with the CLI parameters provided by the Transform Configurations
|
|
110
|
+
`get_input_params()` method.
|
|
111
|
+
|
|
112
|
+
The `get_transform_config()` method is used by the RayOrchestrator to create the parameters
|
|
113
|
+
used to initialize the Transform in the RayWorker.
|
|
114
|
+
This is where additional Ray components would be added to the environment
|
|
115
|
+
and references added to them, as needed, in the returned dictionary of configuration data
|
|
116
|
+
that will initialize the transform.
|
|
117
|
+
For those transforms that don't need this support, the default implementation
|
|
118
|
+
simply returns the CLI parameters used to initialize the runtime instance.
|
|
119
|
+
|
|
120
|
+
The `compute_execution_stats()` method provides an opportunity to augment the statistics
|
|
121
|
+
collected and aggregated by the TransformStatistics actor. It is called by the RayOrchestrator
|
|
122
|
+
after all files have been processed.
|
|
123
|
+
|
|
124
|
+
## Exceptions
|
|
125
|
+
A transform may find that it needs to signal error conditions.
|
|
126
|
+
For example, if a referenced model could not be loaded or
|
|
127
|
+
a given table does not have the expected column.
|
|
128
|
+
In general, it should identify such conditions by raising an exception.
|
|
129
|
+
With this in mind, there are two types of exceptions:
|
|
130
|
+
|
|
131
|
+
1. Those that would not allow any tables to be processed (e.g. model loading problem).
|
|
132
|
+
2. Those that would not allow a specific table to be processed (e.g. missing column).
|
|
133
|
+
|
|
134
|
+
In the first situation the transform should throw an exception from the initializer, which
|
|
135
|
+
will cause the Ray framework to terminate processing of all tables.
|
|
136
|
+
In the second situation (identified in the `transform()` or `flush()` methods), the transform
|
|
137
|
+
should throw an exception from the associated method. This will cause only the
|
|
138
|
+
error-causing
|
|
139
|
+
table to be ignored and not written out, but allow continued processing of tables by
|
|
140
|
+
the transform.
|
|
141
|
+
In both cases, the framework will log the exception as an error.
|
|
142
|
+
|
|
143
|
+
|