data-prep-toolkit 0.0.1.dev12__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/Makefile +10 -8
- {data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.1.1}/PKG-INFO +3 -3
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/README.md +2 -2
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/pyproject.toml +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info}/PKG-INFO +3 -3
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/SOURCES.txt +10 -24
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_factory.py +6 -6
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_local.py +16 -16
- data_prep_toolkit-0.1.1/src/data_processing/runtime/__init__.py +4 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/execution_configuration.py +8 -7
- data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/__init__.py +4 -0
- data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/runtime_configuration.py +24 -0
- data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/transform_file_processor.py +53 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/pure_python/transform_launcher.py +11 -11
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/pure_python/transform_orchestrator.py +13 -12
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/__init__.py +4 -4
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_orchestrator_configuration.py → data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/execution_configuration.py +3 -5
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/ray_utils.py +2 -2
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/runtime_configuration.py +38 -0
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_file_processor.py +46 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_launcher.py +13 -21
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_orchestrator.py +10 -10
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_runtime.py +1 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_statistics.py +10 -4
- data_prep_toolkit-0.1.1/src/data_processing/runtime/runtime_configuration.py +64 -0
- data_prep_toolkit-0.1.1/src/data_processing/runtime/transform_file_processor.py +173 -0
- data_prep_toolkit-0.1.1/src/data_processing/runtime/transform_launcher.py +76 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/__init__.py +2 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/noop_transform.py +34 -29
- data_prep_toolkit-0.1.1/src/data_processing/transform/__init__.py +4 -0
- data_prep_toolkit-0.1.1/src/data_processing/transform/binary_transform.py +53 -0
- data_prep_toolkit-0.1.1/src/data_processing/transform/table_transform.py +116 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/transform/transform_configuration.py +35 -20
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/params_utils.py +16 -1
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/transform_utils.py +4 -9
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/pure_python/launcher_test.py +3 -12
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +77 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +7 -9
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/launcher_test.py +16 -23
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/multi_launcher_test.py +80 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -1
- data_prep_toolkit-0.0.1.dev12/doc/advanced-transform-tutorial.md +0 -284
- data_prep_toolkit-0.0.1.dev12/doc/architecture.md +0 -104
- data_prep_toolkit-0.0.1.dev12/doc/overview.md +0 -28
- data_prep_toolkit-0.0.1.dev12/doc/processing-architecture.jpg +0 -0
- data_prep_toolkit-0.0.1.dev12/doc/python-launcher-options.md +0 -60
- data_prep_toolkit-0.0.1.dev12/doc/python-runtime.md +0 -12
- data_prep_toolkit-0.0.1.dev12/doc/ray-launcher-options.md +0 -79
- data_prep_toolkit-0.0.1.dev12/doc/ray-runtime.md +0 -143
- data_prep_toolkit-0.0.1.dev12/doc/simplest-transform-tutorial.md +0 -211
- data_prep_toolkit-0.0.1.dev12/doc/testing-e2e-transform.md +0 -4
- data_prep_toolkit-0.0.1.dev12/doc/transform-external-resources.md +0 -224
- data_prep_toolkit-0.0.1.dev12/doc/transform-runtimes.md +0 -9
- data_prep_toolkit-0.0.1.dev12/doc/transform-s3-testing.md +0 -91
- data_prep_toolkit-0.0.1.dev12/doc/transform-standalone-testing.md +0 -99
- data_prep_toolkit-0.0.1.dev12/doc/transform-testing.md +0 -6
- data_prep_toolkit-0.0.1.dev12/doc/transform-tutorial-examples.md +0 -15
- data_prep_toolkit-0.0.1.dev12/doc/transform-tutorials.md +0 -67
- data_prep_toolkit-0.0.1.dev12/doc/transformer-utilities.md +0 -24
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/__init__.py +0 -2
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/__init__.py +0 -4
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/python_launcher_configuration.py +0 -97
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/transform_table_processor.py +0 -191
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_configuration.py +0 -33
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_launch_configuration.py +0 -44
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_table_processor.py +0 -191
- data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/transform_launcher.py +0 -25
- data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/__init__.py +0 -3
- data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/table_transform.py +0 -50
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/.gitignore +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/setup.cfg +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/arrow_s3.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_s3.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/abstract_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/launch/transform_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/transform_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/transform/transform_statistics.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/__init__.py +2 -2
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/log.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/ray_util_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
# Use make help, to see the available rules
|
|
2
|
-
REPOROOT
|
|
3
|
-
include
|
|
4
|
-
include ../.make.versions
|
|
2
|
+
REPOROOT=../..
|
|
3
|
+
include $(REPOROOT)/.make.defaults
|
|
5
4
|
|
|
6
5
|
TAG := "v${DPK_LIB_VERSION}"
|
|
7
6
|
|
|
@@ -53,10 +52,13 @@ venv:: pyproject.toml
|
|
|
53
52
|
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
|
|
54
53
|
test::
|
|
55
54
|
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
56
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
57
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
|
58
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python;
|
|
55
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
56
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
|
57
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
|
|
58
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/multi_launcher_test.py;
|
|
59
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
|
|
59
60
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
|
|
60
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/
|
|
61
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/
|
|
61
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/multi_launcher_test.py;
|
|
62
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
|
|
63
|
+
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
|
|
62
64
|
|
{data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.1.1}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.0.1.dev12
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -27,9 +27,9 @@ Requires-Dist: markupsafe==2.0.1; extra == "dev"
|
|
|
27
27
|
# Data Processing Library
|
|
28
28
|
This provides a python framework for developing _transforms_
|
|
29
29
|
on data stored in files - currently parquet files are supported -
|
|
30
|
-
and running them in a [ray](https://ray.
|
|
30
|
+
and running them in a [ray](https://www.ray.io/) cluster.
|
|
31
31
|
Data files may be stored in the local file system or COS/S3.
|
|
32
|
-
For more details see the [documentation](doc/overview.md).
|
|
32
|
+
For more details see the [documentation](../doc/overview.md).
|
|
33
33
|
|
|
34
34
|
### Virtual Environment
|
|
35
35
|
The project uses `pyproject.toml` and a Makefile for operations.
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
# Data Processing Library
|
|
2
2
|
This provides a python framework for developing _transforms_
|
|
3
3
|
on data stored in files - currently parquet files are supported -
|
|
4
|
-
and running them in a [ray](https://ray.
|
|
4
|
+
and running them in a [ray](https://www.ray.io/) cluster.
|
|
5
5
|
Data files may be stored in the local file system or COS/S3.
|
|
6
|
-
For more details see the [documentation](doc/overview.md).
|
|
6
|
+
For more details see the [documentation](../doc/overview.md).
|
|
7
7
|
|
|
8
8
|
### Virtual Environment
|
|
9
9
|
The project uses `pyproject.toml` and a Makefile for operations.
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.0.1.dev12
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -27,9 +27,9 @@ Requires-Dist: markupsafe==2.0.1; extra == "dev"
|
|
|
27
27
|
# Data Processing Library
|
|
28
28
|
This provides a python framework for developing _transforms_
|
|
29
29
|
on data stored in files - currently parquet files are supported -
|
|
30
|
-
and running them in a [ray](https://ray.
|
|
30
|
+
and running them in a [ray](https://www.ray.io/) cluster.
|
|
31
31
|
Data files may be stored in the local file system or COS/S3.
|
|
32
|
-
For more details see the [documentation](doc/overview.md).
|
|
32
|
+
For more details see the [documentation](../doc/overview.md).
|
|
33
33
|
|
|
34
34
|
### Virtual Environment
|
|
35
35
|
The project uses `pyproject.toml` and a Makefile for operations.
|
{data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/SOURCES.txt
RENAMED
|
@@ -2,24 +2,6 @@
|
|
|
2
2
|
Makefile
|
|
3
3
|
README.md
|
|
4
4
|
pyproject.toml
|
|
5
|
-
doc/advanced-transform-tutorial.md
|
|
6
|
-
doc/architecture.md
|
|
7
|
-
doc/overview.md
|
|
8
|
-
doc/processing-architecture.jpg
|
|
9
|
-
doc/python-launcher-options.md
|
|
10
|
-
doc/python-runtime.md
|
|
11
|
-
doc/ray-launcher-options.md
|
|
12
|
-
doc/ray-runtime.md
|
|
13
|
-
doc/simplest-transform-tutorial.md
|
|
14
|
-
doc/testing-e2e-transform.md
|
|
15
|
-
doc/transform-external-resources.md
|
|
16
|
-
doc/transform-runtimes.md
|
|
17
|
-
doc/transform-s3-testing.md
|
|
18
|
-
doc/transform-standalone-testing.md
|
|
19
|
-
doc/transform-testing.md
|
|
20
|
-
doc/transform-tutorial-examples.md
|
|
21
|
-
doc/transform-tutorials.md
|
|
22
|
-
doc/transformer-utilities.md
|
|
23
5
|
src/data_prep_toolkit.egg-info/PKG-INFO
|
|
24
6
|
src/data_prep_toolkit.egg-info/SOURCES.txt
|
|
25
7
|
src/data_prep_toolkit.egg-info/dependency_links.txt
|
|
@@ -35,22 +17,23 @@ src/data_processing/data_access/data_access_local.py
|
|
|
35
17
|
src/data_processing/data_access/data_access_s3.py
|
|
36
18
|
src/data_processing/runtime/__init__.py
|
|
37
19
|
src/data_processing/runtime/execution_configuration.py
|
|
20
|
+
src/data_processing/runtime/runtime_configuration.py
|
|
21
|
+
src/data_processing/runtime/transform_file_processor.py
|
|
38
22
|
src/data_processing/runtime/transform_launcher.py
|
|
39
23
|
src/data_processing/runtime/pure_python/__init__.py
|
|
40
|
-
src/data_processing/runtime/pure_python/
|
|
24
|
+
src/data_processing/runtime/pure_python/runtime_configuration.py
|
|
25
|
+
src/data_processing/runtime/pure_python/transform_file_processor.py
|
|
41
26
|
src/data_processing/runtime/pure_python/transform_launcher.py
|
|
42
27
|
src/data_processing/runtime/pure_python/transform_orchestrator.py
|
|
43
|
-
src/data_processing/runtime/pure_python/transform_table_processor.py
|
|
44
28
|
src/data_processing/runtime/ray/__init__.py
|
|
29
|
+
src/data_processing/runtime/ray/execution_configuration.py
|
|
45
30
|
src/data_processing/runtime/ray/ray_utils.py
|
|
46
|
-
src/data_processing/runtime/ray/
|
|
47
|
-
src/data_processing/runtime/ray/
|
|
31
|
+
src/data_processing/runtime/ray/runtime_configuration.py
|
|
32
|
+
src/data_processing/runtime/ray/transform_file_processor.py
|
|
48
33
|
src/data_processing/runtime/ray/transform_launcher.py
|
|
49
34
|
src/data_processing/runtime/ray/transform_orchestrator.py
|
|
50
|
-
src/data_processing/runtime/ray/transform_orchestrator_configuration.py
|
|
51
35
|
src/data_processing/runtime/ray/transform_runtime.py
|
|
52
36
|
src/data_processing/runtime/ray/transform_statistics.py
|
|
53
|
-
src/data_processing/runtime/ray/transform_table_processor.py
|
|
54
37
|
src/data_processing/test_support/__init__.py
|
|
55
38
|
src/data_processing/test_support/abstract_test.py
|
|
56
39
|
src/data_processing/test_support/data_access/__init__.py
|
|
@@ -61,6 +44,7 @@ src/data_processing/test_support/transform/__init__.py
|
|
|
61
44
|
src/data_processing/test_support/transform/noop_transform.py
|
|
62
45
|
src/data_processing/test_support/transform/transform_test.py
|
|
63
46
|
src/data_processing/transform/__init__.py
|
|
47
|
+
src/data_processing/transform/binary_transform.py
|
|
64
48
|
src/data_processing/transform/table_transform.py
|
|
65
49
|
src/data_processing/transform/transform_configuration.py
|
|
66
50
|
src/data_processing/transform/transform_statistics.py
|
|
@@ -88,8 +72,10 @@ test/data_processing_tests/data_access/data_access_local_test.py
|
|
|
88
72
|
test/data_processing_tests/data_access/data_access_s3_test.py
|
|
89
73
|
test/data_processing_tests/data_access/sample_input_data_test.py
|
|
90
74
|
test/data_processing_tests/launch/pure_python/launcher_test.py
|
|
75
|
+
test/data_processing_tests/launch/pure_python/multi_launcher_test.py
|
|
91
76
|
test/data_processing_tests/launch/pure_python/test_noop_launch.py
|
|
92
77
|
test/data_processing_tests/launch/ray/launcher_test.py
|
|
78
|
+
test/data_processing_tests/launch/ray/multi_launcher_test.py
|
|
93
79
|
test/data_processing_tests/launch/ray/ray_util_test.py
|
|
94
80
|
test/data_processing_tests/launch/ray/test_noop_launch.py
|
|
95
81
|
test/data_processing_tests/transform/test_noop.py
|
|
@@ -142,14 +142,14 @@ class DataAccessFactory(DataAccessFactoryBase):
|
|
|
142
142
|
arg_dict = args
|
|
143
143
|
else:
|
|
144
144
|
raise ValueError("args must be Namespace or dictionary")
|
|
145
|
-
s3_cred = arg_dict.get(f"{self.cli_arg_prefix}s3_cred")
|
|
146
|
-
s3_config = arg_dict.get(f"{self.cli_arg_prefix}s3_config")
|
|
147
|
-
local_config = arg_dict.get(f"{self.cli_arg_prefix}local_config")
|
|
148
|
-
checkpointing = arg_dict.get(f"{self.cli_arg_prefix}checkpointing")
|
|
145
|
+
s3_cred = arg_dict.get(f"{self.cli_arg_prefix}s3_cred", None)
|
|
146
|
+
s3_config = arg_dict.get(f"{self.cli_arg_prefix}s3_config", None)
|
|
147
|
+
local_config = arg_dict.get(f"{self.cli_arg_prefix}local_config", None)
|
|
148
|
+
checkpointing = arg_dict.get(f"{self.cli_arg_prefix}checkpointing", False)
|
|
149
149
|
max_files = arg_dict.get(f"{self.cli_arg_prefix}max_files", -1)
|
|
150
|
-
data_sets = arg_dict.get(f"{self.cli_arg_prefix}data_sets")
|
|
150
|
+
data_sets = arg_dict.get(f"{self.cli_arg_prefix}data_sets", None)
|
|
151
151
|
n_samples = arg_dict.get(f"{self.cli_arg_prefix}num_samples", -1)
|
|
152
|
-
files_to_use = arg_dict.get(f"{self.cli_arg_prefix}files_to_use")
|
|
152
|
+
files_to_use = arg_dict.get(f"{self.cli_arg_prefix}files_to_use", [".parquet"])
|
|
153
153
|
# check which configuration (S3, LakeHouse, or Local) is specified
|
|
154
154
|
s3_config_specified = 1 if s3_config is not None else 0
|
|
155
155
|
local_config_specified = 1 if local_config is not None else 0
|
|
@@ -318,32 +318,32 @@ class DataAccessLocal(DataAccess):
|
|
|
318
318
|
metadata["source"] = {"name": self.input_folder, "type": "path"}
|
|
319
319
|
metadata["target"] = {"name": self.output_folder, "type": "path"}
|
|
320
320
|
return self.save_file(
|
|
321
|
-
|
|
322
|
-
|
|
321
|
+
path=os.path.join(self.output_folder, "metadata.json"),
|
|
322
|
+
data=json.dumps(metadata, indent=2).encode(),
|
|
323
323
|
)
|
|
324
324
|
|
|
325
|
-
def get_file(self,
|
|
325
|
+
def get_file(self, path: str) -> bytes:
|
|
326
326
|
"""
|
|
327
327
|
Gets the contents of a file as a byte array, decompressing gz files if needed.
|
|
328
328
|
|
|
329
329
|
Args:
|
|
330
|
-
|
|
330
|
+
path (str): The path to the file.
|
|
331
331
|
|
|
332
332
|
Returns:
|
|
333
333
|
bytes: The contents of the file as a byte array, or None if an error occurs.
|
|
334
334
|
"""
|
|
335
335
|
|
|
336
336
|
try:
|
|
337
|
-
if
|
|
338
|
-
with gzip.open(
|
|
337
|
+
if path.endswith(".gz"):
|
|
338
|
+
with gzip.open(path, "rb") as f:
|
|
339
339
|
data = f.read()
|
|
340
340
|
else:
|
|
341
|
-
with open(
|
|
341
|
+
with open(path, "rb") as f:
|
|
342
342
|
data = f.read()
|
|
343
343
|
return data
|
|
344
344
|
|
|
345
345
|
except (FileNotFoundError, gzip.BadGzipFile) as e:
|
|
346
|
-
logger.error(f"Error reading file {
|
|
346
|
+
logger.error(f"Error reading file {path}: {e}")
|
|
347
347
|
raise e
|
|
348
348
|
|
|
349
349
|
def get_folder_files(self, path: str, extensions: list[str] = None, return_data: bool = True) -> dict[str, bytes]:
|
|
@@ -374,13 +374,13 @@ class DataAccessLocal(DataAccess):
|
|
|
374
374
|
matching_files[filename] = _get_file_content(filename, return_data)
|
|
375
375
|
return matching_files
|
|
376
376
|
|
|
377
|
-
def save_file(self,
|
|
377
|
+
def save_file(self, path: str, data: bytes) -> dict[str, Any]:
|
|
378
378
|
"""
|
|
379
379
|
Saves bytes to a file and returns a dictionary with file information.
|
|
380
380
|
|
|
381
381
|
Args:
|
|
382
|
-
|
|
383
|
-
|
|
382
|
+
data (bytes): The bytes data to save.
|
|
383
|
+
path (str): The full name of the file to save.
|
|
384
384
|
|
|
385
385
|
Returns:
|
|
386
386
|
dict or None: A dictionary with "name" and "size" keys if successful,
|
|
@@ -388,12 +388,12 @@ class DataAccessLocal(DataAccess):
|
|
|
388
388
|
"""
|
|
389
389
|
|
|
390
390
|
try:
|
|
391
|
-
os.makedirs(os.path.dirname(
|
|
392
|
-
with open(
|
|
393
|
-
f.write(
|
|
394
|
-
file_info = {"name":
|
|
391
|
+
os.makedirs(os.path.dirname(path), exist_ok=True)
|
|
392
|
+
with open(path, "wb") as f:
|
|
393
|
+
f.write(data)
|
|
394
|
+
file_info = {"name": path, "size": os.path.getsize(path)}
|
|
395
395
|
return file_info
|
|
396
396
|
|
|
397
397
|
except Exception as e:
|
|
398
|
-
logger.error(f"Error saving bytes to file {
|
|
398
|
+
logger.error(f"Error saving bytes to file {path}: {e}")
|
|
399
399
|
return None
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
from data_processing.runtime.execution_configuration import TransformExecutionConfiguration
|
|
2
|
+
from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
|
|
3
|
+
from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
|
|
4
|
+
from data_processing.runtime.transform_file_processor import AbstractTransformFileProcessor
|
|
@@ -27,15 +27,17 @@ class TransformExecutionConfiguration(CLIArgumentProvider):
|
|
|
27
27
|
A class specifying and validating transform execution configuration
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
-
def __init__(self, name: str,
|
|
30
|
+
def __init__(self, name: str, print_params: bool = True):
|
|
31
31
|
"""
|
|
32
32
|
Initialization
|
|
33
|
+
:param name: job name
|
|
34
|
+
:param print_params: flag to print parameters
|
|
33
35
|
"""
|
|
34
36
|
self.pipeline_id = ""
|
|
35
37
|
self.job_details = {}
|
|
36
38
|
self.code_location = {}
|
|
37
39
|
self.name = name
|
|
38
|
-
self.
|
|
40
|
+
self.print_params = print_params
|
|
39
41
|
|
|
40
42
|
def add_input_params(self, parser: argparse.ArgumentParser) -> None:
|
|
41
43
|
"""
|
|
@@ -74,10 +76,9 @@ class TransformExecutionConfiguration(CLIArgumentProvider):
|
|
|
74
76
|
"job id": captured["job_id"],
|
|
75
77
|
}
|
|
76
78
|
self.code_location = captured["code_location"]
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
logger.info(f"pipeline id {self.pipeline_id}")
|
|
79
|
+
# print parameters
|
|
80
|
+
logger.info(f"pipeline id {self.pipeline_id}")
|
|
81
|
+
if self.print_params:
|
|
81
82
|
logger.info(f"job details {self.job_details}")
|
|
82
|
-
|
|
83
|
+
logger.info(f"code location {self.code_location}")
|
|
83
84
|
return True
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
|
|
2
|
+
from data_processing.runtime.pure_python.transform_file_processor import PythonTransformFileProcessor
|
|
3
|
+
from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
|
|
4
|
+
from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
|
+
from data_processing.transform import TransformConfiguration
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
|
|
18
|
+
def __init__(self, transform_config: TransformConfiguration):
|
|
19
|
+
"""
|
|
20
|
+
Initialization
|
|
21
|
+
:param transform_config - base configuration class
|
|
22
|
+
"""
|
|
23
|
+
self.transform_config = transform_config
|
|
24
|
+
super().__init__(transform_config=transform_config)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
+
from data_processing.runtime import (
|
|
17
|
+
AbstractTransformFileProcessor,
|
|
18
|
+
)
|
|
19
|
+
from data_processing.transform import TransformStatistics
|
|
20
|
+
from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class PythonTransformFileProcessor(AbstractTransformFileProcessor):
|
|
24
|
+
"""
|
|
25
|
+
This is the class implementing the worker class processing of a single file
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
|
|
29
|
+
self,
|
|
30
|
+
data_access_factory: DataAccessFactoryBase,
|
|
31
|
+
statistics: TransformStatistics,
|
|
32
|
+
runtime_configuration: PythonTransformRuntimeConfiguration,
|
|
33
|
+
):
|
|
34
|
+
"""
|
|
35
|
+
Init method
|
|
36
|
+
:param data_access_factory - data access factory
|
|
37
|
+
:param statistics - reference to statistics class
|
|
38
|
+
:param runtime_configuration: transform configuration class
|
|
39
|
+
"""
|
|
40
|
+
# Create data access
|
|
41
|
+
super().__init__()
|
|
42
|
+
self.data_access = data_access_factory.create_data_access()
|
|
43
|
+
# Add data access and statistics to the processor parameters
|
|
44
|
+
transform_params = dict(runtime_configuration.get_transform_params())
|
|
45
|
+
transform_params["data_access"] = self.data_access
|
|
46
|
+
transform_params["statistics"] = statistics
|
|
47
|
+
# Create local processor
|
|
48
|
+
self.transform = runtime_configuration.get_transform_class()(transform_params)
|
|
49
|
+
# Create statistics
|
|
50
|
+
self.stats = statistics
|
|
51
|
+
|
|
52
|
+
def _publish_stats(self, stats: dict[str, Any]) -> None:
|
|
53
|
+
self.stats.add_stats(stats)
|
|
@@ -15,9 +15,11 @@ import time
|
|
|
15
15
|
|
|
16
16
|
from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
|
|
17
17
|
from data_processing.runtime import TransformExecutionConfiguration
|
|
18
|
-
from data_processing.runtime.pure_python import
|
|
18
|
+
from data_processing.runtime.pure_python import (
|
|
19
|
+
PythonTransformRuntimeConfiguration,
|
|
20
|
+
orchestrate,
|
|
21
|
+
)
|
|
19
22
|
from data_processing.runtime.transform_launcher import AbstractTransformLauncher
|
|
20
|
-
from data_processing.transform import TransformConfiguration
|
|
21
23
|
from data_processing.utils import get_logger
|
|
22
24
|
|
|
23
25
|
|
|
@@ -31,18 +33,16 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
31
33
|
|
|
32
34
|
def __init__(
|
|
33
35
|
self,
|
|
34
|
-
|
|
35
|
-
transform_config: TransformConfiguration,
|
|
36
|
+
runtime_config: PythonTransformRuntimeConfiguration,
|
|
36
37
|
data_access_factory: DataAccessFactoryBase = DataAccessFactory(),
|
|
37
38
|
):
|
|
38
39
|
"""
|
|
39
40
|
Creates driver
|
|
40
|
-
:param
|
|
41
|
+
:param runtime_config: transform runtime factory
|
|
41
42
|
:param data_access_factory: the factory to create DataAccess instances.
|
|
42
43
|
"""
|
|
43
|
-
super().__init__(
|
|
44
|
-
self.
|
|
45
|
-
self.execution_config = TransformExecutionConfiguration(name=self.transform_runtime_config.get_name())
|
|
44
|
+
super().__init__(runtime_config, data_access_factory)
|
|
45
|
+
self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
|
|
46
46
|
|
|
47
47
|
def __get_parameters(self) -> bool:
|
|
48
48
|
"""
|
|
@@ -57,12 +57,12 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
57
57
|
formatter_class=argparse.RawTextHelpFormatter,
|
|
58
58
|
)
|
|
59
59
|
# add additional arguments
|
|
60
|
-
self.
|
|
60
|
+
self.runtime_config.add_input_params(parser=parser)
|
|
61
61
|
self.data_access_factory.add_input_params(parser=parser)
|
|
62
62
|
self.execution_config.add_input_params(parser=parser)
|
|
63
63
|
args = parser.parse_args()
|
|
64
64
|
return (
|
|
65
|
-
self.
|
|
65
|
+
self.runtime_config.apply_input_params(args=args)
|
|
66
66
|
and self.execution_config.apply_input_params(args=args)
|
|
67
67
|
and self.data_access_factory.apply_input_params(args=args)
|
|
68
68
|
)
|
|
@@ -78,7 +78,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
78
78
|
logger.debug("Starting orchestrator")
|
|
79
79
|
res = orchestrate(
|
|
80
80
|
data_access_factory=self.data_access_factory,
|
|
81
|
-
|
|
81
|
+
runtime_config=self.runtime_config,
|
|
82
82
|
execution_config=self.execution_config,
|
|
83
83
|
)
|
|
84
84
|
logger.debug("Completed orchestrator")
|
|
@@ -15,11 +15,11 @@ import traceback
|
|
|
15
15
|
from datetime import datetime
|
|
16
16
|
|
|
17
17
|
from data_processing.data_access import DataAccessFactoryBase
|
|
18
|
-
from data_processing.runtime import
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
TransformTableProcessor,
|
|
18
|
+
from data_processing.runtime import (
|
|
19
|
+
TransformExecutionConfiguration,
|
|
20
|
+
TransformRuntimeConfiguration,
|
|
22
21
|
)
|
|
22
|
+
from data_processing.runtime.pure_python import PythonTransformFileProcessor
|
|
23
23
|
from data_processing.transform import TransformStatistics
|
|
24
24
|
from data_processing.utils import get_logger
|
|
25
25
|
|
|
@@ -29,17 +29,18 @@ logger = get_logger(__name__)
|
|
|
29
29
|
|
|
30
30
|
def orchestrate(
|
|
31
31
|
data_access_factory: DataAccessFactoryBase,
|
|
32
|
-
|
|
32
|
+
runtime_config: TransformRuntimeConfiguration,
|
|
33
33
|
execution_config: TransformExecutionConfiguration,
|
|
34
34
|
) -> int:
|
|
35
35
|
"""
|
|
36
36
|
orchestrator for transformer execution
|
|
37
37
|
:param data_access_factory: data access factory
|
|
38
|
-
:param
|
|
38
|
+
:param runtime_config: transformer configuration
|
|
39
|
+
:param execution_config: execution configuration
|
|
39
40
|
:return: 0 - success or 1 - failure
|
|
40
41
|
"""
|
|
41
42
|
start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
42
|
-
logger.info(f"orchestrator {
|
|
43
|
+
logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
|
|
43
44
|
try:
|
|
44
45
|
# create data access
|
|
45
46
|
data_access = data_access_factory.create_data_access()
|
|
@@ -59,15 +60,15 @@ def orchestrate(
|
|
|
59
60
|
# create statistics
|
|
60
61
|
statistics = TransformStatistics()
|
|
61
62
|
# create executor
|
|
62
|
-
executor =
|
|
63
|
-
data_access_factory=data_access_factory, statistics=statistics,
|
|
63
|
+
executor = PythonTransformFileProcessor(
|
|
64
|
+
data_access_factory=data_access_factory, statistics=statistics, runtime_configuration=runtime_config
|
|
64
65
|
)
|
|
65
66
|
# process data
|
|
66
|
-
logger.debug(f"{
|
|
67
|
+
logger.debug(f"{runtime_config.get_name()} Begin processing files")
|
|
67
68
|
t_start = time.time()
|
|
68
69
|
completed = 0
|
|
69
70
|
for path in files:
|
|
70
|
-
executor.
|
|
71
|
+
executor.process_file(path)
|
|
71
72
|
completed += 1
|
|
72
73
|
if completed % print_interval == 0:
|
|
73
74
|
logger.info(f"Completed {completed} files in {(time.time() - t_start)/60} min")
|
|
@@ -81,7 +82,7 @@ def orchestrate(
|
|
|
81
82
|
stats = statistics.get_execution_stats()
|
|
82
83
|
# build and save metadata
|
|
83
84
|
logger.debug("Building job metadata")
|
|
84
|
-
input_params =
|
|
85
|
+
input_params = runtime_config.get_transform_metadata()
|
|
85
86
|
metadata = {
|
|
86
87
|
"pipeline": execution_config.pipeline_id,
|
|
87
88
|
"job details": execution_config.job_details
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from data_processing.runtime.ray.ray_utils import RayUtils
|
|
2
2
|
from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay
|
|
3
|
-
from data_processing.runtime.ray.
|
|
4
|
-
from data_processing.runtime.ray.
|
|
5
|
-
from data_processing.runtime.ray.
|
|
6
|
-
from data_processing.runtime.ray.
|
|
3
|
+
from data_processing.runtime.ray.transform_runtime import DefaultRayTransformRuntime
|
|
4
|
+
from data_processing.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
|
|
5
|
+
from data_processing.runtime.ray.transform_file_processor import RayTransformFileProcessor
|
|
6
|
+
from data_processing.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
|
|
7
7
|
from data_processing.runtime.ray.transform_orchestrator import orchestrate
|
|
8
8
|
from data_processing.runtime.ray.transform_launcher import RayTransformLauncher
|
|
@@ -24,7 +24,7 @@ logger = get_logger(__name__)
|
|
|
24
24
|
cli_prefix = "runtime_"
|
|
25
25
|
|
|
26
26
|
|
|
27
|
-
class
|
|
27
|
+
class RayTransformExecutionConfiguration(TransformExecutionConfiguration):
|
|
28
28
|
"""
|
|
29
29
|
A class specifying and validating Ray orchestrator configuration
|
|
30
30
|
"""
|
|
@@ -33,7 +33,7 @@ class TransformOrchestratorConfiguration(TransformExecutionConfiguration):
|
|
|
33
33
|
"""
|
|
34
34
|
Initialization
|
|
35
35
|
"""
|
|
36
|
-
super().__init__(name=name,
|
|
36
|
+
super().__init__(name=name, print_params=False)
|
|
37
37
|
self.worker_options = {}
|
|
38
38
|
self.n_workers = 1
|
|
39
39
|
self.creation_delay = 0
|
|
@@ -91,10 +91,8 @@ class TransformOrchestratorConfiguration(TransformExecutionConfiguration):
|
|
|
91
91
|
|
|
92
92
|
# print them
|
|
93
93
|
logger.info(f"number of workers {self.n_workers} worker options {self.worker_options}")
|
|
94
|
-
logger.info(f"pipeline id {self.pipeline_id}; number workers {self.n_workers}")
|
|
95
|
-
logger.info(f"job details {self.job_details}")
|
|
96
|
-
logger.info(f"code location {self.code_location}")
|
|
97
94
|
logger.info(f"actor creation delay {self.creation_delay}")
|
|
95
|
+
logger.info(f"job details {self.job_details}")
|
|
98
96
|
return True
|
|
99
97
|
|
|
100
98
|
def get_input_params(self) -> dict[str, Any]:
|
|
@@ -132,12 +132,12 @@ class RayUtils:
|
|
|
132
132
|
completed = 0
|
|
133
133
|
for path in files:
|
|
134
134
|
if executors.has_free(): # still have room
|
|
135
|
-
executors.submit(lambda a, v: a.
|
|
135
|
+
executors.submit(lambda a, v: a.process_file.remote(v), path)
|
|
136
136
|
running = running + 1
|
|
137
137
|
files_in_progress_gauge.set(running)
|
|
138
138
|
else: # need to wait for some actors
|
|
139
139
|
executors.get_next_unordered()
|
|
140
|
-
executors.submit(lambda a, v: a.
|
|
140
|
+
executors.submit(lambda a, v: a.process_file.remote(v), path)
|
|
141
141
|
completed = completed + 1
|
|
142
142
|
files_completed_gauge.set(completed)
|
|
143
143
|
RayUtils.get_available_resources(
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
|
+
from data_processing.runtime.ray import DefaultRayTransformRuntime
|
|
15
|
+
from data_processing.transform import TransformConfiguration
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class RayTransformRuntimeConfiguration(TransformRuntimeConfiguration):
|
|
19
|
+
def __init__(
|
|
20
|
+
self,
|
|
21
|
+
transform_config: TransformConfiguration,
|
|
22
|
+
runtime_class: type[DefaultRayTransformRuntime] = DefaultRayTransformRuntime,
|
|
23
|
+
):
|
|
24
|
+
"""
|
|
25
|
+
Initialization
|
|
26
|
+
:param transform_config: base configuration class
|
|
27
|
+
:param runtime_class: implementation of the transform runtime
|
|
28
|
+
:param remove_from_metadata - list of parameters to remove from metadata
|
|
29
|
+
"""
|
|
30
|
+
super().__init__(transform_config=transform_config)
|
|
31
|
+
self.runtime_class = runtime_class
|
|
32
|
+
|
|
33
|
+
def create_transform_runtime(self) -> DefaultRayTransformRuntime:
|
|
34
|
+
"""
|
|
35
|
+
Create transform runtime with the parameters captured during apply_input_params()
|
|
36
|
+
:return: transform runtime object
|
|
37
|
+
"""
|
|
38
|
+
return self.runtime_class(self.transform_config.get_transform_params())
|