data-prep-toolkit 0.1.1__tar.gz → 0.2.0.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/Makefile +15 -22
- {data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.0.dev2}/PKG-INFO +1 -4
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/pyproject.toml +1 -5
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -4
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt +6 -19
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/requires.txt +0 -3
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_local.py +8 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_s3.py +36 -7
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_file_processor.py +2 -4
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_file_processor.py +13 -5
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/transform_test.py +12 -6
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/__init__.py +0 -1
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/noop_transform.py +2 -19
- data_prep_toolkit-0.2.0.dev2/src/data_processing/transform/abstract_transform.py +16 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/binary_transform.py +7 -2
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/table_transform.py +14 -12
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_configuration.py +3 -2
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +1 -1
- data_prep_toolkit-0.1.1/.gitignore +0 -35
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/__init__.py +0 -8
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/execution_configuration.py +0 -107
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/ray_utils.py +0 -180
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/runtime_configuration.py +0 -38
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_file_processor.py +0 -46
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_launcher.py +0 -124
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_orchestrator.py +0 -143
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_runtime.py +0 -53
- data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_statistics.py +0 -66
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/launcher_test.py +0 -189
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/multi_launcher_test.py +0 -80
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/ray_util_test.py +0 -105
- data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -41
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/README.md +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/setup.cfg +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/arrow_s3.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/execution_configuration.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/runtime_configuration.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_launcher.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/runtime_configuration.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_launcher.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/abstract_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/transform_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_statistics.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/__init__.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/log.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/params_utils.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/transform_utils.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/subdir/test1.parquet +0 -0
|
@@ -13,27 +13,19 @@ clean::
|
|
|
13
13
|
.check-env::
|
|
14
14
|
@echo "Checks passed"
|
|
15
15
|
|
|
16
|
-
update-toml:: .check-env
|
|
17
|
-
@# Help: Copy the Makefile distribution version into the pyproject.toml
|
|
18
|
-
sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
|
|
19
|
-
mv tt.toml pyproject.toml
|
|
20
|
-
|
|
21
16
|
setup::
|
|
22
17
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
${PIP} install --upgrade build
|
|
28
|
-
${PYTHON} -m build
|
|
18
|
+
set-versions:: .check-env
|
|
19
|
+
$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
|
|
20
|
+
|
|
21
|
+
build:: build-dist
|
|
29
22
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
#@git push origin ${TAG}
|
|
23
|
+
#build:: update-toml .defaults.build-dist
|
|
24
|
+
build-dist :: set-versions .defaults.build-dist
|
|
25
|
+
|
|
26
|
+
publish:: publish-dist
|
|
27
|
+
|
|
28
|
+
publish-dist :: .check-env .defaults.publish-dist
|
|
37
29
|
|
|
38
30
|
venv:: pyproject.toml
|
|
39
31
|
@# Help: Create the virtual environment using pyproject.toml
|
|
@@ -46,10 +38,14 @@ venv:: pyproject.toml
|
|
|
46
38
|
pip install -e .; \
|
|
47
39
|
pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1
|
|
48
40
|
|
|
41
|
+
image::
|
|
42
|
+
@# Help: Placeholder does nothing for now.
|
|
43
|
+
@echo "Image building for ray is in the works (comming soon)."
|
|
49
44
|
|
|
50
45
|
# Here we run each test directory of tests and each ray launched test separately, because
|
|
51
46
|
# it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
|
|
52
47
|
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
|
|
48
|
+
# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
|
|
53
49
|
test::
|
|
54
50
|
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
55
51
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
@@ -57,8 +53,5 @@ test::
|
|
|
57
53
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
|
|
58
54
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/multi_launcher_test.py;
|
|
59
55
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
|
|
60
|
-
|
|
61
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/multi_launcher_test.py;
|
|
62
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
|
|
63
|
-
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
|
|
56
|
+
|
|
64
57
|
|
{data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.0.dev2}/PKG-INFO
RENAMED
|
@@ -1,18 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0.dev2
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Requires-Python: >=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist: ray[default]==2.9.3
|
|
10
9
|
Requires-Dist: pyarrow==15.0.2
|
|
11
10
|
Requires-Dist: boto3==1.34.69
|
|
12
11
|
Requires-Dist: argparse
|
|
13
12
|
Requires-Dist: mmh3
|
|
14
|
-
Requires-Dist: fastapi>=0.109.1
|
|
15
|
-
Requires-Dist: pillow>=10.2.0
|
|
16
13
|
Provides-Extra: dev
|
|
17
14
|
Requires-Dist: twine; extra == "dev"
|
|
18
15
|
Requires-Dist: pytest>=7.3.2; extra == "dev"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_toolkit"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.2.0.dev2"
|
|
4
4
|
requires-python = ">=3.10"
|
|
5
5
|
description = "Data Preparation Toolkit Library"
|
|
6
6
|
license = {text = "Apache-2.0"}
|
|
@@ -10,14 +10,10 @@ authors = [
|
|
|
10
10
|
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
|
|
11
11
|
]
|
|
12
12
|
dependencies = [
|
|
13
|
-
"ray[default]==2.9.3",
|
|
14
13
|
"pyarrow==15.0.2",
|
|
15
14
|
"boto3==1.34.69",
|
|
16
15
|
"argparse",
|
|
17
16
|
"mmh3",
|
|
18
|
-
# These two are to fix security issue
|
|
19
|
-
"fastapi>=0.109.1",
|
|
20
|
-
"pillow>=10.2.0",
|
|
21
17
|
]
|
|
22
18
|
|
|
23
19
|
[build-system]
|
{data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO
RENAMED
|
@@ -1,18 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0.dev2
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
7
|
Requires-Python: >=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist: ray[default]==2.9.3
|
|
10
9
|
Requires-Dist: pyarrow==15.0.2
|
|
11
10
|
Requires-Dist: boto3==1.34.69
|
|
12
11
|
Requires-Dist: argparse
|
|
13
12
|
Requires-Dist: mmh3
|
|
14
|
-
Requires-Dist: fastapi>=0.109.1
|
|
15
|
-
Requires-Dist: pillow>=10.2.0
|
|
16
13
|
Provides-Extra: dev
|
|
17
14
|
Requires-Dist: twine; extra == "dev"
|
|
18
15
|
Requires-Dist: pytest>=7.3.2; extra == "dev"
|
{data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt
RENAMED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
.gitignore
|
|
2
1
|
Makefile
|
|
3
2
|
README.md
|
|
4
3
|
pyproject.toml
|
|
@@ -25,15 +24,6 @@ src/data_processing/runtime/pure_python/runtime_configuration.py
|
|
|
25
24
|
src/data_processing/runtime/pure_python/transform_file_processor.py
|
|
26
25
|
src/data_processing/runtime/pure_python/transform_launcher.py
|
|
27
26
|
src/data_processing/runtime/pure_python/transform_orchestrator.py
|
|
28
|
-
src/data_processing/runtime/ray/__init__.py
|
|
29
|
-
src/data_processing/runtime/ray/execution_configuration.py
|
|
30
|
-
src/data_processing/runtime/ray/ray_utils.py
|
|
31
|
-
src/data_processing/runtime/ray/runtime_configuration.py
|
|
32
|
-
src/data_processing/runtime/ray/transform_file_processor.py
|
|
33
|
-
src/data_processing/runtime/ray/transform_launcher.py
|
|
34
|
-
src/data_processing/runtime/ray/transform_orchestrator.py
|
|
35
|
-
src/data_processing/runtime/ray/transform_runtime.py
|
|
36
|
-
src/data_processing/runtime/ray/transform_statistics.py
|
|
37
27
|
src/data_processing/test_support/__init__.py
|
|
38
28
|
src/data_processing/test_support/abstract_test.py
|
|
39
29
|
src/data_processing/test_support/data_access/__init__.py
|
|
@@ -44,6 +34,7 @@ src/data_processing/test_support/transform/__init__.py
|
|
|
44
34
|
src/data_processing/test_support/transform/noop_transform.py
|
|
45
35
|
src/data_processing/test_support/transform/transform_test.py
|
|
46
36
|
src/data_processing/transform/__init__.py
|
|
37
|
+
src/data_processing/transform/abstract_transform.py
|
|
47
38
|
src/data_processing/transform/binary_transform.py
|
|
48
39
|
src/data_processing/transform/table_transform.py
|
|
49
40
|
src/data_processing/transform/transform_configuration.py
|
|
@@ -62,11 +53,11 @@ test-data/data_processing/input/sample1.parquet
|
|
|
62
53
|
test-data/data_processing/input_multiple/sample1.parquet
|
|
63
54
|
test-data/data_processing/input_multiple/sample2.parquet
|
|
64
55
|
test-data/data_processing/input_multiple/sample3.parquet
|
|
65
|
-
test-data/data_processing/ray/noop/expected/metadata.json
|
|
66
|
-
test-data/data_processing/ray/noop/expected/sample1.parquet
|
|
67
|
-
test-data/data_processing/ray/noop/expected/subdir/test1.parquet
|
|
68
|
-
test-data/data_processing/ray/noop/input/sample1.parquet
|
|
69
|
-
test-data/data_processing/ray/noop/input/subdir/test1.parquet
|
|
56
|
+
test-data/data_processing/python/noop/expected/metadata.json
|
|
57
|
+
test-data/data_processing/python/noop/expected/sample1.parquet
|
|
58
|
+
test-data/data_processing/python/noop/expected/subdir/test1.parquet
|
|
59
|
+
test-data/data_processing/python/noop/input/sample1.parquet
|
|
60
|
+
test-data/data_processing/python/noop/input/subdir/test1.parquet
|
|
70
61
|
test/data_processing_tests/data_access/daf_local_test.py
|
|
71
62
|
test/data_processing_tests/data_access/data_access_local_test.py
|
|
72
63
|
test/data_processing_tests/data_access/data_access_s3_test.py
|
|
@@ -74,9 +65,5 @@ test/data_processing_tests/data_access/sample_input_data_test.py
|
|
|
74
65
|
test/data_processing_tests/launch/pure_python/launcher_test.py
|
|
75
66
|
test/data_processing_tests/launch/pure_python/multi_launcher_test.py
|
|
76
67
|
test/data_processing_tests/launch/pure_python/test_noop_launch.py
|
|
77
|
-
test/data_processing_tests/launch/ray/launcher_test.py
|
|
78
|
-
test/data_processing_tests/launch/ray/multi_launcher_test.py
|
|
79
|
-
test/data_processing_tests/launch/ray/ray_util_test.py
|
|
80
|
-
test/data_processing_tests/launch/ray/test_noop_launch.py
|
|
81
68
|
test/data_processing_tests/transform/test_noop.py
|
|
82
69
|
test/data_processing_tests/util/transform_utils_test.py
|
|
@@ -55,6 +55,14 @@ class DataAccessLocal(DataAccess):
|
|
|
55
55
|
self.n_samples = n_samples
|
|
56
56
|
self.files_to_use = files_to_use
|
|
57
57
|
|
|
58
|
+
logger.debug(f"Local input folder: {self.input_folder}")
|
|
59
|
+
logger.debug(f"Local output folder: {self.output_folder}")
|
|
60
|
+
logger.debug(f"Local data sets: {self.d_sets}")
|
|
61
|
+
logger.debug(f"Local checkpoint: {self.checkpoint}")
|
|
62
|
+
logger.debug(f"Local m_files: {self.m_files}")
|
|
63
|
+
logger.debug(f"Local n_samples: {self.n_samples}")
|
|
64
|
+
logger.debug(f"Local files_to_use: {self.files_to_use}")
|
|
65
|
+
|
|
58
66
|
def get_num_samples(self) -> int:
|
|
59
67
|
"""
|
|
60
68
|
Get number of samples for input
|
|
@@ -48,15 +48,18 @@ class DataAccessS3(DataAccess):
|
|
|
48
48
|
:param n_samples: amount of files to randomly sample
|
|
49
49
|
:param files_to_use: files extensions of files to include
|
|
50
50
|
"""
|
|
51
|
-
self.
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
51
|
+
self.s3_credentials = {} | s3_credentials
|
|
52
|
+
access_key = self.get_access_key()
|
|
53
|
+
if access_key is None:
|
|
54
|
+
raise ValueError("S3 access key not provided")
|
|
55
|
+
secret_key = self.get_secret_key()
|
|
56
|
+
if secret_key is None:
|
|
57
|
+
raise ValueError("S3 secret key not provided")
|
|
58
|
+
endpoint = self.get_endpoint()
|
|
59
|
+
region = self.get_region()
|
|
57
60
|
if s3_config is None:
|
|
58
61
|
self.input_folder = None
|
|
59
|
-
self.
|
|
62
|
+
self.output_folder = None
|
|
60
63
|
else:
|
|
61
64
|
self.input_folder = TransformUtils.clean_path(s3_config["input_folder"])
|
|
62
65
|
self.output_folder = TransformUtils.clean_path(s3_config["output_folder"])
|
|
@@ -66,6 +69,32 @@ class DataAccessS3(DataAccess):
|
|
|
66
69
|
self.n_samples = n_samples
|
|
67
70
|
self.files_to_use = files_to_use
|
|
68
71
|
|
|
72
|
+
logger.debug(f"S3 access key provided: {access_key}")
|
|
73
|
+
logger.debug(f"S3 secret key provided: no soup for you!")
|
|
74
|
+
logger.debug(f"S3 region {region}")
|
|
75
|
+
logger.debug(f"S3 endpoint/url: {endpoint}")
|
|
76
|
+
logger.debug(f"S input folder: {self.input_folder}")
|
|
77
|
+
logger.debug(f"S3 output folder: {self.output_folder}")
|
|
78
|
+
logger.debug(f"S3 data sets: {self.d_sets}")
|
|
79
|
+
logger.debug(f"S3 checkpoint: {self.checkpoint}")
|
|
80
|
+
logger.debug(f"S3 m_files: {self.m_files}")
|
|
81
|
+
logger.debug(f"S3 n_samples: {self.n_samples}")
|
|
82
|
+
logger.debug(f"S3 files_to_use: {self.files_to_use}")
|
|
83
|
+
|
|
84
|
+
self.arrS3 = ArrowS3(access_key, secret_key, endpoint=endpoint, region=region)
|
|
85
|
+
|
|
86
|
+
def get_access_key(self):
|
|
87
|
+
return self.s3_credentials.get("access_key", None)
|
|
88
|
+
|
|
89
|
+
def get_secret_key(self):
|
|
90
|
+
return self.s3_credentials.get("secret_key", None)
|
|
91
|
+
|
|
92
|
+
def get_endpoint(self):
|
|
93
|
+
return self.s3_credentials.get("url", None)
|
|
94
|
+
|
|
95
|
+
def get_region(self):
|
|
96
|
+
return self.s3_credentials.get("region", None)
|
|
97
|
+
|
|
69
98
|
def get_num_samples(self) -> int:
|
|
70
99
|
"""
|
|
71
100
|
Get number of samples for input
|
|
@@ -13,11 +13,9 @@
|
|
|
13
13
|
from typing import Any
|
|
14
14
|
|
|
15
15
|
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
-
from data_processing.runtime import (
|
|
17
|
-
AbstractTransformFileProcessor,
|
|
18
|
-
)
|
|
19
|
-
from data_processing.transform import TransformStatistics
|
|
16
|
+
from data_processing.runtime import AbstractTransformFileProcessor
|
|
20
17
|
from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
|
|
18
|
+
from data_processing.transform import TransformStatistics
|
|
21
19
|
|
|
22
20
|
|
|
23
21
|
class PythonTransformFileProcessor(AbstractTransformFileProcessor):
|
|
@@ -58,7 +58,7 @@ class AbstractTransformFileProcessor:
|
|
|
58
58
|
name_extension = TransformUtils.get_file_extension(f_name)
|
|
59
59
|
self.logger.debug(f"Begin transforming file {f_name}")
|
|
60
60
|
out_files, stats = self.transform.transform_binary(byte_array=filedata, ext=name_extension[1])
|
|
61
|
-
self.logger.debug(f"Done transforming file {f_name}")
|
|
61
|
+
self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files")
|
|
62
62
|
self.last_file_name = name_extension[0]
|
|
63
63
|
self.last_file_name_next_index = None
|
|
64
64
|
self.last_extension = name_extension[1]
|
|
@@ -83,7 +83,9 @@ class AbstractTransformFileProcessor:
|
|
|
83
83
|
try:
|
|
84
84
|
t_start = time.time()
|
|
85
85
|
# get flush results
|
|
86
|
-
self.logger.debug(
|
|
86
|
+
self.logger.debug(
|
|
87
|
+
f"Begin flushing transform, last file name {self.last_file_name}, last index {self.last_file_name_next_index}"
|
|
88
|
+
)
|
|
87
89
|
out_files, stats = self.transform.flush_binary()
|
|
88
90
|
self.logger.debug(f"Done flushing transform, got {len(out_files)} files")
|
|
89
91
|
# Here we are using the name of the last file, that we were processing
|
|
@@ -113,9 +115,12 @@ class AbstractTransformFileProcessor:
|
|
|
113
115
|
case 1:
|
|
114
116
|
# we have exactly 1 output file
|
|
115
117
|
file_ext = out_files[0]
|
|
116
|
-
|
|
118
|
+
lfn = self.last_file_name
|
|
119
|
+
if self.last_file_name_next_index is not None:
|
|
120
|
+
lfn = f"{lfn}_{self.last_file_name_next_index}"
|
|
121
|
+
output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
|
|
117
122
|
self.logger.debug(
|
|
118
|
-
f"Writing transformed file {self.last_file_name}{self.last_extension}
|
|
123
|
+
f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
|
|
119
124
|
)
|
|
120
125
|
save_res = self.data_access.save_file(path=output_name, data=file_ext[0])
|
|
121
126
|
if save_res is not None:
|
|
@@ -130,7 +135,10 @@ class AbstractTransformFileProcessor:
|
|
|
130
135
|
else:
|
|
131
136
|
self.logger.warning(f"Failed to write file {output_name}")
|
|
132
137
|
self._publish_stats({"failed_writes": 1})
|
|
133
|
-
self.last_file_name_next_index
|
|
138
|
+
if self.last_file_name_next_index is None:
|
|
139
|
+
self.last_file_name_next_index = 0
|
|
140
|
+
else:
|
|
141
|
+
self.last_file_name_next_index += 1
|
|
134
142
|
case _:
|
|
135
143
|
# we have more then 1 file
|
|
136
144
|
file_sizes = 0
|
|
@@ -14,7 +14,6 @@ import sys
|
|
|
14
14
|
import tempfile
|
|
15
15
|
from typing import Any
|
|
16
16
|
|
|
17
|
-
from data_processing.runtime.ray import RayTransformLauncher
|
|
18
17
|
from data_processing.runtime.transform_launcher import AbstractTransformLauncher
|
|
19
18
|
from data_processing.test_support.abstract_test import AbstractTest
|
|
20
19
|
from data_processing.utils import ParamsUtils
|
|
@@ -22,7 +21,7 @@ from data_processing.utils import ParamsUtils
|
|
|
22
21
|
|
|
23
22
|
class AbstractTransformLauncherTest(AbstractTest):
|
|
24
23
|
"""
|
|
25
|
-
The
|
|
24
|
+
The launcher test class for all/most AbstractTransformLauncher implementations.
|
|
26
25
|
Generic tests are provided here, and sub-classes must implement the _get*_fixture() method(s)
|
|
27
26
|
to provide the test data for a given test method. For example, get_test_transform_fixtures()
|
|
28
27
|
provides the test data for the test_transform() test method.
|
|
@@ -36,8 +35,8 @@ class AbstractTransformLauncherTest(AbstractTest):
|
|
|
36
35
|
args = {} | cli_params
|
|
37
36
|
local_ast = {"input_folder": in_table_path, "output_folder": out_table_path}
|
|
38
37
|
args["data_local_config"] = local_ast
|
|
39
|
-
if isinstance(launcher, RayTransformLauncher):
|
|
40
|
-
|
|
38
|
+
# if isinstance(launcher, RayTransformLauncher):
|
|
39
|
+
# args["run_locally"] = "True"
|
|
41
40
|
argv = ParamsUtils.dict_to_req(args)
|
|
42
41
|
return argv
|
|
43
42
|
|
|
@@ -52,7 +51,7 @@ class AbstractTransformLauncherTest(AbstractTest):
|
|
|
52
51
|
Test the given transform and its runtime using the given CLI arguments, input directory of data files and expected output directory.
|
|
53
52
|
Data is processed into a temporary output directory which is then compared with the directory of expected output.
|
|
54
53
|
:param launcher: launcher configured to run the transform being tested
|
|
55
|
-
:param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and
|
|
54
|
+
:param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and the launching args.
|
|
56
55
|
:param in_table_path: a directory containing the input parquet files to be processed and results compared against the expected output table path.
|
|
57
56
|
:param expected_out_table_path: directory contain parquet and metadata.json that is expected to match the processed input directory.
|
|
58
57
|
:return:
|
|
@@ -62,7 +61,14 @@ class AbstractTransformLauncherTest(AbstractTest):
|
|
|
62
61
|
print(f"Using temporary output path {temp_dir}")
|
|
63
62
|
sys.argv = self._get_argv(launcher, cli_params, in_table_path, temp_dir)
|
|
64
63
|
launcher.launch()
|
|
65
|
-
|
|
64
|
+
self._validate_directory_contents_match(temp_dir, expected_out_table_path)
|
|
65
|
+
|
|
66
|
+
def _validate_directory_contents_match(self, dir: str, expected: str):
|
|
67
|
+
"""
|
|
68
|
+
Confirm that the two directories contains the same files.
|
|
69
|
+
Stubbed out like this to allow spark tests to override this since spark tends to rename the files.
|
|
70
|
+
"""
|
|
71
|
+
AbstractTest.validate_directory_contents(dir, expected)
|
|
66
72
|
|
|
67
73
|
def _install_test_fixtures(self, metafunc):
|
|
68
74
|
# Apply the fixtures for the method with these input names (i.e. test_transform()).
|
|
@@ -15,13 +15,10 @@ from argparse import ArgumentParser, Namespace
|
|
|
15
15
|
from typing import Any
|
|
16
16
|
|
|
17
17
|
import pyarrow as pa
|
|
18
|
+
from data_processing.runtime.pure_python import PythonTransformLauncher
|
|
18
19
|
from data_processing.runtime.pure_python.runtime_configuration import (
|
|
19
20
|
PythonTransformRuntimeConfiguration,
|
|
20
21
|
)
|
|
21
|
-
from data_processing.runtime.ray import RayTransformLauncher
|
|
22
|
-
from data_processing.runtime.ray.runtime_configuration import (
|
|
23
|
-
RayTransformRuntimeConfiguration,
|
|
24
|
-
)
|
|
25
22
|
from data_processing.transform import AbstractTableTransform, TransformConfiguration
|
|
26
23
|
from data_processing.utils import CLIArgumentProvider, get_logger
|
|
27
24
|
|
|
@@ -138,22 +135,8 @@ class NOOPPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
|
|
|
138
135
|
super().__init__(transform_config=NOOPTransformConfiguration())
|
|
139
136
|
|
|
140
137
|
|
|
141
|
-
class NOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
|
|
142
|
-
"""
|
|
143
|
-
Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher.
|
|
144
|
-
NOOP does not use a RayRuntime class so the superclass only needs the base
|
|
145
|
-
python-only configuration.
|
|
146
|
-
"""
|
|
147
|
-
|
|
148
|
-
def __init__(self):
|
|
149
|
-
"""
|
|
150
|
-
Initialization
|
|
151
|
-
"""
|
|
152
|
-
super().__init__(transform_config=NOOPTransformConfiguration())
|
|
153
|
-
|
|
154
|
-
|
|
155
138
|
if __name__ == "__main__":
|
|
156
139
|
# launcher = NOOPRayLauncher()
|
|
157
|
-
launcher = RayTransformLauncher(NOOPRayTransformConfiguration())
|
|
140
|
+
launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
|
|
158
141
|
logger.info("Launching noop transform")
|
|
159
142
|
launcher.launch()
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Any, Generic, TypeVar
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
DATA = TypeVar("DATA")
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AbstractTransform(Generic[DATA]):
|
|
8
|
+
def transform(self, data: DATA) -> tuple[list[DATA], dict[str, Any]]:
|
|
9
|
+
"""
|
|
10
|
+
Converts input table into an output table.
|
|
11
|
+
If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
|
|
12
|
+
:param table: input table
|
|
13
|
+
:return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
|
|
14
|
+
propagated to metadata
|
|
15
|
+
"""
|
|
16
|
+
raise NotImplemented()
|
|
@@ -10,10 +10,15 @@
|
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any, TypeVar
|
|
14
14
|
|
|
15
|
+
from data_processing.transform.abstract_transform import AbstractTransform
|
|
15
16
|
|
|
16
|
-
|
|
17
|
+
|
|
18
|
+
DATA = TypeVar("DATA")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AbstractBinaryTransform(AbstractTransform[DATA]):
|
|
17
22
|
"""
|
|
18
23
|
Converts input binary file to output file(s) (binary)
|
|
19
24
|
Sub-classes must provide the transform() method to provide the conversion of one binary file to 0 or
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
# limitations under the License.
|
|
11
11
|
################################################################################
|
|
12
12
|
|
|
13
|
-
from typing import Any
|
|
13
|
+
from typing import Any, TypeVar
|
|
14
14
|
|
|
15
15
|
import pyarrow as pa
|
|
16
16
|
from data_processing.transform import AbstractBinaryTransform
|
|
@@ -20,7 +20,7 @@ from data_processing.utils import TransformUtils, get_logger
|
|
|
20
20
|
logger = get_logger(__name__)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class AbstractTableTransform(AbstractBinaryTransform):
|
|
23
|
+
class AbstractTableTransform(AbstractBinaryTransform[pa.Table]):
|
|
24
24
|
"""
|
|
25
25
|
Extends AbstractBinaryTransform to expect the byte arrays to contain a pyarrow Table.
|
|
26
26
|
Sub-classes are expected to implement transform() on the parsed Table instances.
|
|
@@ -59,17 +59,19 @@ class AbstractTableTransform(AbstractBinaryTransform):
|
|
|
59
59
|
# Add number of rows to stats
|
|
60
60
|
stats = stats | {"source_doc_count": table.num_rows}
|
|
61
61
|
# convert tables to files
|
|
62
|
-
return self._check_and_convert_tables(
|
|
62
|
+
return self._check_and_convert_tables(
|
|
63
|
+
out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows}
|
|
64
|
+
)
|
|
63
65
|
|
|
64
|
-
def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
66
|
+
# def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
|
|
67
|
+
# """
|
|
68
|
+
# Converts input table into an output table.
|
|
69
|
+
# If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
|
|
70
|
+
# :param table: input table
|
|
71
|
+
# :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
|
|
72
|
+
# propagated to metadata
|
|
73
|
+
# """
|
|
74
|
+
# raise NotImplemented()
|
|
73
75
|
|
|
74
76
|
def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
|
|
75
77
|
"""
|
|
@@ -14,6 +14,7 @@ from argparse import ArgumentParser
|
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
16
|
from data_processing.transform import AbstractBinaryTransform
|
|
17
|
+
from data_processing.transform.abstract_transform import AbstractTransform
|
|
17
18
|
from data_processing.utils import CLIArgumentProvider
|
|
18
19
|
|
|
19
20
|
|
|
@@ -22,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider):
|
|
|
22
23
|
This is a base transform configuration class defining transform's input/output parameter
|
|
23
24
|
"""
|
|
24
25
|
|
|
25
|
-
def __init__(self, name: str, transform_class: type[
|
|
26
|
+
def __init__(self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = []):
|
|
26
27
|
"""
|
|
27
28
|
Initialization
|
|
28
29
|
:param name: transformer name
|
|
@@ -34,7 +35,7 @@ class TransformConfiguration(CLIArgumentProvider):
|
|
|
34
35
|
self.remove_from_metadata = remove_from_metadata
|
|
35
36
|
self.params = {}
|
|
36
37
|
|
|
37
|
-
def get_transform_class(self) -> type[
|
|
38
|
+
def get_transform_class(self) -> type[AbstractTransform]:
|
|
38
39
|
"""
|
|
39
40
|
Get the class extending AbstractTransform which implements a specific transformation.
|
|
40
41
|
The class will generally be instantiated with a dictionary of configuration produced by
|
|
@@ -32,7 +32,7 @@ class TestRayNOOPTransform(AbstractTransformLauncherTest):
|
|
|
32
32
|
"""
|
|
33
33
|
|
|
34
34
|
def get_test_transform_fixtures(self) -> list[tuple]:
|
|
35
|
-
basedir = "../../../../test-data/data_processing/
|
|
35
|
+
basedir = "../../../../test-data/data_processing/python/noop/"
|
|
36
36
|
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
|
|
37
37
|
launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
|
|
38
38
|
fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")]
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
# Byte-compiled / optimized / DLL files
|
|
5
|
-
__pycache__/
|
|
6
|
-
*.py[cod]
|
|
7
|
-
*$py.class
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
# Distribution / packaging
|
|
11
|
-
bin/
|
|
12
|
-
build/
|
|
13
|
-
develop-eggs/
|
|
14
|
-
dist/
|
|
15
|
-
eggs/
|
|
16
|
-
lib/
|
|
17
|
-
lib64/
|
|
18
|
-
parts/
|
|
19
|
-
sdist/
|
|
20
|
-
var/
|
|
21
|
-
*.egg-info/
|
|
22
|
-
.installed.cfg
|
|
23
|
-
*.egg
|
|
24
|
-
|
|
25
|
-
# Installer logs
|
|
26
|
-
pip-log.txt
|
|
27
|
-
pip-delete-this-directory.txt
|
|
28
|
-
|
|
29
|
-
# Unit test / coverage reports
|
|
30
|
-
.tox/
|
|
31
|
-
htmlcov
|
|
32
|
-
.coverage
|
|
33
|
-
.cache
|
|
34
|
-
nosetests.xml
|
|
35
|
-
coverage.xml
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
from data_processing.runtime.ray.ray_utils import RayUtils
|
|
2
|
-
from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay
|
|
3
|
-
from data_processing.runtime.ray.transform_runtime import DefaultRayTransformRuntime
|
|
4
|
-
from data_processing.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
|
|
5
|
-
from data_processing.runtime.ray.transform_file_processor import RayTransformFileProcessor
|
|
6
|
-
from data_processing.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
|
|
7
|
-
from data_processing.runtime.ray.transform_orchestrator import orchestrate
|
|
8
|
-
from data_processing.runtime.ray.transform_launcher import RayTransformLauncher
|