data-prep-toolkit 0.2.0.dev5__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/Makefile +4 -3
- {data_prep_toolkit-0.2.0.dev5/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1}/PKG-INFO +7 -3
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/README.md +3 -1
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/pyproject.toml +10 -3
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1/src/data_prep_toolkit.egg-info}/PKG-INFO +7 -3
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/SOURCES.txt +14 -2
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/requires.txt +2 -1
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/__init__.py +1 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/arrow_s3.py +62 -43
- data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access.py +457 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/data_access_factory.py +21 -8
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/data_access_factory_base.py +1 -0
- data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_local.py +249 -0
- data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_s3.py +207 -0
- data_prep_toolkit-0.2.1/src/data_processing/data_access/snapshot_utils.py +31 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/__init__.py +1 -1
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/execution_configuration.py +5 -5
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/__init__.py +10 -0
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/execution_configuration.py +70 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/pure_python/runtime_configuration.py +15 -2
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_file_processor.py +107 -0
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_invoker.py +159 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/pure_python/transform_launcher.py +3 -36
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_orchestrator.py +224 -0
- data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_runtime.py +53 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/transform_file_processor.py +53 -23
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/transform_launcher.py +52 -1
- data_prep_toolkit-0.2.1/src/data_processing/test_support/__init__.py +1 -0
- data_prep_toolkit-0.2.1/src/data_processing/test_support/abstract_test.py +275 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/data_access/data_access_factory_test.py +1 -1
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/launch/transform_test.py +17 -12
- data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/__init__.py +6 -0
- data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/binary_transform_test.py +85 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/transform/noop_transform.py +1 -1
- data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/transform/transform_test.py → data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/table_transform_test.py +10 -8
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/binary_transform.py +3 -8
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/table_transform.py +31 -30
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/transform_configuration.py +6 -5
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/transform_statistics.py +1 -2
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/__init__.py +3 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/log.py +9 -2
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/params_utils.py +8 -6
- data_prep_toolkit-0.2.1/src/data_processing/utils/pipinstaller.py +76 -0
- data_prep_toolkit-0.2.1/src/data_processing/utils/transform_configuration.json +158 -0
- data_prep_toolkit-0.2.1/src/data_processing/utils/transform_configurator.py +91 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/transform_utils.py +26 -9
- data_prep_toolkit-0.2.1/src/data_processing/utils/unrecoverable.py +7 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/data_access_local_test.py +52 -42
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/data_access_s3_test.py +11 -9
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/sample_input_data_test.py +1 -1
- data_prep_toolkit-0.2.1/test/data_processing_tests/invoker/python_invoker_test.py +48 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -5
- data_prep_toolkit-0.2.1/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +37 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/transform/test_noop.py +4 -2
- data_prep_toolkit-0.2.1/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
- data_prep_toolkit-0.2.1/test-data/data_processing/python/noop/input/test1.parquet +0 -0
- data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access.py +0 -228
- data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access_local.py +0 -407
- data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access_s3.py +0 -373
- data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/__init__.py +0 -4
- data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/transform_file_processor.py +0 -51
- data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -104
- data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/__init__.py +0 -1
- data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/abstract_test.py +0 -185
- data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/transform/__init__.py +0 -5
- data_prep_toolkit-0.2.0.dev5/src/data_processing/transform/abstract_transform.py +0 -16
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/setup.cfg +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/runtime_configuration.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/__init__.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
|
@@ -15,13 +15,13 @@ clean::
|
|
|
15
15
|
|
|
16
16
|
setup::
|
|
17
17
|
|
|
18
|
-
set-versions
|
|
18
|
+
set-versions: .check-env
|
|
19
19
|
$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
|
|
20
20
|
|
|
21
21
|
build:: build-dist
|
|
22
22
|
|
|
23
23
|
#build:: update-toml .defaults.build-dist
|
|
24
|
-
build-dist ::
|
|
24
|
+
build-dist :: .defaults.build-dist
|
|
25
25
|
|
|
26
26
|
publish:: publish-dist
|
|
27
27
|
|
|
@@ -46,7 +46,8 @@ image::
|
|
|
46
46
|
# it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
|
|
47
47
|
# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
|
|
48
48
|
# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
|
|
49
|
-
test
|
|
49
|
+
.PHONY: test
|
|
50
|
+
test:: venv
|
|
50
51
|
@# Help: Use the already-built virtual environment to run pytest on the test directory.
|
|
51
52
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
|
|
52
53
|
source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
|
{data_prep_toolkit-0.2.0.dev5/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1}/PKG-INFO
RENAMED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
+
Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
|
|
7
8
|
Requires-Python: >=3.10
|
|
8
9
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist:
|
|
10
|
+
Requires-Dist: numpy<1.29.0
|
|
11
|
+
Requires-Dist: pyarrow==16.1.0
|
|
10
12
|
Requires-Dist: boto3==1.34.69
|
|
11
13
|
Requires-Dist: argparse
|
|
12
14
|
Requires-Dist: mmh3
|
|
@@ -41,12 +43,14 @@ source venv/bin/activate
|
|
|
41
43
|
or set up your IDE to use the venv directory when developing in this project
|
|
42
44
|
|
|
43
45
|
## Library Artifact Build and Publish
|
|
44
|
-
To test, build and publish the library
|
|
46
|
+
To test, build and publish the library
|
|
45
47
|
```shell
|
|
46
48
|
make test build publish
|
|
47
49
|
```
|
|
50
|
+
|
|
48
51
|
To up the version number, edit the Makefile to change VERSION and rerun
|
|
49
52
|
the above. This will require committing both the `Makefile` and the
|
|
50
53
|
automatically updated `pyproject.toml` file.
|
|
51
54
|
|
|
52
55
|
|
|
56
|
+
|
|
@@ -18,12 +18,14 @@ source venv/bin/activate
|
|
|
18
18
|
or set up your IDE to use the venv directory when developing in this project
|
|
19
19
|
|
|
20
20
|
## Library Artifact Build and Publish
|
|
21
|
-
To test, build and publish the library
|
|
21
|
+
To test, build and publish the library
|
|
22
22
|
```shell
|
|
23
23
|
make test build publish
|
|
24
24
|
```
|
|
25
|
+
|
|
25
26
|
To up the version number, edit the Makefile to change VERSION and rerun
|
|
26
27
|
the above. This will require committing both the `Makefile` and the
|
|
27
28
|
automatically updated `pyproject.toml` file.
|
|
28
29
|
|
|
29
30
|
|
|
31
|
+
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_toolkit"
|
|
3
|
-
version = "0.2.
|
|
3
|
+
version = "0.2.1"
|
|
4
4
|
requires-python = ">=3.10"
|
|
5
|
+
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
|
|
5
6
|
description = "Data Preparation Toolkit Library"
|
|
6
7
|
license = {text = "Apache-2.0"}
|
|
7
8
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -10,12 +11,18 @@ authors = [
|
|
|
10
11
|
{ name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
|
|
11
12
|
]
|
|
12
13
|
dependencies = [
|
|
13
|
-
"
|
|
14
|
+
"numpy < 1.29.0",
|
|
15
|
+
"pyarrow==16.1.0",
|
|
14
16
|
"boto3==1.34.69",
|
|
15
17
|
"argparse",
|
|
16
18
|
"mmh3",
|
|
17
19
|
]
|
|
18
20
|
|
|
21
|
+
[project_urls]
|
|
22
|
+
Repository = "https://github.com/IBM/data-prep-kit"
|
|
23
|
+
Issues = "https://github.com/IBM/data-prep-kit/issues"
|
|
24
|
+
Documentation = "https://ibm.github.io/data-prep-kit/"
|
|
25
|
+
|
|
19
26
|
[build-system]
|
|
20
27
|
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
|
|
21
28
|
build-backend = "setuptools.build_meta"
|
|
@@ -34,7 +41,7 @@ dev = [
|
|
|
34
41
|
]
|
|
35
42
|
|
|
36
43
|
[options]
|
|
37
|
-
package_dir = ["src"
|
|
44
|
+
package_dir = ["src"]
|
|
38
45
|
|
|
39
46
|
[options.packages.find]
|
|
40
47
|
where = ["src/data_processing"]
|
{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1/src/data_prep_toolkit.egg-info}/PKG-INFO
RENAMED
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: data_prep_toolkit
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Data Preparation Toolkit Library
|
|
5
5
|
Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
+
Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
|
|
7
8
|
Requires-Python: >=3.10
|
|
8
9
|
Description-Content-Type: text/markdown
|
|
9
|
-
Requires-Dist:
|
|
10
|
+
Requires-Dist: numpy<1.29.0
|
|
11
|
+
Requires-Dist: pyarrow==16.1.0
|
|
10
12
|
Requires-Dist: boto3==1.34.69
|
|
11
13
|
Requires-Dist: argparse
|
|
12
14
|
Requires-Dist: mmh3
|
|
@@ -41,12 +43,14 @@ source venv/bin/activate
|
|
|
41
43
|
or set up your IDE to use the venv directory when developing in this project
|
|
42
44
|
|
|
43
45
|
## Library Artifact Build and Publish
|
|
44
|
-
To test, build and publish the library
|
|
46
|
+
To test, build and publish the library
|
|
45
47
|
```shell
|
|
46
48
|
make test build publish
|
|
47
49
|
```
|
|
50
|
+
|
|
48
51
|
To up the version number, edit the Makefile to change VERSION and rerun
|
|
49
52
|
the above. This will require committing both the `Makefile` and the
|
|
50
53
|
automatically updated `pyproject.toml` file.
|
|
51
54
|
|
|
52
55
|
|
|
56
|
+
|
{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/SOURCES.txt
RENAMED
|
@@ -14,16 +14,20 @@ src/data_processing/data_access/data_access_factory.py
|
|
|
14
14
|
src/data_processing/data_access/data_access_factory_base.py
|
|
15
15
|
src/data_processing/data_access/data_access_local.py
|
|
16
16
|
src/data_processing/data_access/data_access_s3.py
|
|
17
|
+
src/data_processing/data_access/snapshot_utils.py
|
|
17
18
|
src/data_processing/runtime/__init__.py
|
|
18
19
|
src/data_processing/runtime/execution_configuration.py
|
|
19
20
|
src/data_processing/runtime/runtime_configuration.py
|
|
20
21
|
src/data_processing/runtime/transform_file_processor.py
|
|
21
22
|
src/data_processing/runtime/transform_launcher.py
|
|
22
23
|
src/data_processing/runtime/pure_python/__init__.py
|
|
24
|
+
src/data_processing/runtime/pure_python/execution_configuration.py
|
|
23
25
|
src/data_processing/runtime/pure_python/runtime_configuration.py
|
|
24
26
|
src/data_processing/runtime/pure_python/transform_file_processor.py
|
|
27
|
+
src/data_processing/runtime/pure_python/transform_invoker.py
|
|
25
28
|
src/data_processing/runtime/pure_python/transform_launcher.py
|
|
26
29
|
src/data_processing/runtime/pure_python/transform_orchestrator.py
|
|
30
|
+
src/data_processing/runtime/pure_python/transform_runtime.py
|
|
27
31
|
src/data_processing/test_support/__init__.py
|
|
28
32
|
src/data_processing/test_support/abstract_test.py
|
|
29
33
|
src/data_processing/test_support/data_access/__init__.py
|
|
@@ -31,10 +35,10 @@ src/data_processing/test_support/data_access/data_access_factory_test.py
|
|
|
31
35
|
src/data_processing/test_support/launch/__init__.py
|
|
32
36
|
src/data_processing/test_support/launch/transform_test.py
|
|
33
37
|
src/data_processing/test_support/transform/__init__.py
|
|
38
|
+
src/data_processing/test_support/transform/binary_transform_test.py
|
|
34
39
|
src/data_processing/test_support/transform/noop_transform.py
|
|
35
|
-
src/data_processing/test_support/transform/
|
|
40
|
+
src/data_processing/test_support/transform/table_transform_test.py
|
|
36
41
|
src/data_processing/transform/__init__.py
|
|
37
|
-
src/data_processing/transform/abstract_transform.py
|
|
38
42
|
src/data_processing/transform/binary_transform.py
|
|
39
43
|
src/data_processing/transform/table_transform.py
|
|
40
44
|
src/data_processing/transform/transform_configuration.py
|
|
@@ -44,7 +48,11 @@ src/data_processing/utils/cli_utils.py
|
|
|
44
48
|
src/data_processing/utils/config.py
|
|
45
49
|
src/data_processing/utils/log.py
|
|
46
50
|
src/data_processing/utils/params_utils.py
|
|
51
|
+
src/data_processing/utils/pipinstaller.py
|
|
52
|
+
src/data_processing/utils/transform_configuration.json
|
|
53
|
+
src/data_processing/utils/transform_configurator.py
|
|
47
54
|
src/data_processing/utils/transform_utils.py
|
|
55
|
+
src/data_processing/utils/unrecoverable.py
|
|
48
56
|
test-data/data_processing/daf/input/ds1/sample1.parquet
|
|
49
57
|
test-data/data_processing/daf/input/ds1/sample2.parquet
|
|
50
58
|
test-data/data_processing/daf/input/ds2/sample3.parquet
|
|
@@ -55,15 +63,19 @@ test-data/data_processing/input_multiple/sample2.parquet
|
|
|
55
63
|
test-data/data_processing/input_multiple/sample3.parquet
|
|
56
64
|
test-data/data_processing/python/noop/expected/metadata.json
|
|
57
65
|
test-data/data_processing/python/noop/expected/sample1.parquet
|
|
66
|
+
test-data/data_processing/python/noop/expected/test1.parquet
|
|
58
67
|
test-data/data_processing/python/noop/expected/subdir/test1.parquet
|
|
59
68
|
test-data/data_processing/python/noop/input/sample1.parquet
|
|
69
|
+
test-data/data_processing/python/noop/input/test1.parquet
|
|
60
70
|
test-data/data_processing/python/noop/input/subdir/test1.parquet
|
|
61
71
|
test/data_processing_tests/data_access/daf_local_test.py
|
|
62
72
|
test/data_processing_tests/data_access/data_access_local_test.py
|
|
63
73
|
test/data_processing_tests/data_access/data_access_s3_test.py
|
|
64
74
|
test/data_processing_tests/data_access/sample_input_data_test.py
|
|
75
|
+
test/data_processing_tests/invoker/python_invoker_test.py
|
|
65
76
|
test/data_processing_tests/launch/pure_python/launcher_test.py
|
|
66
77
|
test/data_processing_tests/launch/pure_python/multi_launcher_test.py
|
|
67
78
|
test/data_processing_tests/launch/pure_python/test_noop_launch.py
|
|
79
|
+
test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
|
|
68
80
|
test/data_processing_tests/transform/test_noop.py
|
|
69
81
|
test/data_processing_tests/util/transform_utils_test.py
|
{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/__init__.py
RENAMED
|
@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
|
|
|
4
4
|
from data_processing.data_access.data_access_s3 import DataAccessS3
|
|
5
5
|
from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
|
|
6
6
|
from data_processing.data_access.data_access_factory import DataAccessFactory
|
|
7
|
+
from data_processing.data_access.snapshot_utils import SnapshotUtils
|
{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/arrow_s3.py
RENAMED
|
@@ -56,6 +56,7 @@ class ArrowS3:
|
|
|
56
56
|
config=Config(retries={"max_attempts": s3_max_attempts, "mode": "standard"}),
|
|
57
57
|
)
|
|
58
58
|
self.retries = s3_retries
|
|
59
|
+
self.s3_max_attempts = s3_max_attempts
|
|
59
60
|
|
|
60
61
|
@staticmethod
|
|
61
62
|
def _get_bucket_key(key: str) -> tuple[str, str]:
|
|
@@ -68,144 +69,162 @@ class ArrowS3:
|
|
|
68
69
|
return prefixes[0], "/".join(prefixes[1:])
|
|
69
70
|
|
|
70
71
|
# get list of the files (names and sizes) for a given prefix (including bucket name)
|
|
71
|
-
def list_files(self, key: str) -> list[dict[str, Any]]:
|
|
72
|
+
def list_files(self, key: str) -> tuple[list[dict[str, Any]], int]:
|
|
72
73
|
"""
|
|
73
74
|
List files in the folder (hierarchically going through all sub-folders)
|
|
74
75
|
:param key: complete folder name
|
|
75
|
-
:return: list of dictionaries, containing file names and length
|
|
76
|
+
:return: list of dictionaries, containing file names and length and number of retries
|
|
76
77
|
"""
|
|
77
78
|
bucket, prefix = self._get_bucket_key(key)
|
|
78
79
|
# Use paginator here to get all the files rather than 1 page
|
|
79
80
|
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
80
81
|
pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
|
|
81
82
|
files = []
|
|
83
|
+
retries = 0
|
|
82
84
|
for page in pages:
|
|
83
85
|
# For every page
|
|
86
|
+
retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
84
87
|
for obj in page.get("Contents", []):
|
|
85
88
|
# Get both file name and size
|
|
86
89
|
files.append({"name": f"{bucket}/{obj['Key']}", "size": obj["Size"]})
|
|
87
|
-
return files
|
|
90
|
+
return files, retries
|
|
88
91
|
|
|
89
|
-
def list_folders(self, key: str) -> list[str]:
|
|
92
|
+
def list_folders(self, key: str) -> tuple[list[str], int]:
|
|
90
93
|
"""
|
|
91
94
|
Get list of folders for folder
|
|
92
95
|
:param key: complete folder
|
|
93
|
-
:return: list of folders within a given folder
|
|
96
|
+
:return: list of folders within a given folder and number of retries
|
|
94
97
|
"""
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def _get_sub_folders(bck: str, p: str) -> list[str]:
|
|
98
|
+
def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]:
|
|
99
|
+
sub_folders = []
|
|
98
100
|
# use paginator
|
|
99
101
|
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
100
102
|
# use Delimiter to get just folders
|
|
101
103
|
page_iterator = paginator.paginate(Bucket=bck, Prefix=p, Delimiter="/")
|
|
102
|
-
|
|
104
|
+
internal_retries = 0
|
|
103
105
|
for page in page_iterator:
|
|
104
106
|
# for every page
|
|
107
|
+
internal_retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
105
108
|
for p in page.get("CommonPrefixes", []):
|
|
106
|
-
|
|
109
|
+
sf = p["Prefix"]
|
|
110
|
+
sub_folders.append(sf)
|
|
107
111
|
# apply recursively
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
sf, r = _get_sub_folders(bck=bck, p=sf)
|
|
113
|
+
internal_retries += r
|
|
114
|
+
sub_folders.extend(sf)
|
|
115
|
+
return sub_folders, internal_retries
|
|
116
|
+
bucket, prefix = self._get_bucket_key(key)
|
|
117
|
+
subs, retries = _get_sub_folders(bck=bucket, p=prefix)
|
|
118
|
+
return [f"{bucket}/{f}" for f in subs], retries
|
|
114
119
|
|
|
115
|
-
def read_file(self, key: str) -> bytes:
|
|
120
|
+
def read_file(self, key: str) -> tuple[bytes, int]:
|
|
116
121
|
"""
|
|
117
122
|
Read an s3 file by name
|
|
118
123
|
:param key: complete path
|
|
119
|
-
:return: byte array of file content or None if the file does not exist
|
|
124
|
+
:return: byte array of file content or None if the file does not exist and a number of retries
|
|
120
125
|
"""
|
|
121
126
|
bucket, prefix = self._get_bucket_key(key)
|
|
127
|
+
retries = 0
|
|
122
128
|
for n in range(self.retries):
|
|
123
129
|
try:
|
|
124
130
|
obj = self.s3_client.get_object(Bucket=bucket, Key=prefix)
|
|
125
|
-
|
|
131
|
+
retries += obj.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
132
|
+
return obj["Body"].read(), retries
|
|
126
133
|
except Exception as e:
|
|
127
134
|
logger.error(f"failed to read file {key}, exception {e}, attempt {n}")
|
|
135
|
+
retries += self.s3_max_attempts
|
|
128
136
|
logger.error(f"failed to read file {key} in {self.retries} attempts. Skipping it")
|
|
129
|
-
return None
|
|
137
|
+
return None, retries
|
|
130
138
|
|
|
131
|
-
def save_file(self, key: str, data: bytes) -> dict[str, Any]:
|
|
139
|
+
def save_file(self, key: str, data: bytes) -> tuple[dict[str, Any], int]:
|
|
132
140
|
"""
|
|
133
141
|
Save file to S3
|
|
134
142
|
:param key: complete path
|
|
135
143
|
:param data: byte array of the file content
|
|
136
144
|
:return: dictionary as
|
|
137
145
|
defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
|
|
138
|
-
in the case of failure dict is None
|
|
146
|
+
in the case of failure dict is None and the number of retries
|
|
139
147
|
"""
|
|
140
148
|
bucket, prefix = self._get_bucket_key(key)
|
|
149
|
+
retries = 0
|
|
141
150
|
for n in range(self.retries):
|
|
142
151
|
try:
|
|
143
|
-
|
|
152
|
+
res = self.s3_client.put_object(Bucket=bucket, Key=prefix, Body=data)
|
|
153
|
+
retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
154
|
+
return res, retries
|
|
144
155
|
except Exception as e:
|
|
145
156
|
logger.error(f"Failed to upload file to to key {key}, exception {e}")
|
|
157
|
+
retries += self.s3_max_attempts
|
|
146
158
|
logger.error(f"Failed to upload file {key}, skipping it")
|
|
147
|
-
return None
|
|
159
|
+
return None, retries
|
|
148
160
|
|
|
149
|
-
def read_table(self, key: str, schema: pa.schema = None) -> pa.Table:
|
|
161
|
+
def read_table(self, key: str, schema: pa.schema = None) -> tuple[pa.Table, int]:
|
|
150
162
|
"""
|
|
151
163
|
Get an arrow table from a file with a given name
|
|
152
164
|
:param key: complete path
|
|
153
165
|
:param schema: Schema used for reading table, default None
|
|
154
|
-
:return: table or None if the read failed
|
|
166
|
+
:return: table or None if the read failed and the number of retries
|
|
155
167
|
"""
|
|
156
168
|
# Read file as bytes
|
|
157
|
-
data = self.read_file(key)
|
|
169
|
+
data, retries = self.read_file(key)
|
|
158
170
|
if data is None:
|
|
159
|
-
return None
|
|
160
|
-
return TransformUtils.convert_binary_to_arrow(data=data, schema=schema)
|
|
171
|
+
return None, retries
|
|
172
|
+
return TransformUtils.convert_binary_to_arrow(data=data, schema=schema), retries
|
|
161
173
|
|
|
162
|
-
def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any]]:
|
|
174
|
+
def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
|
|
163
175
|
"""
|
|
164
176
|
Save an arrow table to a file with a name
|
|
165
177
|
:param key: complete path
|
|
166
178
|
:param table: table to save
|
|
167
179
|
:return: table size and a dictionary as
|
|
168
180
|
defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
|
|
169
|
-
in the case of failure len is -1 and dict is None
|
|
181
|
+
in the case of failure len is -1 and dict is None and the number of retries
|
|
170
182
|
"""
|
|
171
183
|
# convert to bytes
|
|
172
184
|
data = TransformUtils.convert_arrow_to_binary(table=table)
|
|
173
185
|
if data is None:
|
|
174
186
|
return -1, None
|
|
175
187
|
# save bytes
|
|
176
|
-
|
|
188
|
+
res, retries = self.save_file(key, data)
|
|
189
|
+
return len(data), res, retries
|
|
177
190
|
|
|
178
|
-
def delete_file(self, key: str) ->
|
|
191
|
+
def delete_file(self, key: str) -> int:
|
|
179
192
|
"""
|
|
180
193
|
Delete file from S3
|
|
181
194
|
:param key: complete path
|
|
182
|
-
:return:
|
|
195
|
+
:return: the number of retries
|
|
183
196
|
"""
|
|
184
197
|
bucket, prefix = self._get_bucket_key(key)
|
|
198
|
+
retries = 0
|
|
185
199
|
for n in range(self.retries):
|
|
186
200
|
try:
|
|
187
|
-
self.s3_client.delete_object(Bucket=bucket, Key=prefix)
|
|
188
|
-
|
|
201
|
+
res = self.s3_client.delete_object(Bucket=bucket, Key=prefix)
|
|
202
|
+
retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
203
|
+
return retries
|
|
189
204
|
except Exception as e:
|
|
190
205
|
logger.error(f"failed to delete file {key}, exception {e}")
|
|
191
|
-
|
|
206
|
+
retries += self.s3_max_attempts
|
|
207
|
+
return retries
|
|
192
208
|
|
|
193
|
-
def move_file(self, source: str, dest: str) ->
|
|
209
|
+
def move_file(self, source: str, dest: str) -> int:
|
|
194
210
|
"""
|
|
195
211
|
move file from source to destination
|
|
196
212
|
:param source: complete source path
|
|
197
213
|
:param dest: complete destination path
|
|
198
|
-
:return:
|
|
214
|
+
:return: number of retries
|
|
199
215
|
"""
|
|
200
216
|
s_bucket, s_prefix = self._get_bucket_key(source)
|
|
201
217
|
d_bucket, d_prefix = self._get_bucket_key(dest)
|
|
202
218
|
# copy source to destination and then delete source
|
|
203
219
|
copy_source = {"Bucket": s_bucket, "Key": s_prefix}
|
|
220
|
+
retries = 0
|
|
204
221
|
for n in range(self.retries):
|
|
205
222
|
try:
|
|
206
|
-
self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
|
|
207
|
-
|
|
208
|
-
|
|
223
|
+
res = self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
|
|
224
|
+
retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
|
|
225
|
+
retries += self.delete_file(source)
|
|
226
|
+
return retries
|
|
209
227
|
except Exception as e:
|
|
210
228
|
logger.error(f"failed to copy file {source} to {dest}, exception {e}")
|
|
211
|
-
|
|
229
|
+
retries += self.s3_max_attempts
|
|
230
|
+
return retries
|