data-prep-toolkit 0.1.1__tar.gz → 0.2.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/Makefile +15 -22
  2. {data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.0.dev2}/PKG-INFO +1 -4
  3. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/pyproject.toml +1 -5
  4. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -4
  5. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt +6 -19
  6. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/requires.txt +0 -3
  7. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_local.py +8 -0
  8. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_s3.py +36 -7
  9. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_file_processor.py +2 -4
  10. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_file_processor.py +13 -5
  11. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/transform_test.py +12 -6
  12. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/__init__.py +0 -1
  13. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/noop_transform.py +2 -19
  14. data_prep_toolkit-0.2.0.dev2/src/data_processing/transform/abstract_transform.py +16 -0
  15. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/binary_transform.py +7 -2
  16. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/table_transform.py +14 -12
  17. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_configuration.py +3 -2
  18. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +1 -1
  19. data_prep_toolkit-0.1.1/.gitignore +0 -35
  20. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/__init__.py +0 -8
  21. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/execution_configuration.py +0 -107
  22. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/ray_utils.py +0 -180
  23. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/runtime_configuration.py +0 -38
  24. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_file_processor.py +0 -46
  25. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_launcher.py +0 -124
  26. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_orchestrator.py +0 -143
  27. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_runtime.py +0 -53
  28. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_statistics.py +0 -66
  29. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/launcher_test.py +0 -189
  30. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/multi_launcher_test.py +0 -80
  31. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/ray_util_test.py +0 -105
  32. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -41
  33. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/README.md +0 -0
  34. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/setup.cfg +0 -0
  35. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  36. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  37. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/__init__.py +0 -0
  38. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/__init__.py +0 -0
  39. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/arrow_s3.py +0 -0
  40. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access.py +0 -0
  41. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory.py +0 -0
  42. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  43. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/__init__.py +0 -0
  44. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/execution_configuration.py +0 -0
  45. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/__init__.py +0 -0
  46. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/runtime_configuration.py +0 -0
  47. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_launcher.py +0 -0
  48. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -0
  49. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/runtime_configuration.py +0 -0
  50. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_launcher.py +0 -0
  51. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/__init__.py +0 -0
  52. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/abstract_test.py +0 -0
  53. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/__init__.py +0 -0
  54. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  55. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/__init__.py +0 -0
  56. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/transform_test.py +0 -0
  57. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/__init__.py +0 -0
  58. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_statistics.py +0 -0
  59. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/__init__.py +0 -0
  60. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/cli_utils.py +0 -0
  61. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/config.py +0 -0
  62. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/log.py +0 -0
  63. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/params_utils.py +0 -0
  64. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/transform_utils.py +0 -0
  65. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  66. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  67. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  68. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  69. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  70. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  71. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/transform/test_noop.py +0 -0
  72. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  73. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  74. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  75. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  76. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  77. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input/sample1.parquet +0 -0
  78. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  79. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  80. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  81. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/metadata.json +0 -0
  82. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/sample1.parquet +0 -0
  83. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/subdir/test1.parquet +0 -0
  84. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/sample1.parquet +0 -0
  85. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/subdir/test1.parquet +0 -0
Makefile

@@ -13,27 +13,19 @@ clean::
 .check-env::
 	@echo "Checks passed"
 
-update-toml:: .check-env
-	@# Help: Copy the Makefile distribution version into the pyproject.toml
-	sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
-	mv tt.toml pyproject.toml
-
 setup::
 
-build:: update-toml venv
-	@# Help: Build the distribution for publishing to a pypi
-	rm -r dist || true
-	rm -rf src/*egg-info || true
-	${PIP} install --upgrade build
-	${PYTHON} -m build
+set-versions:: .check-env
+	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
+
+build:: build-dist
 
-publish:: .check-env update-toml
-	@# Help: Publish project to pypi
-	${PYTHON} -m twine check dist/*
-	${PYTHON} -m twine upload --verbose --non-interactive dist/*
-	#@echo "create a git tag to reference published version"
-	#@git tag ${TAG}
-	#@git push origin ${TAG}
+#build:: update-toml .defaults.build-dist
+build-dist :: set-versions .defaults.build-dist
+
+publish:: publish-dist
+
+publish-dist :: .check-env .defaults.publish-dist
 
 venv:: pyproject.toml
 	@# Help: Create the virtual environment using pyproject.toml

@@ -46,10 +38,14 @@ venv:: pyproject.toml
 	pip install -e .; \
 	pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1
 
+image::
+	@# Help: Placeholder does nothing for now.
+	@echo "Image building for ray is in the works (comming soon)."
 
 # Here we run each test directory of tests and each ray launched test separately, because
 # it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
 # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
+# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
 test::
 	@# Help: Use the already-built virtual environment to run pytest on the test directory.
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;

@@ -57,8 +53,5 @@ test::
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/multi_launcher_test.py;
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/multi_launcher_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
+
 
PKG-INFO

@@ -1,18 +1,15 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.1.1
+Version: 0.2.0.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: ray[default]==2.9.3
 Requires-Dist: pyarrow==15.0.2
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
-Requires-Dist: fastapi>=0.109.1
-Requires-Dist: pillow>=10.2.0
 Provides-Extra: dev
 Requires-Dist: twine; extra == "dev"
 Requires-Dist: pytest>=7.3.2; extra == "dev"
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.1.1"
+version = "0.2.0.dev2"
 requires-python = ">=3.10"
 description = "Data Preparation Toolkit Library"
 license = {text = "Apache-2.0"}

@@ -10,14 +10,10 @@ authors = [
     { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
 ]
 dependencies = [
-    "ray[default]==2.9.3",
     "pyarrow==15.0.2",
     "boto3==1.34.69",
     "argparse",
     "mmh3",
-    # These two are to fix security issue
-    "fastapi>=0.109.1",
-    "pillow>=10.2.0",
 ]
 
 [build-system]
src/data_prep_toolkit.egg-info/PKG-INFO

@@ -1,18 +1,15 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.1.1
+Version: 0.2.0.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: ray[default]==2.9.3
 Requires-Dist: pyarrow==15.0.2
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
-Requires-Dist: fastapi>=0.109.1
-Requires-Dist: pillow>=10.2.0
 Provides-Extra: dev
 Requires-Dist: twine; extra == "dev"
 Requires-Dist: pytest>=7.3.2; extra == "dev"
src/data_prep_toolkit.egg-info/SOURCES.txt

@@ -1,4 +1,3 @@
-.gitignore
 Makefile
 README.md
 pyproject.toml

@@ -25,15 +24,6 @@ src/data_processing/runtime/pure_python/runtime_configuration.py
 src/data_processing/runtime/pure_python/transform_file_processor.py
 src/data_processing/runtime/pure_python/transform_launcher.py
 src/data_processing/runtime/pure_python/transform_orchestrator.py
-src/data_processing/runtime/ray/__init__.py
-src/data_processing/runtime/ray/execution_configuration.py
-src/data_processing/runtime/ray/ray_utils.py
-src/data_processing/runtime/ray/runtime_configuration.py
-src/data_processing/runtime/ray/transform_file_processor.py
-src/data_processing/runtime/ray/transform_launcher.py
-src/data_processing/runtime/ray/transform_orchestrator.py
-src/data_processing/runtime/ray/transform_runtime.py
-src/data_processing/runtime/ray/transform_statistics.py
 src/data_processing/test_support/__init__.py
 src/data_processing/test_support/abstract_test.py
 src/data_processing/test_support/data_access/__init__.py

@@ -44,6 +34,7 @@ src/data_processing/test_support/transform/__init__.py
 src/data_processing/test_support/transform/noop_transform.py
 src/data_processing/test_support/transform/transform_test.py
 src/data_processing/transform/__init__.py
+src/data_processing/transform/abstract_transform.py
 src/data_processing/transform/binary_transform.py
 src/data_processing/transform/table_transform.py
 src/data_processing/transform/transform_configuration.py

@@ -62,11 +53,11 @@ test-data/data_processing/input/sample1.parquet
 test-data/data_processing/input_multiple/sample1.parquet
 test-data/data_processing/input_multiple/sample2.parquet
 test-data/data_processing/input_multiple/sample3.parquet
-test-data/data_processing/ray/noop/expected/metadata.json
-test-data/data_processing/ray/noop/expected/sample1.parquet
-test-data/data_processing/ray/noop/expected/subdir/test1.parquet
-test-data/data_processing/ray/noop/input/sample1.parquet
-test-data/data_processing/ray/noop/input/subdir/test1.parquet
+test-data/data_processing/python/noop/expected/metadata.json
+test-data/data_processing/python/noop/expected/sample1.parquet
+test-data/data_processing/python/noop/expected/subdir/test1.parquet
+test-data/data_processing/python/noop/input/sample1.parquet
+test-data/data_processing/python/noop/input/subdir/test1.parquet
 test/data_processing_tests/data_access/daf_local_test.py
 test/data_processing_tests/data_access/data_access_local_test.py
 test/data_processing_tests/data_access/data_access_s3_test.py

@@ -74,9 +65,5 @@ test/data_processing_tests/data_access/sample_input_data_test.py
 test/data_processing_tests/launch/pure_python/launcher_test.py
 test/data_processing_tests/launch/pure_python/multi_launcher_test.py
 test/data_processing_tests/launch/pure_python/test_noop_launch.py
-test/data_processing_tests/launch/ray/launcher_test.py
-test/data_processing_tests/launch/ray/multi_launcher_test.py
-test/data_processing_tests/launch/ray/ray_util_test.py
-test/data_processing_tests/launch/ray/test_noop_launch.py
 test/data_processing_tests/transform/test_noop.py
 test/data_processing_tests/util/transform_utils_test.py
src/data_prep_toolkit.egg-info/requires.txt

@@ -1,10 +1,7 @@
-ray[default]==2.9.3
 pyarrow==15.0.2
 boto3==1.34.69
 argparse
 mmh3
-fastapi>=0.109.1
-pillow>=10.2.0
 
 [dev]
 twine
src/data_processing/data_access/data_access_local.py

@@ -55,6 +55,14 @@ class DataAccessLocal(DataAccess):
         self.n_samples = n_samples
         self.files_to_use = files_to_use
 
+        logger.debug(f"Local input folder: {self.input_folder}")
+        logger.debug(f"Local output folder: {self.output_folder}")
+        logger.debug(f"Local data sets: {self.d_sets}")
+        logger.debug(f"Local checkpoint: {self.checkpoint}")
+        logger.debug(f"Local m_files: {self.m_files}")
+        logger.debug(f"Local n_samples: {self.n_samples}")
+        logger.debug(f"Local files_to_use: {self.files_to_use}")
+
     def get_num_samples(self) -> int:
         """
         Get number of samples for input
src/data_processing/data_access/data_access_s3.py

@@ -48,15 +48,18 @@ class DataAccessS3(DataAccess):
         :param n_samples: amount of files to randomly sample
         :param files_to_use: files extensions of files to include
         """
-        self.arrS3 = ArrowS3(
-            access_key=s3_credentials.get("access_key", ""),
-            secret_key=s3_credentials.get("secret_key", ""),
-            endpoint=s3_credentials.get("url", None),
-            region=s3_credentials.get("region", None),
-        )
+        self.s3_credentials = {} | s3_credentials
+        access_key = self.get_access_key()
+        if access_key is None:
+            raise ValueError("S3 access key not provided")
+        secret_key = self.get_secret_key()
+        if secret_key is None:
+            raise ValueError("S3 secret key not provided")
+        endpoint = self.get_endpoint()
+        region = self.get_region()
         if s3_config is None:
             self.input_folder = None
-            self.input_folder = None
+            self.output_folder = None
         else:
             self.input_folder = TransformUtils.clean_path(s3_config["input_folder"])
             self.output_folder = TransformUtils.clean_path(s3_config["output_folder"])

@@ -66,6 +69,32 @@
         self.n_samples = n_samples
         self.files_to_use = files_to_use
 
+        logger.debug(f"S3 access key provided: {access_key}")
+        logger.debug(f"S3 secret key provided: no soup for you!")
+        logger.debug(f"S3 region {region}")
+        logger.debug(f"S3 endpoint/url: {endpoint}")
+        logger.debug(f"S input folder: {self.input_folder}")
+        logger.debug(f"S3 output folder: {self.output_folder}")
+        logger.debug(f"S3 data sets: {self.d_sets}")
+        logger.debug(f"S3 checkpoint: {self.checkpoint}")
+        logger.debug(f"S3 m_files: {self.m_files}")
+        logger.debug(f"S3 n_samples: {self.n_samples}")
+        logger.debug(f"S3 files_to_use: {self.files_to_use}")
+
+        self.arrS3 = ArrowS3(access_key, secret_key, endpoint=endpoint, region=region)
+
+    def get_access_key(self):
+        return self.s3_credentials.get("access_key", None)
+
+    def get_secret_key(self):
+        return self.s3_credentials.get("secret_key", None)
+
+    def get_endpoint(self):
+        return self.s3_credentials.get("url", None)
+
+    def get_region(self):
+        return self.s3_credentials.get("region", None)
+
     def get_num_samples(self) -> int:
         """
         Get number of samples for input
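For orientation, here is a minimal sketch of how the reworked DataAccessS3 constructor behaves, assuming DataAccessS3 is importable from data_processing.data_access as in 0.1.1; all credential and bucket values below are illustrative:

    from data_processing.data_access import DataAccessS3

    # Illustrative values only. In 0.2.0.dev2 a missing access or secret
    # key raises ValueError instead of silently defaulting to "".
    s3_credentials = {
        "access_key": "my-access-key",    # required
        "secret_key": "my-secret-key",    # required
        "url": "https://s3.example.com",  # optional endpoint
        "region": "us-east-1",            # optional
    }
    s3_config = {
        "input_folder": "my-bucket/input",
        "output_folder": "my-bucket/output",
    }

    data_access = DataAccessS3(s3_credentials=s3_credentials, s3_config=s3_config)
    print(data_access.get_endpoint())  # accessor added in this release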
src/data_processing/runtime/pure_python/transform_file_processor.py

@@ -13,11 +13,9 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.runtime import (
-    AbstractTransformFileProcessor,
-)
-from data_processing.transform import TransformStatistics
+from data_processing.runtime import AbstractTransformFileProcessor
 from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
+from data_processing.transform import TransformStatistics
 
 
 class PythonTransformFileProcessor(AbstractTransformFileProcessor):
src/data_processing/runtime/transform_file_processor.py

@@ -58,7 +58,7 @@ class AbstractTransformFileProcessor:
             name_extension = TransformUtils.get_file_extension(f_name)
             self.logger.debug(f"Begin transforming file {f_name}")
             out_files, stats = self.transform.transform_binary(byte_array=filedata, ext=name_extension[1])
-            self.logger.debug(f"Done transforming file {f_name}")
+            self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files")
             self.last_file_name = name_extension[0]
             self.last_file_name_next_index = None
             self.last_extension = name_extension[1]

@@ -83,7 +83,9 @@
         try:
             t_start = time.time()
             # get flush results
-            self.logger.debug(f"Begin flushing transform")
+            self.logger.debug(
+                f"Begin flushing transform, last file name {self.last_file_name}, last index {self.last_file_name_next_index}"
+            )
             out_files, stats = self.transform.flush_binary()
             self.logger.debug(f"Done flushing transform, got {len(out_files)} files")
             # Here we are using the name of the last file, that we were processing

@@ -113,9 +115,12 @@
             case 1:
                 # we have exactly 1 output file
                 file_ext = out_files[0]
-                output_name = self.data_access.get_output_location(path=f"{self.last_file_name}{file_ext[1]}")
+                lfn = self.last_file_name
+                if self.last_file_name_next_index is not None:
+                    lfn = f"{lfn}_{self.last_file_name_next_index}"
+                output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
                 self.logger.debug(
-                    f"Writing transformed file {self.last_file_name}{self.last_extension} " f"to {output_name}"
+                    f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
                 )
                 save_res = self.data_access.save_file(path=output_name, data=file_ext[0])
                 if save_res is not None:

@@ -130,7 +135,10 @@
                 else:
                     self.logger.warning(f"Failed to write file {output_name}")
                     self._publish_stats({"failed_writes": 1})
-                self.last_file_name_next_index = 0
+                if self.last_file_name_next_index is None:
+                    self.last_file_name_next_index = 0
+                else:
+                    self.last_file_name_next_index += 1
             case _:
                 # we have more then 1 file
                 file_sizes = 0
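The last_file_name_next_index changes above alter how single-output files are named when a transform later emits more data for the same base name. A standalone sketch of the resulting naming rule (not the class itself), derived directly from the lfn logic in the diff above:

    # First write for a base name keeps it unchanged; each subsequent
    # write appends an incrementing _<index> suffix so repeated outputs
    # for the same base name do not collide.
    def output_name(base: str, ext: str, next_index: int | None) -> str:
        if next_index is not None:
            base = f"{base}_{next_index}"
        return f"{base}{ext}"

    assert output_name("sample1", ".parquet", None) == "sample1.parquet"
    assert output_name("sample1", ".parquet", 0) == "sample1_0.parquet"
    assert output_name("sample1", ".parquet", 1) == "sample1_1.parquet"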
src/data_processing/test_support/launch/transform_test.py

@@ -14,7 +14,6 @@ import sys
 import tempfile
 from typing import Any
 
-from data_processing.runtime.ray import RayTransformLauncher
 from data_processing.runtime.transform_launcher import AbstractTransformLauncher
 from data_processing.test_support.abstract_test import AbstractTest
 from data_processing.utils import ParamsUtils

@@ -22,7 +21,7 @@ from data_processing.utils import ParamsUtils
 
 class AbstractTransformLauncherTest(AbstractTest):
     """
-    The Ray-based test class for all/most AbstractTransform implementations.
+    The launcher test class for all/most AbstractTransformLauncher implementations.
     Generic tests are provided here, and sub-classes must implement the _get*_fixture() method(s)
     to provide the test data for a given test method. For example, get_test_transform_fixtures()
     provides the test data for the test_transform() test method.

@@ -36,8 +35,8 @@ class AbstractTransformLauncherTest(AbstractTest):
         args = {} | cli_params
         local_ast = {"input_folder": in_table_path, "output_folder": out_table_path}
         args["data_local_config"] = local_ast
-        if isinstance(launcher, RayTransformLauncher):
-            args["run_locally"] = "True"
+        # if isinstance(launcher, RayTransformLauncher):
+        #     args["run_locally"] = "True"
         argv = ParamsUtils.dict_to_req(args)
         return argv

@@ -52,7 +51,7 @@ class AbstractTransformLauncherTest(AbstractTest):
         Test the given transform and its runtime using the given CLI arguments, input directory of data files and expected output directory.
         Data is processed into a temporary output directory which is then compared with the directory of expected output.
         :param launcher: launcher configured to run the transform being tested
-        :param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and the Ray launching args.
+        :param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and the launching args.
         :param in_table_path: a directory containing the input parquet files to be processed and results compared against the expected output table path.
         :param expected_out_table_path: directory contain parquet and metadata.json that is expected to match the processed input directory.
         :return:

@@ -62,7 +61,14 @@ class AbstractTransformLauncherTest(AbstractTest):
             print(f"Using temporary output path {temp_dir}")
             sys.argv = self._get_argv(launcher, cli_params, in_table_path, temp_dir)
             launcher.launch()
-            AbstractTest.validate_directory_contents(temp_dir, expected_out_table_path)
+            self._validate_directory_contents_match(temp_dir, expected_out_table_path)
+
+    def _validate_directory_contents_match(self, dir: str, expected: str):
+        """
+        Confirm that the two directories contains the same files.
+        Stubbed out like this to allow spark tests to override this since spark tends to rename the files.
+        """
+        AbstractTest.validate_directory_contents(dir, expected)
 
     def _install_test_fixtures(self, metafunc):
         # Apply the fixtures for the method with these input names (i.e. test_transform()).
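The new _validate_directory_contents_match hook exists, per its docstring, so that runtimes which rename output files can relax the comparison. A hypothetical override, for illustration only:

    # Hypothetical subclass; a Spark-style engine tends to rename its
    # output files, so such a test could compare contents rather than names.
    class MySparkTransformLauncherTest(AbstractTransformLauncherTest):
        def _validate_directory_contents_match(self, dir: str, expected: str):
            # e.g. compare file counts and row totals instead of file names
            ...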
src/data_processing/test_support/transform/__init__.py

@@ -1,6 +1,5 @@
 from .noop_transform import (
     NOOPTransform,
     NOOPPythonTransformConfiguration,
-    NOOPRayTransformConfiguration,
 )
 from .transform_test import AbstractTransformTest
src/data_processing/test_support/transform/noop_transform.py

@@ -15,13 +15,10 @@ from argparse import ArgumentParser, Namespace
 from typing import Any
 
 import pyarrow as pa
+from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.runtime.pure_python.runtime_configuration import (
     PythonTransformRuntimeConfiguration,
 )
-from data_processing.runtime.ray import RayTransformLauncher
-from data_processing.runtime.ray.runtime_configuration import (
-    RayTransformRuntimeConfiguration,
-)
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import CLIArgumentProvider, get_logger
 

@@ -138,22 +135,8 @@ class NOOPPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
         super().__init__(transform_config=NOOPTransformConfiguration())
 
 
-class NOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
-    """
-    Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher.
-    NOOP does not use a RayRuntime class so the superclass only needs the base
-    python-only configuration.
-    """
-
-    def __init__(self):
-        """
-        Initialization
-        """
-        super().__init__(transform_config=NOOPTransformConfiguration())
-
-
 if __name__ == "__main__":
     # launcher = NOOPRayLauncher()
-    launcher = RayTransformLauncher(NOOPRayTransformConfiguration())
+    launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
     logger.info("Launching noop transform")
     launcher.launch()
src/data_processing/transform/abstract_transform.py (new)

@@ -0,0 +1,16 @@
+from typing import Any, Generic, TypeVar
+
+
+DATA = TypeVar("DATA")
+
+
+class AbstractTransform(Generic[DATA]):
+    def transform(self, data: DATA) -> tuple[list[DATA], dict[str, Any]]:
+        """
+        Converts input table into an output table.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
+        :param table: input table
+        :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
+        propagated to metadata
+        """
+        raise NotImplemented()
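This new AbstractTransform base factors the transform() contract out of AbstractBinaryTransform so that binary and table transforms share one generic interface. A hypothetical subclass, just to show how the DATA parameter is bound:

    from typing import Any

    # Illustrative only: a transform whose payload type is plain str.
    class UpperCaseTransform(AbstractTransform[str]):
        def transform(self, data: str) -> tuple[list[str], dict[str, Any]]:
            # Return the converted payloads plus statistics for the metadata.
            return [data.upper()], {"chars": len(data)}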
src/data_processing/transform/binary_transform.py

@@ -10,10 +10,15 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any
+from typing import Any, TypeVar
 
+from data_processing.transform.abstract_transform import AbstractTransform
 
-class AbstractBinaryTransform:
+
+DATA = TypeVar("DATA")
+
+
+class AbstractBinaryTransform(AbstractTransform[DATA]):
     """
     Converts input binary file to output file(s) (binary)
     Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or
src/data_processing/transform/table_transform.py

@@ -10,7 +10,7 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any
+from typing import Any, TypeVar
 
 import pyarrow as pa
 from data_processing.transform import AbstractBinaryTransform

@@ -20,7 +20,7 @@ from data_processing.utils import TransformUtils, get_logger
 logger = get_logger(__name__)
 
 
-class AbstractTableTransform(AbstractBinaryTransform):
+class AbstractTableTransform(AbstractBinaryTransform[pa.Table]):
     """
     Extends AbstractBinaryTransform to expect the byte arrays from to contain a pyarrow Table.
     Sub-classes are expected to implement transform() on the parsed Table instances.

@@ -59,17 +59,19 @@ class AbstractTableTransform(AbstractBinaryTransform):
             # Add number of rows to stats
             stats = stats | {"source_doc_count": table.num_rows}
             # convert tables to files
-            return self._check_and_convert_tables(out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows})
+            return self._check_and_convert_tables(
+                out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows}
+            )
 
-    def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
-        """
-        Converts input table into an output table.
-        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
-        :param table: input table
-        :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
-        propagated to metadata
-        """
-        raise NotImplemented()
+    # def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+    #     """
+    #     Converts input table into an output table.
+    #     If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
+    #     :param table: input table
+    #     :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
+    #     propagated to metadata
+    #     """
+    #     raise NotImplemented()
 
     def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
         """
src/data_processing/transform/transform_configuration.py

@@ -14,6 +14,7 @@ from argparse import ArgumentParser
 from typing import Any
 
 from data_processing.transform import AbstractBinaryTransform
+from data_processing.transform.abstract_transform import AbstractTransform
 from data_processing.utils import CLIArgumentProvider
 
 

@@ -22,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider):
     This is a base transform configuration class defining transform's input/output parameter
     """
 
-    def __init__(self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = []):
+    def __init__(self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = []):
         """
         Initialization
         :param name: transformer name

@@ -34,7 +35,7 @@ class TransformConfiguration(CLIArgumentProvider):
         self.remove_from_metadata = remove_from_metadata
         self.params = {}
 
-    def get_transform_class(self) -> type[AbstractBinaryTransform]:
+    def get_transform_class(self) -> type[AbstractTransform]:
         """
         Get the class extending AbstractTransform which implements a specific transformation.
         The class will generally be instantiated with a dictionary of configuration produced by
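Since TransformConfiguration now accepts any type[AbstractTransform], a configuration can name either a binary or a table transform. A minimal sketch following the NOOPTransformConfiguration pattern in this package (MyTransform is a hypothetical AbstractTransform subclass):

    # Illustrative only; mirrors how this package's NOOP configuration
    # wires a name and transform class into the base TransformConfiguration.
    class MyTransformConfiguration(TransformConfiguration):
        def __init__(self):
            super().__init__(name="my_transform", transform_class=MyTransform)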
test/data_processing_tests/launch/pure_python/test_noop_launch.py

@@ -32,7 +32,7 @@ class TestRayNOOPTransform(AbstractTransformLauncherTest):
     """
 
     def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = "../../../../test-data/data_processing/ray/noop/"
+        basedir = "../../../../test-data/data_processing/python/noop/"
         basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
         launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
         fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")]
@@ -1,35 +0,0 @@
1
-
2
-
3
-
4
- # Byte-compiled / optimized / DLL files
5
- __pycache__/
6
- *.py[cod]
7
- *$py.class
8
-
9
-
10
- # Distribution / packaging
11
- bin/
12
- build/
13
- develop-eggs/
14
- dist/
15
- eggs/
16
- lib/
17
- lib64/
18
- parts/
19
- sdist/
20
- var/
21
- *.egg-info/
22
- .installed.cfg
23
- *.egg
24
-
25
- # Installer logs
26
- pip-log.txt
27
- pip-delete-this-directory.txt
28
-
29
- # Unit test / coverage reports
30
- .tox/
31
- htmlcov
32
- .coverage
33
- .cache
34
- nosetests.xml
35
- coverage.xml
@@ -1,8 +0,0 @@
1
- from data_processing.runtime.ray.ray_utils import RayUtils
2
- from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay
3
- from data_processing.runtime.ray.transform_runtime import DefaultRayTransformRuntime
4
- from data_processing.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
5
- from data_processing.runtime.ray.transform_file_processor import RayTransformFileProcessor
6
- from data_processing.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
7
- from data_processing.runtime.ray.transform_orchestrator import orchestrate
8
- from data_processing.runtime.ray.transform_launcher import RayTransformLauncher