data-prep-toolkit 0.2.0.dev5__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/Makefile +4 -3
  2. {data_prep_toolkit-0.2.0.dev5/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1}/PKG-INFO +7 -3
  3. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/README.md +3 -1
  4. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/pyproject.toml +10 -3
  5. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1/src/data_prep_toolkit.egg-info}/PKG-INFO +7 -3
  6. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/SOURCES.txt +14 -2
  7. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/requires.txt +2 -1
  8. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/__init__.py +1 -0
  9. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/arrow_s3.py +62 -43
  10. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access.py +457 -0
  11. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/data_access_factory.py +21 -8
  12. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/data_access_factory_base.py +1 -0
  13. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_local.py +249 -0
  14. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_s3.py +207 -0
  15. data_prep_toolkit-0.2.1/src/data_processing/data_access/snapshot_utils.py +31 -0
  16. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/__init__.py +1 -1
  17. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/execution_configuration.py +5 -5
  18. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/__init__.py +10 -0
  19. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/execution_configuration.py +70 -0
  20. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/pure_python/runtime_configuration.py +15 -2
  21. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_file_processor.py +107 -0
  22. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_invoker.py +159 -0
  23. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/pure_python/transform_launcher.py +3 -36
  24. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_orchestrator.py +224 -0
  25. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_runtime.py +53 -0
  26. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/transform_file_processor.py +53 -23
  27. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/transform_launcher.py +52 -1
  28. data_prep_toolkit-0.2.1/src/data_processing/test_support/__init__.py +1 -0
  29. data_prep_toolkit-0.2.1/src/data_processing/test_support/abstract_test.py +275 -0
  30. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/data_access/data_access_factory_test.py +1 -1
  31. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/launch/transform_test.py +17 -12
  32. data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/__init__.py +6 -0
  33. data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/binary_transform_test.py +85 -0
  34. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/transform/noop_transform.py +1 -1
  35. data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/transform/transform_test.py → data_prep_toolkit-0.2.1/src/data_processing/test_support/transform/table_transform_test.py +10 -8
  36. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/binary_transform.py +3 -8
  37. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/table_transform.py +31 -30
  38. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/transform_configuration.py +6 -5
  39. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/transform_statistics.py +1 -2
  40. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/__init__.py +3 -0
  41. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/log.py +9 -2
  42. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/params_utils.py +8 -6
  43. data_prep_toolkit-0.2.1/src/data_processing/utils/pipinstaller.py +76 -0
  44. data_prep_toolkit-0.2.1/src/data_processing/utils/transform_configuration.json +158 -0
  45. data_prep_toolkit-0.2.1/src/data_processing/utils/transform_configurator.py +91 -0
  46. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/transform_utils.py +26 -9
  47. data_prep_toolkit-0.2.1/src/data_processing/utils/unrecoverable.py +7 -0
  48. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/data_access_local_test.py +52 -42
  49. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/data_access_s3_test.py +11 -9
  50. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/sample_input_data_test.py +1 -1
  51. data_prep_toolkit-0.2.1/test/data_processing_tests/invoker/python_invoker_test.py +48 -0
  52. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -5
  53. data_prep_toolkit-0.2.1/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +37 -0
  54. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/transform/test_noop.py +4 -2
  55. data_prep_toolkit-0.2.1/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
  56. data_prep_toolkit-0.2.1/test-data/data_processing/python/noop/input/test1.parquet +0 -0
  57. data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access.py +0 -228
  58. data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access_local.py +0 -407
  59. data_prep_toolkit-0.2.0.dev5/src/data_processing/data_access/data_access_s3.py +0 -373
  60. data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/__init__.py +0 -4
  61. data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/transform_file_processor.py +0 -51
  62. data_prep_toolkit-0.2.0.dev5/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -104
  63. data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/__init__.py +0 -1
  64. data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/abstract_test.py +0 -185
  65. data_prep_toolkit-0.2.0.dev5/src/data_processing/test_support/transform/__init__.py +0 -5
  66. data_prep_toolkit-0.2.0.dev5/src/data_processing/transform/abstract_transform.py +0 -16
  67. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/setup.cfg +0 -0
  68. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  69. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  70. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/__init__.py +0 -0
  71. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/runtime/runtime_configuration.py +0 -0
  72. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/data_access/__init__.py +0 -0
  73. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/test_support/launch/__init__.py +0 -0
  74. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/transform/__init__.py +0 -0
  75. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/cli_utils.py +0 -0
  76. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/utils/config.py +0 -0
  77. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  78. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  79. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  80. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  81. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  82. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  83. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  84. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  85. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input/sample1.parquet +0 -0
  86. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  87. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  88. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  89. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
  90. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
  91. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
  92. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
  93. {data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
@@ -15,13 +15,13 @@ clean::
15
15
 
16
16
  setup::
17
17
 
18
- set-versions:: .check-env
18
+ set-versions: .check-env
19
19
  $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
20
20
 
21
21
  build:: build-dist
22
22
 
23
23
  #build:: update-toml .defaults.build-dist
24
- build-dist :: set-versions .defaults.build-dist
24
+ build-dist :: .defaults.build-dist
25
25
 
26
26
  publish:: publish-dist
27
27
 
@@ -46,7 +46,8 @@ image::
46
46
  # it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
47
47
  # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
48
48
  # TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
49
- test::
49
+ .PHONY: test
50
+ test:: venv
50
51
  @# Help: Use the already-built virtual environment to run pytest on the test directory.
51
52
  source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
52
53
  source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
@@ -1,12 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.0.dev5
3
+ Version: 0.2.1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
7
+ Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
7
8
  Requires-Python: >=3.10
8
9
  Description-Content-Type: text/markdown
9
- Requires-Dist: pyarrow==15.0.2
10
+ Requires-Dist: numpy<1.29.0
11
+ Requires-Dist: pyarrow==16.1.0
10
12
  Requires-Dist: boto3==1.34.69
11
13
  Requires-Dist: argparse
12
14
  Requires-Dist: mmh3
@@ -41,12 +43,14 @@ source venv/bin/activate
41
43
  or set up your IDE to use the venv directory when developing in this project
42
44
 
43
45
  ## Library Artifact Build and Publish
44
- To test, build and publish the library to artifactory
46
+ To test, build and publish the library
45
47
  ```shell
46
48
  make test build publish
47
49
  ```
50
+
48
51
  To up the version number, edit the Makefile to change VERSION and rerun
49
52
  the above. This will require committing both the `Makefile` and the
50
53
  automatically updated `pyproject.toml` file.
51
54
 
52
55
 
56
+
@@ -18,12 +18,14 @@ source venv/bin/activate
18
18
  or set up your IDE to use the venv directory when developing in this project
19
19
 
20
20
  ## Library Artifact Build and Publish
21
- To test, build and publish the library to artifactory
21
+ To test, build and publish the library
22
22
  ```shell
23
23
  make test build publish
24
24
  ```
25
+
25
26
  To up the version number, edit the Makefile to change VERSION and rerun
26
27
  the above. This will require committing both the `Makefile` and the
27
28
  automatically updated `pyproject.toml` file.
28
29
 
29
30
 
31
+
@@ -1,7 +1,8 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit"
3
- version = "0.2.0.dev5"
3
+ version = "0.2.1"
4
4
  requires-python = ">=3.10"
5
+ keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
5
6
  description = "Data Preparation Toolkit Library"
6
7
  license = {text = "Apache-2.0"}
7
8
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -10,12 +11,18 @@ authors = [
10
11
  { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
11
12
  ]
12
13
  dependencies = [
13
- "pyarrow==15.0.2",
14
+ "numpy < 1.29.0",
15
+ "pyarrow==16.1.0",
14
16
  "boto3==1.34.69",
15
17
  "argparse",
16
18
  "mmh3",
17
19
  ]
18
20
 
21
+ [project_urls]
22
+ Repository = "https://github.com/IBM/data-prep-kit"
23
+ Issues = "https://github.com/IBM/data-prep-kit/issues"
24
+ Documentation = "https://ibm.github.io/data-prep-kit/"
25
+
19
26
  [build-system]
20
27
  requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
21
28
  build-backend = "setuptools.build_meta"
@@ -34,7 +41,7 @@ dev = [
34
41
  ]
35
42
 
36
43
  [options]
37
- package_dir = ["src","test"]
44
+ package_dir = ["src"]
38
45
 
39
46
  [options.packages.find]
40
47
  where = ["src/data_processing"]
@@ -1,12 +1,14 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.0.dev5
3
+ Version: 0.2.1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
7
+ Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
7
8
  Requires-Python: >=3.10
8
9
  Description-Content-Type: text/markdown
9
- Requires-Dist: pyarrow==15.0.2
10
+ Requires-Dist: numpy<1.29.0
11
+ Requires-Dist: pyarrow==16.1.0
10
12
  Requires-Dist: boto3==1.34.69
11
13
  Requires-Dist: argparse
12
14
  Requires-Dist: mmh3
@@ -41,12 +43,14 @@ source venv/bin/activate
41
43
  or set up your IDE to use the venv directory when developing in this project
42
44
 
43
45
  ## Library Artifact Build and Publish
44
- To test, build and publish the library to artifactory
46
+ To test, build and publish the library
45
47
  ```shell
46
48
  make test build publish
47
49
  ```
50
+
48
51
  To up the version number, edit the Makefile to change VERSION and rerun
49
52
  the above. This will require committing both the `Makefile` and the
50
53
  automatically updated `pyproject.toml` file.
51
54
 
52
55
 
56
+
@@ -14,16 +14,20 @@ src/data_processing/data_access/data_access_factory.py
14
14
  src/data_processing/data_access/data_access_factory_base.py
15
15
  src/data_processing/data_access/data_access_local.py
16
16
  src/data_processing/data_access/data_access_s3.py
17
+ src/data_processing/data_access/snapshot_utils.py
17
18
  src/data_processing/runtime/__init__.py
18
19
  src/data_processing/runtime/execution_configuration.py
19
20
  src/data_processing/runtime/runtime_configuration.py
20
21
  src/data_processing/runtime/transform_file_processor.py
21
22
  src/data_processing/runtime/transform_launcher.py
22
23
  src/data_processing/runtime/pure_python/__init__.py
24
+ src/data_processing/runtime/pure_python/execution_configuration.py
23
25
  src/data_processing/runtime/pure_python/runtime_configuration.py
24
26
  src/data_processing/runtime/pure_python/transform_file_processor.py
27
+ src/data_processing/runtime/pure_python/transform_invoker.py
25
28
  src/data_processing/runtime/pure_python/transform_launcher.py
26
29
  src/data_processing/runtime/pure_python/transform_orchestrator.py
30
+ src/data_processing/runtime/pure_python/transform_runtime.py
27
31
  src/data_processing/test_support/__init__.py
28
32
  src/data_processing/test_support/abstract_test.py
29
33
  src/data_processing/test_support/data_access/__init__.py
@@ -31,10 +35,10 @@ src/data_processing/test_support/data_access/data_access_factory_test.py
31
35
  src/data_processing/test_support/launch/__init__.py
32
36
  src/data_processing/test_support/launch/transform_test.py
33
37
  src/data_processing/test_support/transform/__init__.py
38
+ src/data_processing/test_support/transform/binary_transform_test.py
34
39
  src/data_processing/test_support/transform/noop_transform.py
35
- src/data_processing/test_support/transform/transform_test.py
40
+ src/data_processing/test_support/transform/table_transform_test.py
36
41
  src/data_processing/transform/__init__.py
37
- src/data_processing/transform/abstract_transform.py
38
42
  src/data_processing/transform/binary_transform.py
39
43
  src/data_processing/transform/table_transform.py
40
44
  src/data_processing/transform/transform_configuration.py
@@ -44,7 +48,11 @@ src/data_processing/utils/cli_utils.py
44
48
  src/data_processing/utils/config.py
45
49
  src/data_processing/utils/log.py
46
50
  src/data_processing/utils/params_utils.py
51
+ src/data_processing/utils/pipinstaller.py
52
+ src/data_processing/utils/transform_configuration.json
53
+ src/data_processing/utils/transform_configurator.py
47
54
  src/data_processing/utils/transform_utils.py
55
+ src/data_processing/utils/unrecoverable.py
48
56
  test-data/data_processing/daf/input/ds1/sample1.parquet
49
57
  test-data/data_processing/daf/input/ds1/sample2.parquet
50
58
  test-data/data_processing/daf/input/ds2/sample3.parquet
@@ -55,15 +63,19 @@ test-data/data_processing/input_multiple/sample2.parquet
55
63
  test-data/data_processing/input_multiple/sample3.parquet
56
64
  test-data/data_processing/python/noop/expected/metadata.json
57
65
  test-data/data_processing/python/noop/expected/sample1.parquet
66
+ test-data/data_processing/python/noop/expected/test1.parquet
58
67
  test-data/data_processing/python/noop/expected/subdir/test1.parquet
59
68
  test-data/data_processing/python/noop/input/sample1.parquet
69
+ test-data/data_processing/python/noop/input/test1.parquet
60
70
  test-data/data_processing/python/noop/input/subdir/test1.parquet
61
71
  test/data_processing_tests/data_access/daf_local_test.py
62
72
  test/data_processing_tests/data_access/data_access_local_test.py
63
73
  test/data_processing_tests/data_access/data_access_s3_test.py
64
74
  test/data_processing_tests/data_access/sample_input_data_test.py
75
+ test/data_processing_tests/invoker/python_invoker_test.py
65
76
  test/data_processing_tests/launch/pure_python/launcher_test.py
66
77
  test/data_processing_tests/launch/pure_python/multi_launcher_test.py
67
78
  test/data_processing_tests/launch/pure_python/test_noop_launch.py
79
+ test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
68
80
  test/data_processing_tests/transform/test_noop.py
69
81
  test/data_processing_tests/util/transform_utils_test.py
@@ -1,4 +1,5 @@
1
- pyarrow==15.0.2
1
+ numpy<1.29.0
2
+ pyarrow==16.1.0
2
3
  boto3==1.34.69
3
4
  argparse
4
5
  mmh3
@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
4
4
  from data_processing.data_access.data_access_s3 import DataAccessS3
5
5
  from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
6
6
  from data_processing.data_access.data_access_factory import DataAccessFactory
7
+ from data_processing.data_access.snapshot_utils import SnapshotUtils
@@ -56,6 +56,7 @@ class ArrowS3:
56
56
  config=Config(retries={"max_attempts": s3_max_attempts, "mode": "standard"}),
57
57
  )
58
58
  self.retries = s3_retries
59
+ self.s3_max_attempts = s3_max_attempts
59
60
 
60
61
  @staticmethod
61
62
  def _get_bucket_key(key: str) -> tuple[str, str]:
@@ -68,144 +69,162 @@ class ArrowS3:
68
69
  return prefixes[0], "/".join(prefixes[1:])
69
70
 
70
71
  # get list of the files (names and sizes) for a given prefix (including bucket name)
71
- def list_files(self, key: str) -> list[dict[str, Any]]:
72
+ def list_files(self, key: str) -> tuple[list[dict[str, Any]], int]:
72
73
  """
73
74
  List files in the folder (hierarchically going through all sub-folders)
74
75
  :param key: complete folder name
75
- :return: list of dictionaries, containing file names and length
76
+ :return: list of dictionaries, containing file names and length and number of retries
76
77
  """
77
78
  bucket, prefix = self._get_bucket_key(key)
78
79
  # Use paginator here to get all the files rather than 1 page
79
80
  paginator = self.s3_client.get_paginator("list_objects_v2")
80
81
  pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
81
82
  files = []
83
+ retries = 0
82
84
  for page in pages:
83
85
  # For every page
86
+ retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
84
87
  for obj in page.get("Contents", []):
85
88
  # Get both file name and size
86
89
  files.append({"name": f"{bucket}/{obj['Key']}", "size": obj["Size"]})
87
- return files
90
+ return files, retries
88
91
 
89
- def list_folders(self, key: str) -> list[str]:
92
+ def list_folders(self, key: str) -> tuple[list[str], int]:
90
93
  """
91
94
  Get list of folders for folder
92
95
  :param key: complete folder
93
- :return: list of folders within a given folder
96
+ :return: list of folders within a given folder and number of retries
94
97
  """
95
- bucket, prefix = self._get_bucket_key(key)
96
-
97
- def _get_sub_folders(bck: str, p: str) -> list[str]:
98
+ def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]:
99
+ sub_folders = []
98
100
  # use paginator
99
101
  paginator = self.s3_client.get_paginator("list_objects_v2")
100
102
  # use Delimiter to get folders just folders
101
103
  page_iterator = paginator.paginate(Bucket=bck, Prefix=p, Delimiter="/")
102
- sub_folders = []
104
+ internal_retries = 0
103
105
  for page in page_iterator:
104
106
  # for every page
107
+ internal_retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
105
108
  for p in page.get("CommonPrefixes", []):
106
- sub_folders.append(p["Prefix"])
109
+ sf = p["Prefix"]
110
+ sub_folders.append(sf)
107
111
  # apply recursively
108
- sub_folders.extend(_get_sub_folders(bck, p["Prefix"]))
109
- return sub_folders
110
-
111
- prefixes = _get_sub_folders(bck=bucket, p=prefix)
112
- # remove base prefix
113
- return [p.removeprefix(prefix) for p in prefixes]
112
+ sf, r = _get_sub_folders(bck=bck, p=sf)
113
+ internal_retries += r
114
+ sub_folders.extend(sf)
115
+ return sub_folders, internal_retries
116
+ bucket, prefix = self._get_bucket_key(key)
117
+ subs, retries = _get_sub_folders(bck=bucket, p=prefix)
118
+ return [f"{bucket}/{f}" for f in subs], retries
114
119
 
115
- def read_file(self, key: str) -> bytes:
120
+ def read_file(self, key: str) -> tuple[bytes, int]:
116
121
  """
117
122
  Read an s3 file by name
118
123
  :param key: complete path
119
- :return: byte array of file content or None if the file does not exist
124
+ :return: byte array of file content or None if the file does not exist and a number of retries
120
125
  """
121
126
  bucket, prefix = self._get_bucket_key(key)
127
+ retries = 0
122
128
  for n in range(self.retries):
123
129
  try:
124
130
  obj = self.s3_client.get_object(Bucket=bucket, Key=prefix)
125
- return obj["Body"].read()
131
+ retries += obj.get("ResponseMetadata", {}).get("RetryAttempts", 0)
132
+ return obj["Body"].read(), retries
126
133
  except Exception as e:
127
134
  logger.error(f"failed to read file {key}, exception {e}, attempt {n}")
135
+ retries += self.s3_max_attempts
128
136
  logger.error(f"failed to read file {key} in {self.retries} attempts. Skipping it")
129
- return None
137
+ return None, retries
130
138
 
131
- def save_file(self, key: str, data: bytes) -> dict[str, Any]:
139
+ def save_file(self, key: str, data: bytes) -> tuple[dict[str, Any], int]:
132
140
  """
133
141
  Save file to S3
134
142
  :param key: complete path
135
143
  :param data: byte array of the file content
136
144
  :return: dictionary as
137
145
  defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
138
- in the case of failure dict is None
146
+ in the case of failure dict is None and the number of retries
139
147
  """
140
148
  bucket, prefix = self._get_bucket_key(key)
149
+ retries = 0
141
150
  for n in range(self.retries):
142
151
  try:
143
- return self.s3_client.put_object(Bucket=bucket, Key=prefix, Body=data)
152
+ res = self.s3_client.put_object(Bucket=bucket, Key=prefix, Body=data)
153
+ retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
154
+ return res, retries
144
155
  except Exception as e:
145
156
  logger.error(f"Failed to upload file to to key {key}, exception {e}")
157
+ retries += self.s3_max_attempts
146
158
  logger.error(f"Failed to upload file {key}, skipping it")
147
- return None
159
+ return None, retries
148
160
 
149
- def read_table(self, key: str, schema: pa.schema = None) -> pa.Table:
161
+ def read_table(self, key: str, schema: pa.schema = None) -> tuple[pa.Table, int]:
150
162
  """
151
163
  Get an arrow table from a file with a given name
152
164
  :param key: complete path
153
165
  :param schema: Schema used for reading table, default None
154
- :return: table or None if the read failed
166
+ :return: table or None if the read failed and the number of retries
155
167
  """
156
168
  # Read file as bytes
157
- data = self.read_file(key)
169
+ data, retries = self.read_file(key)
158
170
  if data is None:
159
- return None
160
- return TransformUtils.convert_binary_to_arrow(data=data, schema=schema)
171
+ return None, retries
172
+ return TransformUtils.convert_binary_to_arrow(data=data, schema=schema), retries
161
173
 
162
- def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any]]:
174
+ def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
163
175
  """
164
176
  Save an arrow table to a file with a name
165
177
  :param key: complete path
166
178
  :param table: table to save
167
179
  :return: table size and a dictionary as
168
180
  defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
169
- in the case of failure len is -1 and dict is None
181
+ in the case of failure len is -1 and dict is None and the number of retries
170
182
  """
171
183
  # convert to bytes
172
184
  data = TransformUtils.convert_arrow_to_binary(table=table)
173
185
  if data is None:
174
186
  return -1, None
175
187
  # save bytes
176
- return len(data), self.save_file(key, data)
188
+ res, retries = self.save_file(key, data)
189
+ return len(data), res, retries
177
190
 
178
- def delete_file(self, key: str) -> None:
191
+ def delete_file(self, key: str) -> int:
179
192
  """
180
193
  Delete file from S3
181
194
  :param key: complete path
182
- :return: None
195
+ :return: the number of retries
183
196
  """
184
197
  bucket, prefix = self._get_bucket_key(key)
198
+ retries = 0
185
199
  for n in range(self.retries):
186
200
  try:
187
- self.s3_client.delete_object(Bucket=bucket, Key=prefix)
188
- return None
201
+ res = self.s3_client.delete_object(Bucket=bucket, Key=prefix)
202
+ retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
203
+ return retries
189
204
  except Exception as e:
190
205
  logger.error(f"failed to delete file {key}, exception {e}")
191
- return None
206
+ retries += self.s3_max_attempts
207
+ return retries
192
208
 
193
- def move_file(self, source: str, dest: str) -> None:
209
+ def move_file(self, source: str, dest: str) -> int:
194
210
  """
195
211
  move file from source to destination
196
212
  :param source: complete source path
197
213
  :param dest: complete destination path
198
- :return: None
214
+ :return: number of retries
199
215
  """
200
216
  s_bucket, s_prefix = self._get_bucket_key(source)
201
217
  d_bucket, d_prefix = self._get_bucket_key(dest)
202
218
  # copy source to destination and then delete source
203
219
  copy_source = {"Bucket": s_bucket, "Key": s_prefix}
220
+ retries = 0
204
221
  for n in range(self.retries):
205
222
  try:
206
- self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
207
- self.delete_file(source)
208
- return None
223
+ res = self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
224
+ retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
225
+ retries += self.delete_file(source)
226
+ return retries
209
227
  except Exception as e:
210
228
  logger.error(f"failed to copy file {source} to {dest}, exception {e}")
211
- return None
229
+ retries += self.s3_max_attempts
230
+ return retries