data-prep-toolkit 0.0.1.dev12__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112):
  1. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/Makefile +10 -8
  2. {data_prep_toolkit-0.0.1.dev12/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.1.1}/PKG-INFO +3 -3
  3. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/README.md +2 -2
  4. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/pyproject.toml +1 -1
  5. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info}/PKG-INFO +3 -3
  6. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/SOURCES.txt +10 -24
  7. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_factory.py +6 -6
  8. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_local.py +16 -16
  9. data_prep_toolkit-0.1.1/src/data_processing/runtime/__init__.py +4 -0
  10. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/execution_configuration.py +8 -7
  11. data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/__init__.py +4 -0
  12. data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/runtime_configuration.py +24 -0
  13. data_prep_toolkit-0.1.1/src/data_processing/runtime/pure_python/transform_file_processor.py +53 -0
  14. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/pure_python/transform_launcher.py +11 -11
  15. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/pure_python/transform_orchestrator.py +13 -12
  16. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/__init__.py +4 -4
  17. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_orchestrator_configuration.py → data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/execution_configuration.py +3 -5
  18. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/ray_utils.py +2 -2
  19. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/runtime_configuration.py +38 -0
  20. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_file_processor.py +46 -0
  21. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_launcher.py +13 -21
  22. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_orchestrator.py +10 -10
  23. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_runtime.py +1 -1
  24. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/runtime/ray/transform_statistics.py +10 -4
  25. data_prep_toolkit-0.1.1/src/data_processing/runtime/runtime_configuration.py +64 -0
  26. data_prep_toolkit-0.1.1/src/data_processing/runtime/transform_file_processor.py +173 -0
  27. data_prep_toolkit-0.1.1/src/data_processing/runtime/transform_launcher.py +76 -0
  28. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/__init__.py +2 -1
  29. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/noop_transform.py +34 -29
  30. data_prep_toolkit-0.1.1/src/data_processing/transform/__init__.py +4 -0
  31. data_prep_toolkit-0.1.1/src/data_processing/transform/binary_transform.py +53 -0
  32. data_prep_toolkit-0.1.1/src/data_processing/transform/table_transform.py +116 -0
  33. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/transform/transform_configuration.py +35 -20
  34. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/params_utils.py +16 -1
  35. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/transform_utils.py +4 -9
  36. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/pure_python/launcher_test.py +3 -12
  37. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +77 -0
  38. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +7 -9
  39. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/launcher_test.py +16 -23
  40. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/multi_launcher_test.py +80 -0
  41. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -1
  42. data_prep_toolkit-0.0.1.dev12/doc/advanced-transform-tutorial.md +0 -284
  43. data_prep_toolkit-0.0.1.dev12/doc/architecture.md +0 -104
  44. data_prep_toolkit-0.0.1.dev12/doc/overview.md +0 -28
  45. data_prep_toolkit-0.0.1.dev12/doc/processing-architecture.jpg +0 -0
  46. data_prep_toolkit-0.0.1.dev12/doc/python-launcher-options.md +0 -60
  47. data_prep_toolkit-0.0.1.dev12/doc/python-runtime.md +0 -12
  48. data_prep_toolkit-0.0.1.dev12/doc/ray-launcher-options.md +0 -79
  49. data_prep_toolkit-0.0.1.dev12/doc/ray-runtime.md +0 -143
  50. data_prep_toolkit-0.0.1.dev12/doc/simplest-transform-tutorial.md +0 -211
  51. data_prep_toolkit-0.0.1.dev12/doc/testing-e2e-transform.md +0 -4
  52. data_prep_toolkit-0.0.1.dev12/doc/transform-external-resources.md +0 -224
  53. data_prep_toolkit-0.0.1.dev12/doc/transform-runtimes.md +0 -9
  54. data_prep_toolkit-0.0.1.dev12/doc/transform-s3-testing.md +0 -91
  55. data_prep_toolkit-0.0.1.dev12/doc/transform-standalone-testing.md +0 -99
  56. data_prep_toolkit-0.0.1.dev12/doc/transform-testing.md +0 -6
  57. data_prep_toolkit-0.0.1.dev12/doc/transform-tutorial-examples.md +0 -15
  58. data_prep_toolkit-0.0.1.dev12/doc/transform-tutorials.md +0 -67
  59. data_prep_toolkit-0.0.1.dev12/doc/transformer-utilities.md +0 -24
  60. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/__init__.py +0 -2
  61. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/__init__.py +0 -4
  62. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/python_launcher_configuration.py +0 -97
  63. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/pure_python/transform_table_processor.py +0 -191
  64. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_configuration.py +0 -33
  65. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_launch_configuration.py +0 -44
  66. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/ray/transform_table_processor.py +0 -191
  67. data_prep_toolkit-0.0.1.dev12/src/data_processing/runtime/transform_launcher.py +0 -25
  68. data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/__init__.py +0 -3
  69. data_prep_toolkit-0.0.1.dev12/src/data_processing/transform/table_transform.py +0 -50
  70. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/.gitignore +0 -0
  71. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/setup.cfg +0 -0
  72. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  73. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  74. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  75. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/__init__.py +0 -0
  76. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/__init__.py +0 -0
  77. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/arrow_s3.py +0 -0
  78. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access.py +0 -0
  79. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  80. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/data_access/data_access_s3.py +0 -0
  81. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/__init__.py +0 -0
  82. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/abstract_test.py +0 -0
  83. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/data_access/__init__.py +0 -0
  84. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  85. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/launch/__init__.py +0 -0
  86. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/launch/transform_test.py +0 -0
  87. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/test_support/transform/transform_test.py +0 -0
  88. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/transform/transform_statistics.py +0 -0
  89. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/__init__.py +2 -2
  90. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/cli_utils.py +0 -0
  91. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/config.py +0 -0
  92. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/src/data_processing/utils/log.py +0 -0
  93. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  94. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  95. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  96. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  97. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/launch/ray/ray_util_test.py +0 -0
  98. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/transform/test_noop.py +0 -0
  99. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  100. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  101. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  102. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  103. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  104. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input/sample1.parquet +0 -0
  105. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  106. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  107. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  108. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/metadata.json +0 -0
  109. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
  110. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
  111. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
  112. {data_prep_toolkit-0.0.1.dev12 → data_prep_toolkit-0.1.1}/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
@@ -1,7 +1,6 @@
1
1
  # Use make help, to see the available rules
2
- REPOROOT=../
3
- include ../.make.defaults
4
- include ../.make.versions
2
+ REPOROOT=../..
3
+ include $(REPOROOT)/.make.defaults
5
4
 
6
5
  TAG := "v${DPK_LIB_VERSION}"
7
6
 
@@ -53,10 +52,13 @@ venv:: pyproject.toml
53
52
  # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
54
53
  test::
55
54
  @# Help: Use the already-built virtual environment to run pytest on the test directory.
56
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
57
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
58
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python;
55
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
56
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
57
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
58
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/multi_launcher_test.py;
59
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
59
60
  source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
60
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
61
- source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
61
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/multi_launcher_test.py;
62
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
63
+ source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
62
64
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.0.1.dev12
3
+ Version: 0.1.1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -27,9 +27,9 @@ Requires-Dist: markupsafe==2.0.1; extra == "dev"
27
27
  # Data Processing Library
28
28
  This provides a python framework for developing _transforms_
29
29
  on data stored in files - currently parquet files are supported -
30
- and running them in a [ray](https://ray.com) cluster.
30
+ and running them in a [ray](https://www.ray.io/) cluster.
31
31
  Data files may be stored in the local file system or COS/S3.
32
- For more details see the [documentation](doc/overview.md).
32
+ For more details see the [documentation](../doc/overview.md).
33
33
 
34
34
  ### Virtual Environment
35
35
  The project uses `pyproject.toml` and a Makefile for operations.
@@ -1,9 +1,9 @@
1
1
  # Data Processing Library
2
2
  This provides a python framework for developing _transforms_
3
3
  on data stored in files - currently parquet files are supported -
4
- and running them in a [ray](https://ray.com) cluster.
4
+ and running them in a [ray](https://www.ray.io/) cluster.
5
5
  Data files may be stored in the local file system or COS/S3.
6
- For more details see the [documentation](doc/overview.md).
6
+ For more details see the [documentation](../doc/overview.md).
7
7
 
8
8
  ### Virtual Environment
9
9
  The project uses `pyproject.toml` and a Makefile for operations.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit"
3
- version = "0.0.1-dev12"
3
+ version = "0.1.1"
4
4
  requires-python = ">=3.10"
5
5
  description = "Data Preparation Toolkit Library"
6
6
  license = {text = "Apache-2.0"}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.0.1.dev12
3
+ Version: 0.1.1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -27,9 +27,9 @@ Requires-Dist: markupsafe==2.0.1; extra == "dev"
27
27
  # Data Processing Library
28
28
  This provides a python framework for developing _transforms_
29
29
  on data stored in files - currently parquet files are supported -
30
- and running them in a [ray](https://ray.com) cluster.
30
+ and running them in a [ray](https://www.ray.io/) cluster.
31
31
  Data files may be stored in the local file system or COS/S3.
32
- For more details see the [documentation](doc/overview.md).
32
+ For more details see the [documentation](../doc/overview.md).
33
33
 
34
34
  ### Virtual Environment
35
35
  The project uses `pyproject.toml` and a Makefile for operations.
@@ -2,24 +2,6 @@
2
2
  Makefile
3
3
  README.md
4
4
  pyproject.toml
5
- doc/advanced-transform-tutorial.md
6
- doc/architecture.md
7
- doc/overview.md
8
- doc/processing-architecture.jpg
9
- doc/python-launcher-options.md
10
- doc/python-runtime.md
11
- doc/ray-launcher-options.md
12
- doc/ray-runtime.md
13
- doc/simplest-transform-tutorial.md
14
- doc/testing-e2e-transform.md
15
- doc/transform-external-resources.md
16
- doc/transform-runtimes.md
17
- doc/transform-s3-testing.md
18
- doc/transform-standalone-testing.md
19
- doc/transform-testing.md
20
- doc/transform-tutorial-examples.md
21
- doc/transform-tutorials.md
22
- doc/transformer-utilities.md
23
5
  src/data_prep_toolkit.egg-info/PKG-INFO
24
6
  src/data_prep_toolkit.egg-info/SOURCES.txt
25
7
  src/data_prep_toolkit.egg-info/dependency_links.txt
@@ -35,22 +17,23 @@ src/data_processing/data_access/data_access_local.py
35
17
  src/data_processing/data_access/data_access_s3.py
36
18
  src/data_processing/runtime/__init__.py
37
19
  src/data_processing/runtime/execution_configuration.py
20
+ src/data_processing/runtime/runtime_configuration.py
21
+ src/data_processing/runtime/transform_file_processor.py
38
22
  src/data_processing/runtime/transform_launcher.py
39
23
  src/data_processing/runtime/pure_python/__init__.py
40
- src/data_processing/runtime/pure_python/python_launcher_configuration.py
24
+ src/data_processing/runtime/pure_python/runtime_configuration.py
25
+ src/data_processing/runtime/pure_python/transform_file_processor.py
41
26
  src/data_processing/runtime/pure_python/transform_launcher.py
42
27
  src/data_processing/runtime/pure_python/transform_orchestrator.py
43
- src/data_processing/runtime/pure_python/transform_table_processor.py
44
28
  src/data_processing/runtime/ray/__init__.py
29
+ src/data_processing/runtime/ray/execution_configuration.py
45
30
  src/data_processing/runtime/ray/ray_utils.py
46
- src/data_processing/runtime/ray/transform_configuration.py
47
- src/data_processing/runtime/ray/transform_launch_configuration.py
31
+ src/data_processing/runtime/ray/runtime_configuration.py
32
+ src/data_processing/runtime/ray/transform_file_processor.py
48
33
  src/data_processing/runtime/ray/transform_launcher.py
49
34
  src/data_processing/runtime/ray/transform_orchestrator.py
50
- src/data_processing/runtime/ray/transform_orchestrator_configuration.py
51
35
  src/data_processing/runtime/ray/transform_runtime.py
52
36
  src/data_processing/runtime/ray/transform_statistics.py
53
- src/data_processing/runtime/ray/transform_table_processor.py
54
37
  src/data_processing/test_support/__init__.py
55
38
  src/data_processing/test_support/abstract_test.py
56
39
  src/data_processing/test_support/data_access/__init__.py
@@ -61,6 +44,7 @@ src/data_processing/test_support/transform/__init__.py
61
44
  src/data_processing/test_support/transform/noop_transform.py
62
45
  src/data_processing/test_support/transform/transform_test.py
63
46
  src/data_processing/transform/__init__.py
47
+ src/data_processing/transform/binary_transform.py
64
48
  src/data_processing/transform/table_transform.py
65
49
  src/data_processing/transform/transform_configuration.py
66
50
  src/data_processing/transform/transform_statistics.py
@@ -88,8 +72,10 @@ test/data_processing_tests/data_access/data_access_local_test.py
88
72
  test/data_processing_tests/data_access/data_access_s3_test.py
89
73
  test/data_processing_tests/data_access/sample_input_data_test.py
90
74
  test/data_processing_tests/launch/pure_python/launcher_test.py
75
+ test/data_processing_tests/launch/pure_python/multi_launcher_test.py
91
76
  test/data_processing_tests/launch/pure_python/test_noop_launch.py
92
77
  test/data_processing_tests/launch/ray/launcher_test.py
78
+ test/data_processing_tests/launch/ray/multi_launcher_test.py
93
79
  test/data_processing_tests/launch/ray/ray_util_test.py
94
80
  test/data_processing_tests/launch/ray/test_noop_launch.py
95
81
  test/data_processing_tests/transform/test_noop.py
@@ -142,14 +142,14 @@ class DataAccessFactory(DataAccessFactoryBase):
142
142
  arg_dict = args
143
143
  else:
144
144
  raise ValueError("args must be Namespace or dictionary")
145
- s3_cred = arg_dict.get(f"{self.cli_arg_prefix}s3_cred")
146
- s3_config = arg_dict.get(f"{self.cli_arg_prefix}s3_config")
147
- local_config = arg_dict.get(f"{self.cli_arg_prefix}local_config")
148
- checkpointing = arg_dict.get(f"{self.cli_arg_prefix}checkpointing")
145
+ s3_cred = arg_dict.get(f"{self.cli_arg_prefix}s3_cred", None)
146
+ s3_config = arg_dict.get(f"{self.cli_arg_prefix}s3_config", None)
147
+ local_config = arg_dict.get(f"{self.cli_arg_prefix}local_config", None)
148
+ checkpointing = arg_dict.get(f"{self.cli_arg_prefix}checkpointing", False)
149
149
  max_files = arg_dict.get(f"{self.cli_arg_prefix}max_files", -1)
150
- data_sets = arg_dict.get(f"{self.cli_arg_prefix}data_sets")
150
+ data_sets = arg_dict.get(f"{self.cli_arg_prefix}data_sets", None)
151
151
  n_samples = arg_dict.get(f"{self.cli_arg_prefix}num_samples", -1)
152
- files_to_use = arg_dict.get(f"{self.cli_arg_prefix}files_to_use")
152
+ files_to_use = arg_dict.get(f"{self.cli_arg_prefix}files_to_use", [".parquet"])
153
153
  # check which configuration (S3, LakeHouse, or Local) is specified
154
154
  s3_config_specified = 1 if s3_config is not None else 0
155
155
  local_config_specified = 1 if local_config is not None else 0
@@ -318,32 +318,32 @@ class DataAccessLocal(DataAccess):
318
318
  metadata["source"] = {"name": self.input_folder, "type": "path"}
319
319
  metadata["target"] = {"name": self.output_folder, "type": "path"}
320
320
  return self.save_file(
321
- file_path=os.path.join(self.output_folder, "metadata.json"),
322
- bytes_data=json.dumps(metadata, indent=2).encode(),
321
+ path=os.path.join(self.output_folder, "metadata.json"),
322
+ data=json.dumps(metadata, indent=2).encode(),
323
323
  )
324
324
 
325
- def get_file(self, file_path: str) -> bytes:
325
+ def get_file(self, path: str) -> bytes:
326
326
  """
327
327
  Gets the contents of a file as a byte array, decompressing gz files if needed.
328
328
 
329
329
  Args:
330
- file_path (str): The path to the file.
330
+ path (str): The path to the file.
331
331
 
332
332
  Returns:
333
333
  bytes: The contents of the file as a byte array, or None if an error occurs.
334
334
  """
335
335
 
336
336
  try:
337
- if file_path.endswith(".gz"):
338
- with gzip.open(file_path, "rb") as f:
337
+ if path.endswith(".gz"):
338
+ with gzip.open(path, "rb") as f:
339
339
  data = f.read()
340
340
  else:
341
- with open(file_path, "rb") as f:
341
+ with open(path, "rb") as f:
342
342
  data = f.read()
343
343
  return data
344
344
 
345
345
  except (FileNotFoundError, gzip.BadGzipFile) as e:
346
- logger.error(f"Error reading file {file_path}: {e}")
346
+ logger.error(f"Error reading file {path}: {e}")
347
347
  raise e
348
348
 
349
349
  def get_folder_files(self, path: str, extensions: list[str] = None, return_data: bool = True) -> dict[str, bytes]:
@@ -374,13 +374,13 @@ class DataAccessLocal(DataAccess):
374
374
  matching_files[filename] = _get_file_content(filename, return_data)
375
375
  return matching_files
376
376
 
377
- def save_file(self, file_path: str, bytes_data: bytes) -> dict[str, Any]:
377
+ def save_file(self, path: str, data: bytes) -> dict[str, Any]:
378
378
  """
379
379
  Saves bytes to a file and returns a dictionary with file information.
380
380
 
381
381
  Args:
382
- bytes_data (bytes): The bytes data to save.
383
- file_path (str): The full name of the file to save.
382
+ data (bytes): The bytes data to save.
383
+ path (str): The full name of the file to save.
384
384
 
385
385
  Returns:
386
386
  dict or None: A dictionary with "name" and "size" keys if successful,
@@ -388,12 +388,12 @@ class DataAccessLocal(DataAccess):
388
388
  """
389
389
 
390
390
  try:
391
- os.makedirs(os.path.dirname(file_path), exist_ok=True)
392
- with open(file_path, "wb") as f:
393
- f.write(bytes_data)
394
- file_info = {"name": file_path, "size": os.path.getsize(file_path)}
391
+ os.makedirs(os.path.dirname(path), exist_ok=True)
392
+ with open(path, "wb") as f:
393
+ f.write(data)
394
+ file_info = {"name": path, "size": os.path.getsize(path)}
395
395
  return file_info
396
396
 
397
397
  except Exception as e:
398
- logger.error(f"Error saving bytes to file {file_path}: {e}")
398
+ logger.error(f"Error saving bytes to file {path}: {e}")
399
399
  return None
@@ -0,0 +1,4 @@
1
+ from data_processing.runtime.execution_configuration import TransformExecutionConfiguration
2
+ from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
3
+ from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
4
+ from data_processing.runtime.transform_file_processor import AbstractTransformFileProcessor
@@ -27,15 +27,17 @@ class TransformExecutionConfiguration(CLIArgumentProvider):
27
27
  A class specifying and validating transform execution configuration
28
28
  """
29
29
 
30
- def __init__(self, name: str, pp: bool = True):
30
+ def __init__(self, name: str, print_params: bool = True):
31
31
  """
32
32
  Initialization
33
+ :param name: job name
34
+ :param print_params: flag to print parameters
33
35
  """
34
36
  self.pipeline_id = ""
35
37
  self.job_details = {}
36
38
  self.code_location = {}
37
39
  self.name = name
38
- self.pp = pp
40
+ self.print_params = print_params
39
41
 
40
42
  def add_input_params(self, parser: argparse.ArgumentParser) -> None:
41
43
  """
@@ -74,10 +76,9 @@ class TransformExecutionConfiguration(CLIArgumentProvider):
74
76
  "job id": captured["job_id"],
75
77
  }
76
78
  self.code_location = captured["code_location"]
77
-
78
- if self.pp:
79
- # print parameters
80
- logger.info(f"pipeline id {self.pipeline_id}")
79
+ # print parameters
80
+ logger.info(f"pipeline id {self.pipeline_id}")
81
+ if self.print_params:
81
82
  logger.info(f"job details {self.job_details}")
82
- logger.info(f"code location {self.code_location}")
83
+ logger.info(f"code location {self.code_location}")
83
84
  return True
@@ -0,0 +1,4 @@
1
+ from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
2
+ from data_processing.runtime.pure_python.transform_file_processor import PythonTransformFileProcessor
3
+ from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
4
+ from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
@@ -0,0 +1,24 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.runtime import TransformRuntimeConfiguration
14
+ from data_processing.transform import TransformConfiguration
15
+
16
+
17
+ class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
18
+ def __init__(self, transform_config: TransformConfiguration):
19
+ """
20
+ Initialization
21
+ :param transform_config - base configuration class
22
+ """
23
+ self.transform_config = transform_config
24
+ super().__init__(transform_config=transform_config)
@@ -0,0 +1,53 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from typing import Any
14
+
15
+ from data_processing.data_access import DataAccessFactoryBase
16
+ from data_processing.runtime import (
17
+ AbstractTransformFileProcessor,
18
+ )
19
+ from data_processing.transform import TransformStatistics
20
+ from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
21
+
22
+
23
+ class PythonTransformFileProcessor(AbstractTransformFileProcessor):
24
+ """
25
+ This is the class implementing the worker class processing of a single file
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ data_access_factory: DataAccessFactoryBase,
31
+ statistics: TransformStatistics,
32
+ runtime_configuration: PythonTransformRuntimeConfiguration,
33
+ ):
34
+ """
35
+ Init method
36
+ :param data_access_factory - data access factory
37
+ :param statistics - reference to statistics class
38
+ :param runtime_configuration: transform configuration class
39
+ """
40
+ # Create data access
41
+ super().__init__()
42
+ self.data_access = data_access_factory.create_data_access()
43
+ # Add data access and statistics to the processor parameters
44
+ transform_params = dict(runtime_configuration.get_transform_params())
45
+ transform_params["data_access"] = self.data_access
46
+ transform_params["statistics"] = statistics
47
+ # Create local processor
48
+ self.transform = runtime_configuration.get_transform_class()(transform_params)
49
+ # Create statistics
50
+ self.stats = statistics
51
+
52
+ def _publish_stats(self, stats: dict[str, Any]) -> None:
53
+ self.stats.add_stats(stats)
@@ -15,9 +15,11 @@ import time
15
15
 
16
16
  from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
17
17
  from data_processing.runtime import TransformExecutionConfiguration
18
- from data_processing.runtime.pure_python import PythonLauncherConfiguration, orchestrate
18
+ from data_processing.runtime.pure_python import (
19
+ PythonTransformRuntimeConfiguration,
20
+ orchestrate,
21
+ )
19
22
  from data_processing.runtime.transform_launcher import AbstractTransformLauncher
20
- from data_processing.transform import TransformConfiguration
21
23
  from data_processing.utils import get_logger
22
24
 
23
25
 
@@ -31,18 +33,16 @@ class PythonTransformLauncher(AbstractTransformLauncher):
31
33
 
32
34
  def __init__(
33
35
  self,
34
- # transform_runtime_config: PythonLauncherConfiguration,
35
- transform_config: TransformConfiguration,
36
+ runtime_config: PythonTransformRuntimeConfiguration,
36
37
  data_access_factory: DataAccessFactoryBase = DataAccessFactory(),
37
38
  ):
38
39
  """
39
40
  Creates driver
40
- :param transform_runtime_config: transform runtime factory
41
+ :param runtime_config: transform runtime factory
41
42
  :param data_access_factory: the factory to create DataAccess instances.
42
43
  """
43
- super().__init__(transform_config, data_access_factory)
44
- self.transform_runtime_config = PythonLauncherConfiguration(transform_config)
45
- self.execution_config = TransformExecutionConfiguration(name=self.transform_runtime_config.get_name())
44
+ super().__init__(runtime_config, data_access_factory)
45
+ self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
46
46
 
47
47
  def __get_parameters(self) -> bool:
48
48
  """
@@ -57,12 +57,12 @@ class PythonTransformLauncher(AbstractTransformLauncher):
57
57
  formatter_class=argparse.RawTextHelpFormatter,
58
58
  )
59
59
  # add additional arguments
60
- self.transform_runtime_config.add_input_params(parser=parser)
60
+ self.runtime_config.add_input_params(parser=parser)
61
61
  self.data_access_factory.add_input_params(parser=parser)
62
62
  self.execution_config.add_input_params(parser=parser)
63
63
  args = parser.parse_args()
64
64
  return (
65
- self.transform_runtime_config.apply_input_params(args=args)
65
+ self.runtime_config.apply_input_params(args=args)
66
66
  and self.execution_config.apply_input_params(args=args)
67
67
  and self.data_access_factory.apply_input_params(args=args)
68
68
  )
@@ -78,7 +78,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
78
78
  logger.debug("Starting orchestrator")
79
79
  res = orchestrate(
80
80
  data_access_factory=self.data_access_factory,
81
- transform_config=self.transform_runtime_config,
81
+ runtime_config=self.runtime_config,
82
82
  execution_config=self.execution_config,
83
83
  )
84
84
  logger.debug("Completed orchestrator")
@@ -15,11 +15,11 @@ import traceback
15
15
  from datetime import datetime
16
16
 
17
17
  from data_processing.data_access import DataAccessFactoryBase
18
- from data_processing.runtime import TransformExecutionConfiguration
19
- from data_processing.runtime.pure_python import (
20
- PythonLauncherConfiguration,
21
- TransformTableProcessor,
18
+ from data_processing.runtime import (
19
+ TransformExecutionConfiguration,
20
+ TransformRuntimeConfiguration,
22
21
  )
22
+ from data_processing.runtime.pure_python import PythonTransformFileProcessor
23
23
  from data_processing.transform import TransformStatistics
24
24
  from data_processing.utils import get_logger
25
25
 
@@ -29,17 +29,18 @@ logger = get_logger(__name__)
29
29
 
30
30
  def orchestrate(
31
31
  data_access_factory: DataAccessFactoryBase,
32
- transform_config: PythonLauncherConfiguration,
32
+ runtime_config: TransformRuntimeConfiguration,
33
33
  execution_config: TransformExecutionConfiguration,
34
34
  ) -> int:
35
35
  """
36
36
  orchestrator for transformer execution
37
37
  :param data_access_factory: data access factory
38
- :param transform_config: transformer configuration
38
+ :param runtime_config: transformer configuration
39
+ :param execution_config: execution configuration
39
40
  :return: 0 - success or 1 - failure
40
41
  """
41
42
  start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
42
- logger.info(f"orchestrator {transform_config.get_name()} started at {start_ts}")
43
+ logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
43
44
  try:
44
45
  # create data access
45
46
  data_access = data_access_factory.create_data_access()
@@ -59,15 +60,15 @@ def orchestrate(
59
60
  # create statistics
60
61
  statistics = TransformStatistics()
61
62
  # create executor
62
- executor = TransformTableProcessor(
63
- data_access_factory=data_access_factory, statistics=statistics, params=transform_config
63
+ executor = PythonTransformFileProcessor(
64
+ data_access_factory=data_access_factory, statistics=statistics, runtime_configuration=runtime_config
64
65
  )
65
66
  # process data
66
- logger.debug(f"{transform_config.get_name()} Begin processing files")
67
+ logger.debug(f"{runtime_config.get_name()} Begin processing files")
67
68
  t_start = time.time()
68
69
  completed = 0
69
70
  for path in files:
70
- executor.process_data(path)
71
+ executor.process_file(path)
71
72
  completed += 1
72
73
  if completed % print_interval == 0:
73
74
  logger.info(f"Completed {completed} files in {(time.time() - t_start)/60} min")
@@ -81,7 +82,7 @@ def orchestrate(
81
82
  stats = statistics.get_execution_stats()
82
83
  # build and save metadata
83
84
  logger.debug("Building job metadata")
84
- input_params = transform_config.get_transform_metadata()
85
+ input_params = runtime_config.get_transform_metadata()
85
86
  metadata = {
86
87
  "pipeline": execution_config.pipeline_id,
87
88
  "job details": execution_config.job_details
@@ -1,8 +1,8 @@
1
1
  from data_processing.runtime.ray.ray_utils import RayUtils
2
2
  from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay
3
- from data_processing.runtime.ray.transform_table_processor import TransformTableProcessorRay
4
- from data_processing.runtime.ray.transform_runtime import DefaultTableTransformRuntimeRay
5
- from data_processing.runtime.ray.transform_launch_configuration import RayLauncherConfiguration
6
- from data_processing.runtime.ray.transform_orchestrator_configuration import TransformOrchestratorConfiguration
3
+ from data_processing.runtime.ray.transform_runtime import DefaultRayTransformRuntime
4
+ from data_processing.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
5
+ from data_processing.runtime.ray.transform_file_processor import RayTransformFileProcessor
6
+ from data_processing.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
7
7
  from data_processing.runtime.ray.transform_orchestrator import orchestrate
8
8
  from data_processing.runtime.ray.transform_launcher import RayTransformLauncher
@@ -24,7 +24,7 @@ logger = get_logger(__name__)
24
24
  cli_prefix = "runtime_"
25
25
 
26
26
 
27
- class TransformOrchestratorConfiguration(TransformExecutionConfiguration):
27
+ class RayTransformExecutionConfiguration(TransformExecutionConfiguration):
28
28
  """
29
29
  A class specifying and validating Ray orchestrator configuration
30
30
  """
@@ -33,7 +33,7 @@ class TransformOrchestratorConfiguration(TransformExecutionConfiguration):
33
33
  """
34
34
  Initialization
35
35
  """
36
- super().__init__(name=name, pp=False)
36
+ super().__init__(name=name, print_params=False)
37
37
  self.worker_options = {}
38
38
  self.n_workers = 1
39
39
  self.creation_delay = 0
@@ -91,10 +91,8 @@ class TransformOrchestratorConfiguration(TransformExecutionConfiguration):
91
91
 
92
92
  # print them
93
93
  logger.info(f"number of workers {self.n_workers} worker options {self.worker_options}")
94
- logger.info(f"pipeline id {self.pipeline_id}; number workers {self.n_workers}")
95
- logger.info(f"job details {self.job_details}")
96
- logger.info(f"code location {self.code_location}")
97
94
  logger.info(f"actor creation delay {self.creation_delay}")
95
+ logger.info(f"job details {self.job_details}")
98
96
  return True
99
97
 
100
98
  def get_input_params(self) -> dict[str, Any]:
@@ -132,12 +132,12 @@ class RayUtils:
132
132
  completed = 0
133
133
  for path in files:
134
134
  if executors.has_free(): # still have room
135
- executors.submit(lambda a, v: a.process_data.remote(v), path)
135
+ executors.submit(lambda a, v: a.process_file.remote(v), path)
136
136
  running = running + 1
137
137
  files_in_progress_gauge.set(running)
138
138
  else: # need to wait for some actors
139
139
  executors.get_next_unordered()
140
- executors.submit(lambda a, v: a.process_data.remote(v), path)
140
+ executors.submit(lambda a, v: a.process_file.remote(v), path)
141
141
  completed = completed + 1
142
142
  files_completed_gauge.set(completed)
143
143
  RayUtils.get_available_resources(
@@ -0,0 +1,38 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.runtime import TransformRuntimeConfiguration
14
+ from data_processing.runtime.ray import DefaultRayTransformRuntime
15
+ from data_processing.transform import TransformConfiguration
16
+
17
+
18
+ class RayTransformRuntimeConfiguration(TransformRuntimeConfiguration):
19
+ def __init__(
20
+ self,
21
+ transform_config: TransformConfiguration,
22
+ runtime_class: type[DefaultRayTransformRuntime] = DefaultRayTransformRuntime,
23
+ ):
24
+ """
25
+ Initialization
26
+ :param transform_config: base configuration class
27
+ :param runtime_class: implementation of the transform runtime
28
+ :param remove_from_metadata - list of parameters to remove from metadata
29
+ """
30
+ super().__init__(transform_config=transform_config)
31
+ self.runtime_class = runtime_class
32
+
33
+ def create_transform_runtime(self) -> DefaultRayTransformRuntime:
34
+ """
35
+ Create transform runtime with the parameters captured during apply_input_params()
36
+ :return: transform runtime object
37
+ """
38
+ return self.runtime_class(self.transform_config.get_transform_params())