data-prep-toolkit 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. data_prep_toolkit-0.0.1/.gitignore +35 -0
  2. data_prep_toolkit-0.0.1/Makefile +62 -0
  3. data_prep_toolkit-0.0.1/PKG-INFO +55 -0
  4. data_prep_toolkit-0.0.1/README.md +29 -0
  5. data_prep_toolkit-0.0.1/doc/advanced-transform-tutorial.md +284 -0
  6. data_prep_toolkit-0.0.1/doc/architecture.md +104 -0
  7. data_prep_toolkit-0.0.1/doc/launcher-options.md +84 -0
  8. data_prep_toolkit-0.0.1/doc/logo-ibm-dark.png +0 -0
  9. data_prep_toolkit-0.0.1/doc/logo-ibm.png +0 -0
  10. data_prep_toolkit-0.0.1/doc/overview.md +27 -0
  11. data_prep_toolkit-0.0.1/doc/processing-architecture.jpg +0 -0
  12. data_prep_toolkit-0.0.1/doc/simplest-transform-tutorial.md +198 -0
  13. data_prep_toolkit-0.0.1/doc/testing-e2e-transform.md +4 -0
  14. data_prep_toolkit-0.0.1/doc/testing-transforms.md +99 -0
  15. data_prep_toolkit-0.0.1/doc/transform-external-resources.md +224 -0
  16. data_prep_toolkit-0.0.1/doc/transform-tutorials.md +194 -0
  17. data_prep_toolkit-0.0.1/doc/transformer-utilities.md +24 -0
  18. data_prep_toolkit-0.0.1/doc/using_s3_transformers.md +91 -0
  19. data_prep_toolkit-0.0.1/pyproject.toml +52 -0
  20. data_prep_toolkit-0.0.1/setup.cfg +4 -0
  21. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/PKG-INFO +55 -0
  22. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/SOURCES.txt +87 -0
  23. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/dependency_links.txt +1 -0
  24. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/requires.txt +18 -0
  25. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/top_level.txt +1 -0
  26. data_prep_toolkit-0.0.1/src/data_processing/__init__.py +0 -0
  27. data_prep_toolkit-0.0.1/src/data_processing/data_access/__init__.py +6 -0
  28. data_prep_toolkit-0.0.1/src/data_processing/data_access/arrow_s3.py +223 -0
  29. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access.py +228 -0
  30. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_factory.py +252 -0
  31. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_factory_base.py +142 -0
  32. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_local.py +399 -0
  33. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_s3.py +344 -0
  34. data_prep_toolkit-0.0.1/src/data_processing/pure_python/__init__.py +4 -0
  35. data_prep_toolkit-0.0.1/src/data_processing/pure_python/python_launcher_configuration.py +99 -0
  36. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_launcher.py +99 -0
  37. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_orchestrator.py +103 -0
  38. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_table_processor.py +190 -0
  39. data_prep_toolkit-0.0.1/src/data_processing/ray/__init__.py +10 -0
  40. data_prep_toolkit-0.0.1/src/data_processing/ray/ray_utils.py +180 -0
  41. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_launcher.py +125 -0
  42. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_orchestrator.py +143 -0
  43. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_orchestrator_configuration.py +109 -0
  44. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_runtime.py +105 -0
  45. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_statistics.py +60 -0
  46. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_table_processor.py +191 -0
  47. data_prep_toolkit-0.0.1/src/data_processing/test_support/__init__.py +1 -0
  48. data_prep_toolkit-0.0.1/src/data_processing/test_support/abstract_test.py +185 -0
  49. data_prep_toolkit-0.0.1/src/data_processing/test_support/data_access/__init__.py +1 -0
  50. data_prep_toolkit-0.0.1/src/data_processing/test_support/data_access/data_access_factory_test.py +73 -0
  51. data_prep_toolkit-0.0.1/src/data_processing/test_support/ray/__init__.py +1 -0
  52. data_prep_toolkit-0.0.1/src/data_processing/test_support/ray/transform_test.py +88 -0
  53. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/__init__.py +7 -0
  54. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/noop_transform.py +146 -0
  55. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/transform_test.py +86 -0
  56. data_prep_toolkit-0.0.1/src/data_processing/transform/__init__.py +7 -0
  57. data_prep_toolkit-0.0.1/src/data_processing/transform/execution_configuration.py +83 -0
  58. data_prep_toolkit-0.0.1/src/data_processing/transform/launcher_configuration.py +62 -0
  59. data_prep_toolkit-0.0.1/src/data_processing/transform/table_transform.py +50 -0
  60. data_prep_toolkit-0.0.1/src/data_processing/transform/transform_statistics.py +43 -0
  61. data_prep_toolkit-0.0.1/src/data_processing/utils/__init__.py +5 -0
  62. data_prep_toolkit-0.0.1/src/data_processing/utils/cli_utils.py +80 -0
  63. data_prep_toolkit-0.0.1/src/data_processing/utils/config.py +46 -0
  64. data_prep_toolkit-0.0.1/src/data_processing/utils/log.py +59 -0
  65. data_prep_toolkit-0.0.1/src/data_processing/utils/params_utils.py +153 -0
  66. data_prep_toolkit-0.0.1/src/data_processing/utils/transform_utils.py +195 -0
  67. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/daf_local_test.py +29 -0
  68. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/data_access_local_test.py +621 -0
  69. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/data_access_s3_test.py +137 -0
  70. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/sample_input_data_test.py +37 -0
  71. data_prep_toolkit-0.0.1/test/data_processing_tests/pure_python/launcher_test.py +226 -0
  72. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/launcher_test.py +287 -0
  73. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/ray_util_test.py +105 -0
  74. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/test_noop_launch.py +35 -0
  75. data_prep_toolkit-0.0.1/test/data_processing_tests/transform/test_noop.py +36 -0
  76. data_prep_toolkit-0.0.1/test/data_processing_tests/util/transform_utils_test.py +32 -0
  77. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  78. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  79. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  80. data_prep_toolkit-0.0.1/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  81. data_prep_toolkit-0.0.1/test-data/data_processing/input/sample1.parquet +0 -0
  82. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  83. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  84. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  85. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/metadata.json +46 -0
  86. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
  87. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
  88. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
  89. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
@@ -0,0 +1,35 @@
1
+
2
+
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+
10
+ # Distribution / packaging
11
+ bin/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # Installer logs
26
+ pip-log.txt
27
+ pip-delete-this-directory.txt
28
+
29
+ # Unit test / coverage reports
30
+ .tox/
31
+ htmlcov
32
+ .coverage
33
+ .cache
34
+ nosetests.xml
35
+ coverage.xml
@@ -0,0 +1,62 @@
+ # Use make help to see the available rules
+ REPOROOT=../
+ include ../.make.defaults
+ include ../.make.versions
+
+ TAG := "v${DPK_LIB_VERSION}"
+
+
+ clean::
+ 	@# Help: Clean up the distribution build and the venv
+ 	rm -rf dist venv
+ 	rm -rf src/*egg-info
+
+ .check-env::
+ 	@echo "Checks passed"
+
+ update-toml:: .check-env
+ 	@# Help: Copy the Makefile distribution version into the pyproject.toml
+ 	sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
+ 	mv tt.toml pyproject.toml
+
+ setup::
+
+ build:: update-toml venv
+ 	@# Help: Build the distribution for publishing to PyPI
+ 	rm -r dist || true
+ 	rm -rf src/*egg-info || true
+ 	${PIP} install --upgrade build
+ 	${PYTHON} -m build
+
+ publish:: .check-env update-toml
+ 	@# Help: Publish the project to PyPI
+ 	${PYTHON} -m twine check dist/*
+ 	${PYTHON} -m twine upload --verbose --non-interactive dist/*
+ 	#@echo "create a git tag to reference published version"
+ 	#@git tag ${TAG}
+ 	#@git push origin ${TAG}
+
+ venv:: pyproject.toml
+ 	@# Help: Create the virtual environment using pyproject.toml
+ 	rm -r dist venv || true
+ 	rm -rf src/*egg-info || true
+ 	rm makeenv || true
+ 	$(PYTHON) -m venv venv
+ 	source venv/bin/activate; \
+ 	pip install --upgrade pip; \
+ 	pip install -e .; \
+ 	pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1
+
+
+ # Here we run each test directory of tests and each Ray-launched test separately, because
+ # it seems that when running multiple Ray launch tests in a single pytest run there is some sort of ray.init() duplication.
+ # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped.
+ test::
+ 	@# Help: Use the already-built virtual environment to run pytest on the test directory.
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/pure_python;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/ray_util_test.py;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/launcher_test.py;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/test_noop_launch.py;
+
@@ -0,0 +1,55 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit
3
+ Version: 0.0.1
4
+ Summary: Data Preparation Laboratory Library
5
+ Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: ray[default]==2.9.3
10
+ Requires-Dist: pyarrow==15.0.2
11
+ Requires-Dist: boto3==1.34.69
12
+ Requires-Dist: argparse
13
+ Requires-Dist: mmh3
14
+ Requires-Dist: fastapi>=0.109.1
15
+ Requires-Dist: pillow>=10.2.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: twine; extra == "dev"
18
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
19
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
20
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
21
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
22
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
23
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
24
+ Requires-Dist: moto==5.0.5; extra == "dev"
25
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
26
+
27
+ # Data Processing Library
28
+ This provides a Python framework for developing _transforms_
29
+ on data stored in files - currently parquet files are supported -
30
+ and running them in a [ray](https://ray.com) cluster.
31
+ Data files may be stored in the local file system or COS/S3.
32
+ For more details see the [documentation](doc/overview.md).
33
+
34
+ ### Virtual Environment
35
+ The project uses `pyproject.toml` and a Makefile for operations.
36
+ To do development you should establish the virtual environment
37
+ ```shell
38
+ make venv
39
+ ```
40
+ and then either activate it
41
+ ```shell
42
+ source venv/bin/activate
43
+ ```
44
+ or set up your IDE to use the venv directory when developing in this project.
45
+
46
+ ## Library Artifact Build and Publish
47
+ To test, build and publish the library to artifactory
48
+ ```shell
49
+ make test build publish
50
+ ```
51
+ To bump the version number, edit the Makefile to change VERSION and rerun
52
+ the above. This will require committing both the `Makefile` and the
53
+ automatically updated `pyproject.toml` file.
54
+
55
+
@@ -0,0 +1,29 @@
1
+ # Data Processing Library
2
+ This provides a Python framework for developing _transforms_
3
+ on data stored in files - currently parquet files are supported -
4
+ and running them in a [ray](https://ray.com) cluster.
5
+ Data files may be stored in the local file system or COS/S3.
6
+ For more details see the [documentation](doc/overview.md).
7
+
8
+ ### Virtual Environment
9
+ The project uses `pyproject.toml` and a Makefile for operations.
10
+ To do development you should establish the virtual environment
11
+ ```shell
12
+ make venv
13
+ ```
14
+ and then either activate it
15
+ ```shell
16
+ source venv/bin/activate
17
+ ```
18
+ or set up your IDE to use the venv directory when developing in this project.
19
+
20
+ ## Library Artifact Build and Publish
21
+ To test, build and publish the library to artifactory
22
+ ```shell
23
+ make test build publish
24
+ ```
25
+ To bump the version number, edit the Makefile to change VERSION and rerun
26
+ the above. This will require committing both the `Makefile` and the
27
+ automatically updated `pyproject.toml` file.
28
+
29
+
@@ -0,0 +1,284 @@
1
+ # Advanced Transform Tutorial
2
+
3
+ In this example, we implement an [ededup](../../transforms/universal/ededup) transform that
4
+ removes duplicate documents across all files. In this tutorial, we will show the following:
5
+
6
+ * How to write the `ededup` transform to generate the output table.
7
+ * How to define transform-specific metadata that can be associated
8
+ with each table transformation and aggregated across all transformations
9
+ in a single run of the transform.
10
+ * How to implement custom `TransformRuntime` to create supporting Ray objects and supplement
11
+ transform-specific metadata with information about these statistics.
12
+ * How to define command line arguments that can be used to configure
13
+ the operation of our _ededup_ transform.
14
+
15
+ The complete task involves the following:
16
+ * EdedupTransform - class that implements the specific transformation
17
+ * EdedupRuntime - class that implements custom TransformRuntime to create supporting Ray objects and enhance job output
18
+ statistics
19
+ * EdedupTableTransformConfiguration - class that provides configuration for the
20
+ EdedupTransform and EdedupRuntime, including transform runtime class and the command line arguments used to
21
+ configure them.
22
+ * main() - simple creation and use of the TransformLauncher.
23
+
24
+ Currently, the complete code for the ededup transform used for this
25
+ tutorial can be found in the
26
+ [ededup transform](../../transforms/universal/ededup) directory.
27
+
28
+ Finally, we show how to use the command line to run the transform in a local Ray cluster.
29
+
30
+ ## HashFilter
31
+
32
+ One of the basic components of the exact dedup implementation is a cache of hashes, so we will start
33
+ by implementing this support actor. The implementation is fairly straightforward and can be
34
+ found [here](../../transforms/universal/ededup/src/ededup_transform.py).
35
+
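+ To make this concrete, below is a minimal sketch of what such a hash-cache actor could look like.
+ It is illustrative only: the `get_unique()` helper and the memory accounting are assumptions made for
+ this sketch, and only `get_hash_size()` (used later by `compute_execution_stats()`) mirrors the real actor;
+ the actual implementation is in the source linked above.
+
+ ```python
+ import sys
+
+ import ray
+ from data_processing.utils import GB
+
+
+ @ray.remote
+ class HashFilter:
+     """Sketch of a hash-cache actor; not the actual ededup implementation."""
+
+     def __init__(self, params: dict):
+         self.hashes = set()
+
+     def get_unique(self, hashes: list[str]) -> list[str]:
+         # Hypothetical helper: return only the hashes not seen before and remember them.
+         unique = [h for h in hashes if h not in self.hashes]
+         self.hashes.update(unique)
+         return unique
+
+     def get_hash_size(self) -> tuple[int, float]:
+         # Cache size and a rough memory estimate in GB, matching the (size, memory)
+         # pair consumed by EdedupRuntime.compute_execution_stats() below.
+         return len(self.hashes), sys.getsizeof(self.hashes) / GB
+ ```
+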
36
+ ## EdedupTransform
37
+
38
+ First, let's define the transform class. To do this we extend
39
+ the base abstract/interface class
40
+ [AbstractTableTransform](../src/data_processing/transform/table_transform.py),
41
+ which requires definition of the following:
42
+ * an initializer (i.e. `__init__()`) that accepts a dictionary of configuration
43
+ data. For this example, the configuration data will only be defined by
44
+ command line arguments (defined below).
45
+ * the `transform()` method itself that takes an input table and produces an output
46
+ table and any associated metadata for that table transformation.
47
+
48
+ Other methods such as `flush()` need not be overridden/redefined for this example.
49
+
50
+ We start with the simple definition of the class, its initializer and the imports required
51
+ by subsequent code:
52
+
53
+ ```python
+ from argparse import ArgumentParser, Namespace
+ from typing import Any
+
+ import pyarrow as pa
+ import ray
+ from data_processing.data_access import DataAccessFactory
+ from data_processing.ray import (
+     RayLauncherConfiguration,
+     DefaultTableTransformRuntimeRay,
+     RayUtils,
+     RayTransformLauncher,
+ )
+ from data_processing.transform import AbstractTableTransform
+ from data_processing.utils import GB, TransformUtils
+ from ray.actor import ActorHandle
+
+
+ class EdedupTransform(AbstractTableTransform):
+
+     def __init__(self, config: dict):
+         super().__init__(config)
+         self.doc_column = config.get("doc_column", "")
+         self.hashes = config.get("hashes", [])
+ ```
78
+ The `EdedupTransform` class extends the `AbstractTableTransform`, which defines the required methods.
79
+
80
+ For purposes of this tutorial, our initializer allows our transform to be
81
+ configured with a document column name and a list of hash actors
82
+ that are used during the call to `transform()`.
83
+ Configuration is provided by the framework as a dictionary passed to the initializer.
84
+ Below we will cover how `doc_column` and `hashes` arguments are made available to the initializer.
85
+
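+ As a purely illustrative aside (not part of the tutorial code), the configuration dictionary handed to the
+ initializer might look roughly like this, with `doc_column` captured from the CLI option defined later in this
+ tutorial and `hashes` injected by the runtime's `get_transform_config()` (both names are taken from the code below):
+
+ ```python
+ # Illustrative only: the shape of the configuration dictionary built by the framework.
+ config = {
+     "doc_column": "contents",  # captured from the --ededup_doc_column CLI option defined below
+     "hashes": [],              # in a real run, a list of handles to the HashFilter actors
+ }
+ transform = EdedupTransform(config)
+ ```
+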
86
+ Next we define the `transform()` method itself, which includes the addition of some
87
+ metadata.
88
+
89
+ ```python
+ def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+     if not TransformUtils.validate_columns(table=table, required=[self.doc_column]):
+         return [], {}
+     # Inner variables
+     hashes = set()
+     unique = []
+     hd = {}
+     # Compute unique hashes for the table
+     for text in table[self.doc_column]:
+         # Compute doc hash
+         h = TransformUtils.str_to_hash(TransformUtils.normalize_string(str(text)))
+         if h not in hashes:  # Processing this hash for the first time
+             hashes.add(h)  # Remember it locally
+             hd[h] = str(text)
+             if len(hd) >= REQUEST_LEN:  # time to check remotely
+                 unique = unique + self._process_remote_hashes(hd=hd)
+                 hd = {}
+     if len(hd) > 0:  # Process remaining hashes
+         unique = unique + self._process_remote_hashes(hd=hd)
+
+     # Remove duplicates
+     unique_set = set(unique)
+     mask = [False] * table.num_rows
+     index = 0
+     for text in table[self.doc_column]:
+         str_text = str(text)
+         if str_text in unique_set:
+             mask[index] = True
+             unique_set.remove(str_text)
+         index += 1
+     # Create output table
+     out_table = table.filter(mask)
+     # report statistics
+     stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows}
+     return [out_table], stats
+ ```
126
+ The single input to this method is the in-memory pyarrow table to be transformed.
127
+ The return of this function is a list of tables and optional metadata. In this
128
+ case of a simple 1:1 table conversion, the list will contain a single table, the result of removing
129
+ duplicates from the input table.
130
+
131
+ The metadata is a free-form dictionary of keys with numeric values that will be aggregated
132
+ by the framework and reported as aggregated job statistics metadata.
133
+ If there is no metadata then simply return an empty dictionary.
134
+
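+ As a simplified illustration of that aggregation (not the framework's actual code), summing the numeric values
+ of the per-table dictionaries yields the job-level totals:
+
+ ```python
+ from collections import Counter
+
+ # Hypothetical per-table statistics returned by two transform() calls
+ per_table_stats = [
+     {"source_documents": 100, "result_documents": 95},
+     {"source_documents": 200, "result_documents": 180},
+ ]
+ totals = Counter()
+ for stats in per_table_stats:
+     totals.update(stats)
+ print(dict(totals))  # {'source_documents': 300, 'result_documents': 275}
+ ```
+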
135
+ ## EdedupRuntime
136
+
137
+ First, let's define the transform runtime class. To do this we extend
138
+ the base abstract/interface class
139
+ [DefaultTableTransformRuntime](../src/data_processing/ray/transform_runtime.py),
140
+ which requires definition of the following:
141
+ * an initializer (i.e. `__init__()`) that accepts a dictionary of configuration
142
+ data. For this example, the configuration data will only be defined by
143
+ command line arguments (defined below).
144
+ * the `get_transform_config()` method that takes `data_access_factory`, `statistics actor`, and
145
+ `list of files to process` and produces a dictionary of parameters used by transform.
146
+ * the `compute_execution_stats()` method that takes a dictionary of metadata, enhances it and
147
+ produces an enhanced metadata dictionary.
148
+
149
+ We start with the simple definition of the class and its initializer
150
+
151
+ ```python
+ class EdedupRuntime(DefaultTableTransformRuntime):
+
+     def __init__(self, params: dict[str, Any]):
+         super().__init__(params)
+         self.filters = []
+ ```
158
+ Next we define the `get_transform_config()` method, which, in this case, creates supporting Ray Actors and
159
+ adds their handles to the transform parameters.
160
+
161
+ ```python
+ def get_transform_config(
+     self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
+ ) -> dict[str, Any]:
+     self.filters = RayUtils.create_actors(
+         clazz=HashFilter,
+         params={},
+         actor_options={"num_cpus": self.params.get("hash_cpu", 0.5)},
+         n_actors=self.params.get("num_hashes", 1),
+     )
+     return {"hashes": self.filters} | self.params
+ ```
173
+ The input to this method includes a set of parameters that might not all be needed by this transform;
174
+ it is, rather, a superset of the parameters that can be used by the different implementations of the transform runtime
175
+ (see, for example, [fuzzy dedup](../../transforms/universal/fdedup)).
176
+ The return of this method is a dictionary of information used for transform initialization. In this
177
+ implementation we add additional parameters to the input dictionary, but in general it can be a completely
178
+ new dictionary built here.
179
+
180
+ Finally, we define the `compute_execution_stats()` method, which enhances the metadata collected by the statistics
181
+ class.
182
+
183
+ ```python
+ def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
+     # Get filters stats
+     sum_hash = 0
+     sum_hash_mem = 0
+     remote_replies = [f.get_hash_size.remote() for f in self.filters]
+     while remote_replies:
+         # Wait for replies
+         ready, not_ready = ray.wait(remote_replies)
+         for r in ready:
+             h_size, h_memory = ray.get(r)
+             sum_hash = sum_hash + h_size
+             sum_hash_mem = sum_hash_mem + h_memory
+         remote_replies = not_ready
+     dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1))
+     return {"number of hashes": sum_hash, "hash memory, GB": sum_hash_mem, "de duplication %": dedup_prst} | stats
+ ```
200
+ The input to this method is a dictionary of metadata collected by the statistics object. The method enhances it with information
201
+ collected from the hash actors and with custom computations based on the statistics data.
202
+
203
+ ## EdedupTableTransformConfiguration
204
+
205
+ The final class we need to implement is the `EdedupTableTransformConfiguration` class, whose initializer
206
+ defines the following:
207
+
208
+ * The short name for the transform
209
+ * The class implementing the transform - in our case EdedupTransform
210
+ * The transform runtime class to be used - in our case EdedupRuntime
211
+ * Command line argument support.
212
+
213
+ First we define the class and its initializer,
214
+
215
+ ```python
+ short_name = "ededup"
+ cli_prefix = f"{short_name}_"
+
+ class EdedupTableTransformConfiguration(DefaultTableTransformConfiguration):
+     def __init__(self):
+         super().__init__(name=short_name, runtime_class=EdedupRuntime, transform_class=EdedupTransform)
+         self.params = {}
+ ```
224
+
225
+ The initializer extends the DefaultTableTransformConfiguration, which provides simple
226
+ capture of our configuration data and allows it to be pickled and sent across the network.
227
+ It also adds a `params` field that will be used below to hold the transform's
228
+ configuration data (used in `EdedupRuntime.__init__()` above).
229
+
230
+ Next, we provide two methods that define and capture the command line configuration that
231
+ is specific to the `EdedupTransform`, in this case the number of hash actors, the CPUs per hash actor and the document column name.
232
+ First we define the method that establishes the command line arguments.
233
+ This method is given a global argument parser to which the `EdedupTransform` arguments are added.
234
+ It is good practice to include a common prefix on all transform-specific options (i.e. pii, lang, etc).
235
+ In our case we will use `ededup_`.
236
+
237
+ ```python
+ def add_input_params(self, parser: ArgumentParser) -> None:
+     parser.add_argument(f"--{cli_prefix}hash_cpu", type=float, default=0.5, help="number of CPUs per hash")
+     parser.add_argument(f"--{cli_prefix}num_hashes", type=int, default=0, help="number of hash actors to use")
+     parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="key for accessing data")
+ ```
243
+ Next we implement a method that is called after the framework has parsed the CLI args
244
+ and which allows us to capture the `EdedupTransform`-specific arguments and optionally validate them.
245
+
246
+ ```python
+ def apply_input_params(self, args: Namespace) -> bool:
+     captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+     self.params = self.params | captured
+     if self.params["num_hashes"] <= 0:
+         logger.info(f"Number of hashes should be greater than zero, provided {args.num_hashes}")
+         return False
+     logger.info(f"exact dedup params are {self.params}")
+     return True
+ ```
256
+
257
+ ## main()
258
+
259
+ Next, we show how to launch the framework with the `EdedupTransform` using the
260
+ framework's `TransformLauncher` class.
261
+
262
+ ```python
+ if __name__ == "__main__":
+     launcher = TransformLauncher(transform_runtime_config=EdedupTableTransformConfiguration())
+     launcher.launch()
+ ```
267
+ The launcher requires only an instance of DefaultTableTransformConfiguration
268
+ (our `EdedupTableTransformConfiguration` class).
269
+ A single method `launch()` is then invoked to run the transform in a Ray cluster.
270
+
271
+ ## Running
272
+
273
+ Assuming the above `main()` is placed in `ededup_transform.py` we can run the transform on data
274
+ in COS as follows:
275
+
276
+ ```shell
277
+ python ededup_transform.py --hash_cpu 0.5 --num_hashes 2 --doc_column "contents" \
278
+ --run_locally True \
279
+ --s3_cred "{'access_key': 'KEY', 'secret_key': 'SECRET', 'cos_url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud'}" \
280
+ --s3_config "{'input_folder': 'cos-optimal-llm-pile/test/david/input/', 'output_folder': 'cos-optimal-llm-pile/test/david/output/'}"
281
+ ```
282
+ This is a minimal set of options to run locally.
283
+ See the [launcher options](launcher-options.md) for a complete list of
284
+ transform-independent command line options.
@@ -0,0 +1,104 @@
1
+ # Data Processing Architecture
2
+
3
+ In this section we cover the high-level architecture and some of the core components.
4
+
5
+ Transform implementation and examples are provided in the [tutorial](transform-tutorials.md).
6
+
7
+ ## Architecture
8
+
9
+ The architecture is a "standard" implementation of [Embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) to
10
+ process many input files in parallel using a distributed network of RayWorkers.
11
+
12
+ ![Processing Architecture](processing-architecture.jpg)
13
+
14
+ The architecture includes the following core components:
15
+
16
+ * [RayLauncher](../src/data_processing/ray/transform_launcher.py) accepts and validates
17
+ CLI parameters to establish the Ray Orchestrator with the proper configuration.
18
+ It uses the following components, all of which can/do define CLI configuration parameters:
19
+ * [Transform Orchestrator Configuration](../src/data_processing/ray/transform_orchestrator_configuration.py) is responsible
20
+ for defining and validating infrastructure parameters
21
+ (e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
22
+ (several dictionaries) and is fully pickleable. As a result, the framework uses its instance as a
23
+ parameter in remote function/actor invocations.
24
+ * [DataAccessFactory](../src/data_processing/data_access/data_access_factory.py) - provides the
25
+ configuration for the type of DataAccess to use when reading/writing the input/output data for
26
+ the transforms. Similar to Transform Orchestrator Configuration, this is a pickleable
27
+ instance that is passed between Launcher, Orchestrator and Workers.
28
+ * [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - defines specifics
29
+ of the transform implementation including transform implementation class, its short name, any transform-
30
+ specific CLI parameters, and an optional TransformRuntime class, discussed below.
31
+
32
+ After all parameters are validated, the Ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguration
33
+ and TransformConfiguration are given to the Ray Orchestrator via a Ray remote() method invocation.
34
+ The Launcher waits for the Ray Orchestrator to complete.
35
+ * [Ray Orchestrator](../src/data_processing/ray/transform_orchestrator.py) is responsible for overall management of
36
+ the data processing job. It creates the actors, determines the set of input data and distributes the
37
+ references to the data files to be processed by the workers. More specifically, it performs the following:
38
+ 1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files
39
+ to be processed.
40
+ 2. Uses the TransformConfiguration to create the TransformRuntime instance.
41
+ 3. Uses the TransformRuntime to optionally apply additional configuration (ray object storage, etc) for the configuration
42
+ and operation of the Transform.
43
+ 4. Uses the TransformOrchestratorConfiguration to determine the set of RayWorkers to create
44
+ to execute transformers in parallel, providing the following to each worker:
45
+ * Ray worker configuration
46
+ * DataAccessFactory
47
+ * Transform class and its TransformConfiguration containing the CLI parameters and any TransformRuntime additions.
48
+ 5. In a load-balanced, round-robin fashion, distributes the names of the input files to the workers for them to transform/process.
49
+
50
+ Additionally, to provide monitoring of long-running transforms, the orchestrator is instrumented with
51
+ [custom metrics](https://docs.ray.io/en/latest/ray-observability/user-guides/add-app-metrics.html), which are exported to localhost:8080 (this is the endpoint that
52
+ Prometheus would be configured to scrape).
53
+ Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor)
54
+ and build and save them in the form of execution metadata (`metadata.json`). Finally, it will return the execution
55
+ result to the Launcher.
56
+ * [Ray worker](../src/data_processing/ray/transform_table_processor.py) is responsible for
57
+ reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
58
+ assigned by the orchestrator, applying the transform to the input table and writing out the
59
+ resulting table(s). Metadata produced by each table transformation is aggregated into
60
+ Transform Statistics (below).
61
+ * [Transform Statistics](../src/data_processing/ray/transform_statistics.py) is a general
62
+ purpose data collector actor aggregating the numeric metadata from different places of
63
+ the framework (especially metadata produced by the transform).
64
+ These statistics are reported as metadata (`metadata.json`) by the orchestrator upon completion.
65
+
66
+ ## Core Components
67
+ Some of the core components used by the architecture are defined here:
68
+
69
+ * [CLIProvider](../src/data_processing/utils/cli_utils.py) - provides a general purpose
70
+ mechanism for defining, validating and sharing CLI parameters.
71
+ It is used by the DataAccessFactory and Transform Configuration (below).
72
+ * Data Access is an abstraction layer for the different types of data access supported by the framework. The main components
73
+ of this layer are:
74
+ * [Data Access](../src/data_processing/data_access/data_access.py) is the basic interface for the data access, and enables the identification of
75
+ input files to process, associated output files, checkpointing and general file reading/writing.
76
+ Currently, the framework implements several concrete implementations of the Data Access, including
77
+ [local data support](../src/data_processing/data_access/data_access_local.py) and
78
+ [s3](../src/data_processing/data_access/data_access_s3.py). Additional Data Access implementations can be added as required.
79
+ * [Data Access Factory](../src/data_processing/data_access/data_access_factory.py) is an implementation of the
80
+ [factory design pattern](https://www.pentalog.com/blog/design-patterns/factory-method-design-pattern/) for creation
81
+ of the data access instances. Data Access Factory, as a CLIProvider, enables the definition of CLI
82
+ parameters that configure the instance of Data Access to be created. Data Access factory has very simple state
83
+ (several dictionaries) and is fully pickleable. The framework uses the Data Access Factory instance as a
84
+ parameter in remote function/actor invocations.
85
+
86
+
87
+ ## Transforms
88
+ A brief discussion of the Transform components is provided here.
89
+ For a more complete discussion, see the [tutorials](transform-tutorials.md).
90
+
91
+ * [Transform](../src/data_processing/transform/table_transform.py) - defines the methods required
92
+ of any transform implementation - `transform()` and `flush()` - and provides the bulk of any transform implementation
93
+ that converts one Table to 0 or more new Tables. In general, this is not tied to the above Ray infrastructure
94
+ and so can usually be used independently of Ray; a minimal sketch is shown after this list.
95
+ * [TransformRuntime](../src/data_processing/ray/transform_runtime.py) - this class only needs to be
96
+ extended/implemented when additional Ray components (actors, shared memory objects, etc.) are used
97
+ by the transform. The main method `get_transform_config()` is used to enable these extensions.
98
+ * [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - this is the bootstrap
99
+ class provided to the Launcher that enables the instantiation of the Transform and the TransformRuntime within
100
+ the architecture. It is a CLIProvider, which allows it to define transform-specific CLI configuration
101
+ that is made available to the Transform's initializer.
102
+
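+ As a minimal sketch of the Transform interface described above (assuming the `AbstractTableTransform`
+ signatures shown in the tutorials), a transform that simply passes tables through unchanged looks roughly like this:
+
+ ```python
+ from typing import Any
+
+ import pyarrow as pa
+ from data_processing.transform import AbstractTableTransform
+
+
+ class PassThroughTransform(AbstractTableTransform):
+     """Sketch only: returns the input table unchanged and reports a simple row count."""
+
+     def __init__(self, config: dict):
+         super().__init__(config)
+
+     def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+         # 1:1 conversion: one input table produces one output table plus numeric metadata.
+         return [table], {"rows": table.num_rows}
+ ```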
103
+
104
+
@@ -0,0 +1,84 @@
1
+ # Launcher Command Line Options
2
+ A number of command line options are available when launching a transform.
3
+
4
+ The following is a current --help output (a work in progress) for
5
+ the `NOOPTransform` (note the --noop_sleep_sec option):
6
+
7
+ ```
8
+ usage: noop_transform.py [-h]
9
+ [--run_locally RUN_LOCALLY]
10
+ [--noop_sleep_sec NOOP_SLEEP_SEC]
11
+ [--data_s3_cred DATA_S3_CRED]
12
+ [--data_s3_config DATA_S3_CONFIG]
13
+ [--data_local_config DATA_LOCAL_CONFIG]
14
+ [--data_max_files DATA_MAX_FILES]
15
+ [--data_checkpointing DATA_CHECKPOINTING]
16
+ [--data_data_sets DATA_DATA_SETS]
17
+ [--data_max_files MAX_FILES]
18
+ [--data_files_to_use DATA_FILES_TO_USE]
19
+ [--data_num_samples DATA_NUM_SAMPLES]
20
+ [--runtime_num_workers NUM_WORKERS]
21
+ [--runtime_worker_options WORKER_OPTIONS]
22
+ [--runtime_pipeline_id PIPELINE_ID] [--job_id JOB_ID]
23
+ [--runtime_creation_delay CREATION_DELAY]
24
+ [--runtime_code_location CODE_LOCATION]
25
+
26
+ Driver for NOOP processing
27
+
28
+ options:
29
+ -h, --help show this help message and exit
30
+ --run_locally RUN_LOCALLY
31
+ running ray local flag
32
+ --noop_sleep_sec NOOP_SLEEP_SEC
33
+ Sleep actor for a number of seconds while processing the data frame, before writing the file to COS
34
+ --data_s3_cred S3_CRED
35
+ AST string of options for cos credentials. Only required for COS or Lakehouse.
36
+ access_key: access key help text
37
+ secret_key: secret key help text
38
+ cos_url: COS url
39
+ Example: { 'access_key': 'access', 'secret_key': 'secret', 's3_url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud' }
40
+ --data_s3_config S3_CONFIG
41
+ AST string containing input/output paths.
42
+ input_folder: Path to input folder of files to be processed
43
+ output_folder: Path to output folder of processed files
44
+ Example: { 'input_folder': 'your input folder', 'output_folder ': 'your output folder' }
45
+ --data_local_config LOCAL_CONFIG
46
+ ast string containing input/output folders using local fs.
47
+ input_folder: Path to input folder of files to be processed
48
+ output_folder: Path to output folder of processed files
49
+ Example: { 'input_folder': './input', 'output_folder': '/tmp/output' }
50
+ --data_max_files MAX_FILES
51
+ Max amount of files to process
52
+ --data_checkpointing CHECKPOINTING
53
+ checkpointing flag
54
+ --data_data_sets DATA_SETS
55
+ List of data sets
56
+ --data_files_to_use DATA_FILES_TO_USE
57
+ files extensions to use, default .parquet
58
+ --data_num_samples DATA_NUM_SAMPLES
59
+ number of randomly picked files to use
60
+ --runtime_num_workers NUM_WORKERS
61
+ number of workers
62
+ --runtime_worker_options WORKER_OPTIONS
63
+ AST string defining worker resource requirements.
64
+ num_cpus: Required number of CPUs.
65
+ num_gpus: Required number of GPUs
66
+ resources: The complete list can be found at
67
+ https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html#ray.remote_function.RemoteFunction.options
68
+ and contains accelerator_type, memory, name, num_cpus, num_gpus, object_store_memory, placement_group,
69
+ placement_group_bundle_index, placement_group_capture_child_tasks, resources, runtime_env,
70
+ scheduling_strategy, _metadata, concurrency_groups, lifetime, max_concurrency, max_restarts,
71
+ max_task_retries, max_pending_calls, namespace, get_if_exists
72
+ Example: { 'num_cpus': '8', 'num_gpus': '1', 'resources': '{"special_hardware": 1, "custom_label": 1}' }
73
+ --runtime_pipeline_id PIPELINE_ID
74
+ pipeline id
75
+ --runtime_job_id JOB_ID job id
76
+ --runtime_creation_delay CREATION_DELAY
77
+ delay between actors' creation
78
+ --runtime_code_location CODE_LOCATION
79
+ AST string containing code location
80
+ github: Github repository URL.
81
+ commit_hash: github commit hash
82
+ path: Path within the repository
83
+ Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '13241231asdfaed', 'path': 'transforms/universal/ededup' }
84
+ ```
Binary file