data-prep-toolkit 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. data_prep_toolkit-0.0.1/.gitignore +35 -0
  2. data_prep_toolkit-0.0.1/Makefile +62 -0
  3. data_prep_toolkit-0.0.1/PKG-INFO +55 -0
  4. data_prep_toolkit-0.0.1/README.md +29 -0
  5. data_prep_toolkit-0.0.1/doc/advanced-transform-tutorial.md +284 -0
  6. data_prep_toolkit-0.0.1/doc/architecture.md +104 -0
  7. data_prep_toolkit-0.0.1/doc/launcher-options.md +84 -0
  8. data_prep_toolkit-0.0.1/doc/logo-ibm-dark.png +0 -0
  9. data_prep_toolkit-0.0.1/doc/logo-ibm.png +0 -0
  10. data_prep_toolkit-0.0.1/doc/overview.md +27 -0
  11. data_prep_toolkit-0.0.1/doc/processing-architecture.jpg +0 -0
  12. data_prep_toolkit-0.0.1/doc/simplest-transform-tutorial.md +198 -0
  13. data_prep_toolkit-0.0.1/doc/testing-e2e-transform.md +4 -0
  14. data_prep_toolkit-0.0.1/doc/testing-transforms.md +99 -0
  15. data_prep_toolkit-0.0.1/doc/transform-external-resources.md +224 -0
  16. data_prep_toolkit-0.0.1/doc/transform-tutorials.md +194 -0
  17. data_prep_toolkit-0.0.1/doc/transformer-utilities.md +24 -0
  18. data_prep_toolkit-0.0.1/doc/using_s3_transformers.md +91 -0
  19. data_prep_toolkit-0.0.1/pyproject.toml +52 -0
  20. data_prep_toolkit-0.0.1/setup.cfg +4 -0
  21. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/PKG-INFO +55 -0
  22. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/SOURCES.txt +87 -0
  23. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/dependency_links.txt +1 -0
  24. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/requires.txt +18 -0
  25. data_prep_toolkit-0.0.1/src/data_prep_toolkit.egg-info/top_level.txt +1 -0
  26. data_prep_toolkit-0.0.1/src/data_processing/__init__.py +0 -0
  27. data_prep_toolkit-0.0.1/src/data_processing/data_access/__init__.py +6 -0
  28. data_prep_toolkit-0.0.1/src/data_processing/data_access/arrow_s3.py +223 -0
  29. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access.py +228 -0
  30. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_factory.py +252 -0
  31. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_factory_base.py +142 -0
  32. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_local.py +399 -0
  33. data_prep_toolkit-0.0.1/src/data_processing/data_access/data_access_s3.py +344 -0
  34. data_prep_toolkit-0.0.1/src/data_processing/pure_python/__init__.py +4 -0
  35. data_prep_toolkit-0.0.1/src/data_processing/pure_python/python_launcher_configuration.py +99 -0
  36. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_launcher.py +99 -0
  37. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_orchestrator.py +103 -0
  38. data_prep_toolkit-0.0.1/src/data_processing/pure_python/transform_table_processor.py +190 -0
  39. data_prep_toolkit-0.0.1/src/data_processing/ray/__init__.py +10 -0
  40. data_prep_toolkit-0.0.1/src/data_processing/ray/ray_utils.py +180 -0
  41. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_launcher.py +125 -0
  42. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_orchestrator.py +143 -0
  43. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_orchestrator_configuration.py +109 -0
  44. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_runtime.py +105 -0
  45. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_statistics.py +60 -0
  46. data_prep_toolkit-0.0.1/src/data_processing/ray/transform_table_processor.py +191 -0
  47. data_prep_toolkit-0.0.1/src/data_processing/test_support/__init__.py +1 -0
  48. data_prep_toolkit-0.0.1/src/data_processing/test_support/abstract_test.py +185 -0
  49. data_prep_toolkit-0.0.1/src/data_processing/test_support/data_access/__init__.py +1 -0
  50. data_prep_toolkit-0.0.1/src/data_processing/test_support/data_access/data_access_factory_test.py +73 -0
  51. data_prep_toolkit-0.0.1/src/data_processing/test_support/ray/__init__.py +1 -0
  52. data_prep_toolkit-0.0.1/src/data_processing/test_support/ray/transform_test.py +88 -0
  53. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/__init__.py +7 -0
  54. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/noop_transform.py +146 -0
  55. data_prep_toolkit-0.0.1/src/data_processing/test_support/transform/transform_test.py +86 -0
  56. data_prep_toolkit-0.0.1/src/data_processing/transform/__init__.py +7 -0
  57. data_prep_toolkit-0.0.1/src/data_processing/transform/execution_configuration.py +83 -0
  58. data_prep_toolkit-0.0.1/src/data_processing/transform/launcher_configuration.py +62 -0
  59. data_prep_toolkit-0.0.1/src/data_processing/transform/table_transform.py +50 -0
  60. data_prep_toolkit-0.0.1/src/data_processing/transform/transform_statistics.py +43 -0
  61. data_prep_toolkit-0.0.1/src/data_processing/utils/__init__.py +5 -0
  62. data_prep_toolkit-0.0.1/src/data_processing/utils/cli_utils.py +80 -0
  63. data_prep_toolkit-0.0.1/src/data_processing/utils/config.py +46 -0
  64. data_prep_toolkit-0.0.1/src/data_processing/utils/log.py +59 -0
  65. data_prep_toolkit-0.0.1/src/data_processing/utils/params_utils.py +153 -0
  66. data_prep_toolkit-0.0.1/src/data_processing/utils/transform_utils.py +195 -0
  67. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/daf_local_test.py +29 -0
  68. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/data_access_local_test.py +621 -0
  69. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/data_access_s3_test.py +137 -0
  70. data_prep_toolkit-0.0.1/test/data_processing_tests/data_access/sample_input_data_test.py +37 -0
  71. data_prep_toolkit-0.0.1/test/data_processing_tests/pure_python/launcher_test.py +226 -0
  72. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/launcher_test.py +287 -0
  73. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/ray_util_test.py +105 -0
  74. data_prep_toolkit-0.0.1/test/data_processing_tests/ray/test_noop_launch.py +35 -0
  75. data_prep_toolkit-0.0.1/test/data_processing_tests/transform/test_noop.py +36 -0
  76. data_prep_toolkit-0.0.1/test/data_processing_tests/util/transform_utils_test.py +32 -0
  77. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  78. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  79. data_prep_toolkit-0.0.1/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  80. data_prep_toolkit-0.0.1/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  81. data_prep_toolkit-0.0.1/test-data/data_processing/input/sample1.parquet +0 -0
  82. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  83. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  84. data_prep_toolkit-0.0.1/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  85. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/metadata.json +46 -0
  86. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/sample1.parquet +0 -0
  87. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/expected/subdir/test1.parquet +0 -0
  88. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/input/sample1.parquet +0 -0
  89. data_prep_toolkit-0.0.1/test-data/data_processing/ray/noop/input/subdir/test1.parquet +0 -0
@@ -0,0 +1,35 @@
1
+
2
+
3
+
4
+ # Byte-compiled / optimized / DLL files
5
+ __pycache__/
6
+ *.py[cod]
7
+ *$py.class
8
+
9
+
10
+ # Distribution / packaging
11
+ bin/
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ *.egg-info/
22
+ .installed.cfg
23
+ *.egg
24
+
25
+ # Installer logs
26
+ pip-log.txt
27
+ pip-delete-this-directory.txt
28
+
29
+ # Unit test / coverage reports
30
+ .tox/
31
+ htmlcov
32
+ .coverage
33
+ .cache
34
+ nosetests.xml
35
+ coverage.xml
@@ -0,0 +1,62 @@
+ # Use make help to see the available rules
+ REPOROOT=../
+ include ../.make.defaults
+ include ../.make.versions
+
+ TAG := "v${DPK_LIB_VERSION}"
+
+
+ clean::
+ 	@# Help: Clean up the distribution build and the venv
+ 	rm -rf dist venv
+ 	rm -rf src/*egg-info
+
+ .check-env::
+ 	@echo "Checks passed"
+
+ update-toml:: .check-env
+ 	@# Help: Copy the Makefile distribution version into the pyproject.toml
+ 	sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
+ 	mv tt.toml pyproject.toml
+
+ setup::
+
+ build:: update-toml venv
+ 	@# Help: Build the distribution for publishing to PyPI
+ 	rm -r dist || true
+ 	rm -rf src/*egg-info || true
+ 	${PIP} install --upgrade build
+ 	${PYTHON} -m build
+
+ publish:: .check-env update-toml
+ 	@# Help: Publish the project to PyPI
+ 	${PYTHON} -m twine check dist/*
+ 	${PYTHON} -m twine upload --verbose --non-interactive dist/*
+ 	#@echo "create a git tag to reference published version"
+ 	#@git tag ${TAG}
+ 	#@git push origin ${TAG}
+
+ venv:: pyproject.toml
+ 	@# Help: Create the virtual environment using pyproject.toml
+ 	rm -r dist venv || true
+ 	rm -rf src/*egg-info || true
+ 	rm makeenv || true
+ 	$(PYTHON) -m venv venv
+ 	source venv/bin/activate; \
+ 	pip install --upgrade pip; \
+ 	pip install -e .; \
+ 	pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1
+
+
+ # Here we run each test directory of tests and each Ray-launched test separately, because
+ # it seems that when running multiple Ray launch tests in a single pytest run there is some sort of ray.init() duplication.
+ # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped.
+ test::
+ 	@# Help: Use the already-built virtual environment to run pytest on the test directory.
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/transform;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/pure_python;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/ray_util_test.py;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/launcher_test.py;
+ 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/ray/test_noop_launch.py;
+
@@ -0,0 +1,55 @@
1
+ Metadata-Version: 2.1
2
+ Name: data_prep_toolkit
3
+ Version: 0.0.1
4
+ Summary: Data Preparation Laboratory Library
5
+ Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: ray[default]==2.9.3
10
+ Requires-Dist: pyarrow==15.0.2
11
+ Requires-Dist: boto3==1.34.69
12
+ Requires-Dist: argparse
13
+ Requires-Dist: mmh3
14
+ Requires-Dist: fastapi>=0.109.1
15
+ Requires-Dist: pillow>=10.2.0
16
+ Provides-Extra: dev
17
+ Requires-Dist: twine; extra == "dev"
18
+ Requires-Dist: pytest>=7.3.2; extra == "dev"
19
+ Requires-Dist: pytest-dotenv>=0.5.2; extra == "dev"
20
+ Requires-Dist: pytest-env>=1.0.0; extra == "dev"
21
+ Requires-Dist: pre-commit>=3.3.2; extra == "dev"
22
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
23
+ Requires-Dist: pytest-mock>=3.10.0; extra == "dev"
24
+ Requires-Dist: moto==5.0.5; extra == "dev"
25
+ Requires-Dist: markupsafe==2.0.1; extra == "dev"
26
+
27
+ # Data Processing Library
28
+ This provides a Python framework for developing _transforms_
29
+ on data stored in files - currently parquet files are supported -
30
+ and running them in a [ray](https://ray.com) cluster.
31
+ Data files may be stored in the local file system or COS/S3.
32
+ For more details see the [documentation](doc/overview.md).
33
+
34
+ ### Virtual Environment
35
+ The project uses `pyproject.toml` and a Makefile for operations.
36
+ To do development you should establish the virtual environment
37
+ ```shell
38
+ make venv
39
+ ```
40
+ and then either activate it
41
+ ```shell
42
+ source venv/bin/activate
43
+ ```
44
+ or set up your IDE to use the venv directory when developing in this project.
45
+
46
+ ## Library Artifact Build and Publish
47
+ To test, build and publish the library to artifactory
48
+ ```shell
49
+ make test build publish
50
+ ```
51
+ To bump the version number, edit the Makefile to change VERSION and rerun
52
+ the above. This will require committing both the `Makefile` and the
53
+ automatically updated `pyproject.toml` file.
54
+
55
+
@@ -0,0 +1,29 @@
1
+ # Data Processing Library
2
+ This provides a Python framework for developing _transforms_
3
+ on data stored in files - currently parquet files are supported -
4
+ and running them in a [ray](https://ray.com) cluster.
5
+ Data files may be stored in the local file system or COS/S3.
6
+ For more details see the [documentation](doc/overview.md).
7
+
8
+ ### Virtual Environment
9
+ The project uses `pyproject.toml` and a Makefile for operations.
10
+ To do development you should establish the virtual environment
11
+ ```shell
12
+ make venv
13
+ ```
14
+ and then either activate it
15
+ ```shell
16
+ source venv/bin/activate
17
+ ```
18
+ or set up your IDE to use the venv directory when developing in this project.
19
+
20
+ ## Library Artifact Build and Publish
21
+ To test, build and publish the library to artifactory
22
+ ```shell
23
+ make test build publish
24
+ ```
25
+ To bump the version number, edit the Makefile to change VERSION and rerun
26
+ the above. This will require committing both the `Makefile` and the
27
+ automatically updated `pyproject.toml` file.
28
+
29
+
@@ -0,0 +1,284 @@
1
+ # Advanced Transform Tutorial
2
+
3
+ In this example, we implement an [ededup](../../transforms/universal/ededup) transform that
4
+ removes duplicate documents across all files. In this tutorial, we will show the following:
5
+
6
+ * How to write the `ededup` transform to generate the output table.
7
+ * How to define transform-specific metadata that can be associated
8
+ with each table transformation and aggregated across all transformations
9
+ in a single run of the transform.
10
+ * How to implement custom `TransformRuntime` to create supporting Ray objects and supplement
11
+ transform-specific metadata with information about these statistics.
12
+ * How to define command line arguments that can be used to configure
13
+ the operation of our _ededup_ transform.
14
+
15
+ The complete task involves the following:
16
+ * EdedupTransform - class that implements the specific transformation
17
+ * EdedupRuntime - class that implements custom TransformRuntime to create supporting Ray objects and enhance job output
18
+ statistics
19
+ * EdedupTableTransformConfiguration - class that provides configuration for the
20
+ EdedupTransform and EdedupRuntime, including transform runtime class and the command line arguments used to
21
+ configure them.
22
+ * main() - simple creation and use of the TransformLauncher.
23
+
24
+ Currently, the complete code for the ededup transform used for this
25
+ tutorial can be found in the
26
+ [ededup transform](../../transforms/universal/ededup) directory.
27
+
28
+ Finally, we show how to use the command line to run the transform in a local Ray cluster.
29
+
30
+ ## HashFilter
31
+
32
+ One of the basic components of the exact dedup implementation is a cache of hashes, so we will start
33
+ by implementing this support actor. The implementation is fairly straightforward and can be
34
+ found [here](../../transforms/universal/ededup/src/ededup_transform.py).
35
+
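+ To make this concrete, below is a minimal sketch of what such a hash-cache actor could look like.
+ It is illustrative only: the `get_unique()` helper and the memory accounting are assumptions made for
+ this sketch, and only `get_hash_size()` (used later by `compute_execution_stats()`) mirrors the real actor;
+ the actual implementation is in the source linked above.
+
+ ```python
+ import sys
+
+ import ray
+ from data_processing.utils import GB
+
+
+ @ray.remote
+ class HashFilter:
+     """Sketch of a hash-cache actor; not the actual ededup implementation."""
+
+     def __init__(self, params: dict):
+         self.hashes = set()
+
+     def get_unique(self, hashes: list[str]) -> list[str]:
+         # Hypothetical helper: return only the hashes not seen before and remember them.
+         unique = [h for h in hashes if h not in self.hashes]
+         self.hashes.update(unique)
+         return unique
+
+     def get_hash_size(self) -> tuple[int, float]:
+         # Cache size and a rough memory estimate in GB, matching the (size, memory)
+         # pair consumed by EdedupRuntime.compute_execution_stats() below.
+         return len(self.hashes), sys.getsizeof(self.hashes) / GB
+ ```
+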
36
+ ## EdedupTransform
37
+
38
+ First, let's define the transform class. To do this we extend
39
+ the base abstract/interface class
40
+ [AbstractTableTransform](../src/data_processing/transform/table_transform.py),
41
+ which requires definition of the following:
42
+ * an initializer (i.e. `__init__()`) that accepts a dictionary of configuration
43
+ data. For this example, the configuration data will only be defined by
44
+ command line arguments (defined below).
45
+ * the `transform()` method itself that takes an input table and produces an output
46
+ table and any associated metadata for that table transformation.
47
+
48
+ Other methods such as `flush()` need not be overridden/redefined for this example.
49
+
50
+ We start with the simple definition of the class, its initializer and the imports required
51
+ by subsequent code:
52
+
53
+ ```python
+ from argparse import ArgumentParser, Namespace
+ from typing import Any
+
+ import pyarrow as pa
+ import ray
+ from data_processing.data_access import DataAccessFactory
+ from data_processing.ray import (
+     RayLauncherConfiguration,
+     DefaultTableTransformRuntimeRay,
+     RayUtils,
+     RayTransformLauncher,
+ )
+ from data_processing.transform import AbstractTableTransform
+ from data_processing.utils import GB, TransformUtils
+ from ray.actor import ActorHandle
+
+
+ class EdedupTransform(AbstractTableTransform):
+
+     def __init__(self, config: dict):
+         super().__init__(config)
+         self.doc_column = config.get("doc_column", "")
+         self.hashes = config.get("hashes", [])
+ ```
78
+ The `EdedupTransform` class extends the `AbstractTableTransform`, which defines the required methods.
79
+
80
+ For purposes of this tutorial, our initializer allows our transform to be
81
+ configured with a document column name and a list of hash actors
82
+ that are used during the call to `transform()`.
83
+ Configuration is provided by the framework as a dictionary passed to the initializer.
84
+ Below we will cover how `doc_column` and `hashes` arguments are made available to the initializer.
85
+
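+ As a purely illustrative aside (not part of the tutorial code), the configuration dictionary handed to the
+ initializer might look roughly like this, with `doc_column` captured from the CLI option defined later in this
+ tutorial and `hashes` injected by the runtime's `get_transform_config()` (both names are taken from the code below):
+
+ ```python
+ # Illustrative only: the shape of the configuration dictionary built by the framework.
+ config = {
+     "doc_column": "contents",  # captured from the --ededup_doc_column CLI option defined below
+     "hashes": [],              # in a real run, a list of handles to the HashFilter actors
+ }
+ transform = EdedupTransform(config)
+ ```
+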
86
+ Next we define the `transform()` method itself, which includes the addition of some
87
+ metadata.
88
+
89
+ ```python
+ def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+     if not TransformUtils.validate_columns(table=table, required=[self.doc_column]):
+         return [], {}
+     # Inner variables
+     hashes = set()
+     unique = []
+     hd = {}
+     # Compute unique hashes for the table
+     for text in table[self.doc_column]:
+         # Compute doc hash
+         h = TransformUtils.str_to_hash(TransformUtils.normalize_string(str(text)))
+         if h not in hashes:  # Processing this hash for the first time
+             hashes.add(h)  # Remember it locally
+             hd[h] = str(text)
+             if len(hd) >= REQUEST_LEN:  # time to check remotely
+                 unique = unique + self._process_remote_hashes(hd=hd)
+                 hd = {}
+     if len(hd) > 0:  # Process remaining hashes
+         unique = unique + self._process_remote_hashes(hd=hd)
+
+     # Remove duplicates
+     unique_set = set(unique)
+     mask = [False] * table.num_rows
+     index = 0
+     for text in table[self.doc_column]:
+         str_text = str(text)
+         if str_text in unique_set:
+             mask[index] = True
+             unique_set.remove(str_text)
+         index += 1
+     # Create output table
+     out_table = table.filter(mask)
+     # report statistics
+     stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows}
+     return [out_table], stats
+ ```
126
+ The single input to this method is the in-memory pyarrow table to be transformed.
127
+ The return of this function is a list of tables and optional metadata. In this
128
+ case of a simple 1:1 table conversion, the list will contain a single table, the result of removing
129
+ duplicates from the input table.
130
+
131
+ The metadata is a free-form dictionary of keys with numeric values that will be aggregated
132
+ by the framework and reported as aggregated job statistics metadata.
133
+ If there is no metadata then simply return an empty dictionary.
134
+
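+ As a simplified illustration of that aggregation (not the framework's actual code), summing the numeric values
+ of the per-table dictionaries yields the job-level totals:
+
+ ```python
+ from collections import Counter
+
+ # Hypothetical per-table statistics returned by two transform() calls
+ per_table_stats = [
+     {"source_documents": 100, "result_documents": 95},
+     {"source_documents": 200, "result_documents": 180},
+ ]
+ totals = Counter()
+ for stats in per_table_stats:
+     totals.update(stats)
+ print(dict(totals))  # {'source_documents': 300, 'result_documents': 275}
+ ```
+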
135
+ ## EdedupRuntime
136
+
137
+ First, let's define the transform runtime class. To do this we extend
138
+ the base abstract/interface class
139
+ [DefaultTableTransformRuntime](../src/data_processing/ray/transform_runtime.py),
140
+ which requires definition of the following:
141
+ * an initializer (i.e. `__init__()`) that accepts a dictionary of configuration
142
+ data. For this example, the configuration data will only be defined by
143
+ command line arguments (defined below).
144
+ * the `get_transform_config()` method that takes `data_access_factory`, `statistics actor`, and
145
+ `list of files to process` and produces a dictionary of parameters used by transform.
146
+ * the `compute_execution_stats()` method that takes a dictionary of metadata, enhances it and
147
+ produces an enhanced metadata dictionary.
148
+
149
+ We start with the simple definition of the class and its initializer
150
+
151
+ ```python
+ class EdedupRuntime(DefaultTableTransformRuntime):
+
+     def __init__(self, params: dict[str, Any]):
+         super().__init__(params)
+         self.filters = []
+ ```
158
+ Next we define the `get_transform_config()` method, which, in this case, creates supporting Ray Actors and
159
+ adds their handles to the transform parameters.
160
+
161
+ ```python
+ def get_transform_config(
+     self, data_access_factory: DataAccessFactory, statistics: ActorHandle, files: list[str]
+ ) -> dict[str, Any]:
+     self.filters = RayUtils.create_actors(
+         clazz=HashFilter,
+         params={},
+         actor_options={"num_cpus": self.params.get("hash_cpu", 0.5)},
+         n_actors=self.params.get("num_hashes", 1),
+     )
+     return {"hashes": self.filters} | self.params
+ ```
173
+ The input to this method includes a set of parameters that might not all be needed by this transform;
174
+ it is, rather, a superset of the parameters that can be used by the different implementations of the transform runtime
175
+ (see, for example, [fuzzy dedup](../../transforms/universal/fdedup)).
176
+ The return of this method is a dictionary of information used for transform initialization. In this
177
+ implementation we add additional parameters to the input dictionary, but in general it can be a completely
178
+ new dictionary built here.
179
+
180
+ Finally, we define the `compute_execution_stats()` method, which enhances the metadata collected by the statistics
181
+ class.
182
+
183
+ ```python
+ def compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]:
+     # Get filters stats
+     sum_hash = 0
+     sum_hash_mem = 0
+     remote_replies = [f.get_hash_size.remote() for f in self.filters]
+     while remote_replies:
+         # Wait for replies
+         ready, not_ready = ray.wait(remote_replies)
+         for r in ready:
+             h_size, h_memory = ray.get(r)
+             sum_hash = sum_hash + h_size
+             sum_hash_mem = sum_hash_mem + h_memory
+         remote_replies = not_ready
+     dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1))
+     return {"number of hashes": sum_hash, "hash memory, GB": sum_hash_mem, "de duplication %": dedup_prst} | stats
+ ```
200
+ The input to this method is a dictionary of metadata collected by the statistics object. The method enhances it with information
201
+ collected from the hash actors and with custom computations based on the statistics data.
202
+
203
+ ## EdedupTableTransformConfiguration
204
+
205
+ The final class we need to implement is the `EdedupTableTransformConfiguration` class, whose initializer
206
+ defines the following:
207
+
208
+ * The short name for the transform
209
+ * The class implementing the transform - in our case EdedupTransform
210
+ * The transform runtime class to be used - in our case EdedupRuntime
211
+ * Command line argument support.
212
+
213
+ First we define the class and its initializer,
214
+
215
+ ```python
+ short_name = "ededup"
+ cli_prefix = f"{short_name}_"
+
+ class EdedupTableTransformConfiguration(DefaultTableTransformConfiguration):
+     def __init__(self):
+         super().__init__(name=short_name, runtime_class=EdedupRuntime, transform_class=EdedupTransform)
+         self.params = {}
+ ```
224
+
225
+ The initializer extends the DefaultTableTransformConfiguration, which provides simple
226
+ capture of our configuration data and allows it to be pickled and sent across the network.
227
+ It also adds a `params` field that will be used below to hold the transform's
228
+ configuration data (used in `EdedupRuntime.__init__()` above).
229
+
230
+ Next, we provide two methods that define and capture the command line configuration that
231
+ is specific to the `EdedupTransform`, in this case the number of hash actors, the CPUs per hash actor and the document column name.
232
+ First we define the method that establishes the command line arguments.
233
+ This method is given a global argument parser to which the `EdedupTransform` arguments are added.
234
+ It is good practice to include a common prefix on all transform-specific options (i.e. pii, lang, etc).
235
+ In our case we will use `ededup_`.
236
+
237
+ ```python
+ def add_input_params(self, parser: ArgumentParser) -> None:
+     parser.add_argument(f"--{cli_prefix}hash_cpu", type=float, default=0.5, help="number of CPUs per hash")
+     parser.add_argument(f"--{cli_prefix}num_hashes", type=int, default=0, help="number of hash actors to use")
+     parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="key for accessing data")
+ ```
243
+ Next we implement a method that is called after the framework has parsed the CLI args
244
+ and which allows us to capture the `EdedupTransform`-specific arguments and optionally validate them.
245
+
246
+ ```python
+ def apply_input_params(self, args: Namespace) -> bool:
+     captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+     self.params = self.params | captured
+     if self.params["num_hashes"] <= 0:
+         logger.info(f"Number of hashes should be greater than zero, provided {args.num_hashes}")
+         return False
+     logger.info(f"exact dedup params are {self.params}")
+     return True
+ ```
256
+
257
+ ## main()
258
+
259
+ Next, we show how to launch the framework with the `EdedupTransform` using the
260
+ framework's `TransformLauncher` class.
261
+
262
+ ```python
+ if __name__ == "__main__":
+     launcher = TransformLauncher(transform_runtime_config=EdedupTableTransformConfiguration())
+     launcher.launch()
+ ```
267
+ The launcher requires only an instance of DefaultTableTransformConfiguration
268
+ (our `EdedupTableTransformConfiguration` class).
269
+ A single method `launch()` is then invoked to run the transform in a Ray cluster.
270
+
271
+ ## Running
272
+
273
+ Assuming the above `main()` is placed in `ededup_transform.py` we can run the transform on data
274
+ in COS as follows:
275
+
276
+ ```shell
277
+ python ededup_transform.py --hash_cpu 0.5 --num_hashes 2 --doc_column "contents" \
278
+ --run_locally True \
279
+ --s3_cred "{'access_key': 'KEY', 'secret_key': 'SECRET', 'cos_url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud'}" \
280
+ --s3_config "{'input_folder': 'cos-optimal-llm-pile/test/david/input/', 'output_folder': 'cos-optimal-llm-pile/test/david/output/'}"
281
+ ```
282
+ This is a minimal set of options to run locally.
283
+ See the [launcher options](launcher-options.md) for a complete list of
284
+ transform-independent command line options.
@@ -0,0 +1,104 @@
1
+ # Data Processing Architecture
2
+
3
+ In this section we cover the high-level architecture and some of the core components.
4
+
5
+ Transform implementation and examples are provided in the [tutorial](transform-tutorials.md).
6
+
7
+ ## Architecture
8
+
9
+ The architecture is a "standard" implementation of [Embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel) to
10
+ process many input files in parallel using a distributed network of RayWorkers.
11
+
12
+ ![Processing Architecture](processing-architecture.jpg)
13
+
14
+ The architecture includes the following core components:
15
+
16
+ * [RayLauncher](../src/data_processing/ray/transform_launcher.py) accepts and validates
17
+ CLI parameters to establish the Ray Orchestrator with the proper configuration.
18
+ It uses the following components, all of which can/do define CLI configuration parameters:
19
+ * [Transform Orchestrator Configuration](../src/data_processing/ray/transform_orchestrator_configuration.py) is responsible
20
+ for defining and validating infrastructure parameters
21
+ (e.g., number of workers, memory and cpu, local or remote cluster, etc.). This class has very simple state
22
+ (several dictionaries) and is fully pickleable. As a result, the framework uses its instance as a
23
+ parameter in remote function/actor invocations.
24
+ * [DataAccessFactory](../src/data_processing/data_access/data_access_factory.py) - provides the
25
+ configuration for the type of DataAccess to use when reading/writing the input/output data for
26
+ the transforms. Similar to Transform Orchestrator Configuration, this is a pickleable
27
+ instance that is passed between Launcher, Orchestrator and Workers.
28
+ * [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - defines specifics
29
+ of the transform implementation including transform implementation class, its short name, any transform-
30
+ specific CLI parameters, and an optional TransformRuntime class, discussed below.
31
+
32
+ After all parameters are validated, the Ray cluster is started and the DataAccessFactory, TransformOrchestratorConfiguration
33
+ and TransformConfiguration are given to the Ray Orchestrator via a Ray remote() method invocation.
34
+ The Launcher waits for the Ray Orchestrator to complete.
35
+ * [Ray Orchestrator](../src/data_processing/ray/transform_orchestrator.py) is responsible for overall management of
36
+ the data processing job. It creates the actors, determines the set of input data and distributes the
37
+ references to the data files to be processed by the workers. More specifically, it performs the following:
38
+ 1. Uses the DataAccess instance created by the DataAccessFactory to determine the set of the files
39
+ to be processed.
40
+ 2. Uses the TransformConfiguration to create the TransformRuntime instance.
41
+ 3. Uses the TransformRuntime to optionally apply additional configuration (ray object storage, etc) for the configuration
42
+ and operation of the Transform.
43
+ 4. Uses the TransformOrchestratorConfiguration to determine the set of RayWorkers to create
44
+ to execute transformers in parallel, providing the following to each worker:
45
+ * Ray worker configuration
46
+ * DataAccessFactory
47
+ * Transform class and its TransformConfiguration containing the CLI parameters and any TransformRuntime additions.
48
+ 5. In a load-balanced, round-robin fashion, distributes the names of the input files to the workers for them to transform/process.
49
+
50
+ Additionally, to provide monitoring of long-running transforms, the orchestrator is instrumented with
51
+ [custom metrics](https://docs.ray.io/en/latest/ray-observability/user-guides/add-app-metrics.html), which are exported to localhost:8080 (this is the endpoint that
52
+ Prometheus would be configured to scrape).
53
+ Once all data is processed, the orchestrator will collect execution statistics (from the statistics actor)
54
+ and build and save them in the form of execution metadata (`metadata.json`). Finally, it will return the execution
55
+ result to the Launcher.
56
+ * [Ray worker](../src/data_processing/ray/transform_table_processor.py) is responsible for
57
+ reading files (as [PyArrow Tables](https://levelup.gitconnected.com/deep-dive-into-pyarrow-understanding-its-features-and-benefits-2cce8b1466c8))
58
+ assigned by the orchestrator, applying the transform to the input table and writing out the
59
+ resulting table(s). Metadata produced by each table transformation is aggregated into
60
+ Transform Statistics (below).
61
+ * [Transform Statistics](../src/data_processing/ray/transform_statistics.py) is a general
62
+ purpose data collector actor aggregating the numeric metadata from different places of
63
+ the framework (especially metadata produced by the transform).
64
+ These statistics are reported as metadata (`metadata.json`) by the orchestrator upon completion.
65
+
66
+ ## Core Components
67
+ Some of the core components used by the architecture are defined here:
68
+
69
+ * [CLIProvider](../src/data_processing/utils/cli_utils.py) - provides a general purpose
70
+ mechanism for defining, validating and sharing CLI parameters.
71
+ It is used by the DataAccessFactory and Transform Configuration (below).
72
+ * Data Access is an abstraction layer for the different types of data access supported by the framework. The main components
73
+ of this layer are:
74
+ * [Data Access](../src/data_processing/data_access/data_access.py) is the basic interface for the data access, and enables the identification of
75
+ input files to process, associated output files, checkpointing and general file reading/writing.
76
+ Currently, the framework implements several concrete implementations of the Data Access, including
77
+ [local data support](../src/data_processing/data_access/data_access_local.py) and
78
+ [s3](../src/data_processing/data_access/data_access_s3.py). Additional Data Access implementations can be added as required.
79
+ * [Data Access Factory](../src/data_processing/data_access/data_access_factory.py) is an implementation of the
80
+ [factory design pattern](https://www.pentalog.com/blog/design-patterns/factory-method-design-pattern/) for creation
81
+ of the data access instances. Data Access Factory, as a CLIProvider, enables the definition of CLI
82
+ parameters that configure the instance of Data Access to be created. Data Access factory has very simple state
83
+ (several dictionaries) and is fully pickleable. The framework uses the Data Access Factory instance as a
84
+ parameter in remote function/actor invocations.
85
+
86
+
87
+ ## Transforms
88
+ A brief discussion of the Transform components is provided here.
89
+ For a more complete discussion, see the [tutorials](transform-tutorials.md).
90
+
91
+ * [Transform](../src/data_processing/transform/table_transform.py) - defines the methods required
92
+ of any transform implementation - `transform()` and `flush()` - and provides the bulk of any transform implementation
93
+ that converts one Table to 0 or more new Tables. In general, this is not tied to the above Ray infrastructure
94
+ and so can usually be used independently of Ray; a minimal sketch is shown after this list.
95
+ * [TransformRuntime](../src/data_processing/ray/transform_runtime.py) - this class only needs to be
96
+ extended/implemented when additional Ray components (actors, shared memory objects, etc.) are used
97
+ by the transform. The main method `get_transform_config()` is used to enable these extensions.
98
+ * [TransformConfiguration](../src/data_processing/ray/transform_runtime.py) - this is the bootstrap
99
+ class provided to the Launcher that enables the instantiation of the Transform and the TransformRuntime within
100
+ the architecture. It is a CLIProvider, which allows it to define transform-specific CLI configuration
101
+ that is made available to the Transform's initializer.
102
+
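+ As a minimal sketch of the Transform interface described above (assuming the `AbstractTableTransform`
+ signatures shown in the tutorials), a transform that simply passes tables through unchanged looks roughly like this:
+
+ ```python
+ from typing import Any
+
+ import pyarrow as pa
+ from data_processing.transform import AbstractTableTransform
+
+
+ class PassThroughTransform(AbstractTableTransform):
+     """Sketch only: returns the input table unchanged and reports a simple row count."""
+
+     def __init__(self, config: dict):
+         super().__init__(config)
+
+     def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+         # 1:1 conversion: one input table produces one output table plus numeric metadata.
+         return [table], {"rows": table.num_rows}
+ ```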
103
+
104
+
@@ -0,0 +1,84 @@
1
+ # Launcher Command Line Options
2
+ A number of command line options are available when launching a transform.
3
+
4
+ The following is a current --help output (a work in progress) for
5
+ the `NOOPTransform` (note the --noop_sleep_sec option):
6
+
7
+ ```
8
+ usage: noop_transform.py [-h]
9
+ [--run_locally RUN_LOCALLY]
10
+ [--noop_sleep_sec NOOP_SLEEP_SEC]
11
+ [--data_s3_cred DATA_S3_CRED]
12
+ [--data_s3_config DATA_S3_CONFIG]
13
+ [--data_local_config DATA_LOCAL_CONFIG]
14
+ [--data_max_files DATA_MAX_FILES]
15
+ [--data_checkpointing DATA_CHECKPOINTING]
16
+ [--data_data_sets DATA_DATA_SETS]
17
+ [--data_max_files MAX_FILES]
18
+ [--data_files_to_use DATA_FILES_TO_USE]
19
+ [--data_num_samples DATA_NUM_SAMPLES]
20
+ [--runtime_num_workers NUM_WORKERS]
21
+ [--runtime_worker_options WORKER_OPTIONS]
22
+ [--runtime_pipeline_id PIPELINE_ID] [--job_id JOB_ID]
23
+ [--runtime_creation_delay CREATION_DELAY]
24
+ [--runtime_code_location CODE_LOCATION]
25
+
26
+ Driver for NOOP processing
27
+
28
+ options:
29
+ -h, --help show this help message and exit
30
+ --run_locally RUN_LOCALLY
31
+ running ray local flag
32
+ --noop_sleep_sec NOOP_SLEEP_SEC
33
+ Sleep actor for a number of seconds while processing the data frame, before writing the file to COS
34
+ --data_s3_cred S3_CRED
35
+ AST string of options for cos credentials. Only required for COS or Lakehouse.
36
+ access_key: access key help text
37
+ secret_key: secret key help text
38
+ cos_url: COS url
39
+ Example: { 'access_key': 'access', 'secret_key': 'secret', 's3_url': 'https://s3.us-east.cloud-object-storage.appdomain.cloud' }
40
+ --data_s3_config S3_CONFIG
41
+ AST string containing input/output paths.
42
+ input_folder: Path to input folder of files to be processed
43
+ output_folder: Path to output folder of processed files
44
+ Example: { 'input_folder': 'your input folder', 'output_folder ': 'your output folder' }
45
+ --data_local_config LOCAL_CONFIG
46
+ ast string containing input/output folders using local fs.
47
+ input_folder: Path to input folder of files to be processed
48
+ output_folder: Path to output folder of processed files
49
+ Example: { 'input_folder': './input', 'output_folder': '/tmp/output' }
50
+ --data_max_files MAX_FILES
51
+ Max amount of files to process
52
+ --data_checkpointing CHECKPOINTING
53
+ checkpointing flag
54
+ --data_data_sets DATA_SETS
55
+ List of data sets
56
+ --data_files_to_use DATA_FILES_TO_USE
57
+ files extensions to use, default .parquet
58
+ --data_num_samples DATA_NUM_SAMPLES
59
+ number of randomly picked files to use
60
+ --runtime_num_workers NUM_WORKERS
61
+ number of workers
62
+ --runtime_worker_options WORKER_OPTIONS
63
+ AST string defining worker resource requirements.
64
+ num_cpus: Required number of CPUs.
65
+ num_gpus: Required number of GPUs
66
+ resources: The complete list can be found at
67
+ https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html#ray.remote_function.RemoteFunction.options
68
+ and contains accelerator_type, memory, name, num_cpus, num_gpus, object_store_memory, placement_group,
69
+ placement_group_bundle_index, placement_group_capture_child_tasks, resources, runtime_env,
70
+ scheduling_strategy, _metadata, concurrency_groups, lifetime, max_concurrency, max_restarts,
71
+ max_task_retries, max_pending_calls, namespace, get_if_exists
72
+ Example: { 'num_cpus': '8', 'num_gpus': '1', 'resources': '{"special_hardware": 1, "custom_label": 1}' }
73
+ --runtime_pipeline_id PIPELINE_ID
74
+ pipeline id
75
+ --runtime_job_id JOB_ID job id
76
+ --runtime_creation_delay CREATION_DELAY
77
+ delay between actors' creation
78
+ --runtime_code_location CODE_LOCATION
79
+ AST string containing code location
80
+ github: Github repository URL.
81
+ commit_hash: github commit hash
82
+ path: Path within the repository
83
+ Example: { 'github': 'https://github.com/somerepo', 'commit_hash': '13241231asdfaed', 'path': 'transforms/universal/ededup' }
84
+ ```
Binary file