data-prep-toolkit 0.1.1__tar.gz → 0.2.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/Makefile +15 -22
  2. {data_prep_toolkit-0.1.1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.0.dev2}/PKG-INFO +1 -4
  3. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/pyproject.toml +1 -5
  4. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -4
  5. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt +6 -19
  6. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/requires.txt +0 -3
  7. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_local.py +8 -0
  8. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_s3.py +36 -7
  9. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_file_processor.py +2 -4
  10. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_file_processor.py +13 -5
  11. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/transform_test.py +12 -6
  12. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/__init__.py +0 -1
  13. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/noop_transform.py +2 -19
  14. data_prep_toolkit-0.2.0.dev2/src/data_processing/transform/abstract_transform.py +16 -0
  15. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/binary_transform.py +7 -2
  16. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/table_transform.py +14 -12
  17. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_configuration.py +3 -2
  18. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +1 -1
  19. data_prep_toolkit-0.1.1/.gitignore +0 -35
  20. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/__init__.py +0 -8
  21. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/execution_configuration.py +0 -107
  22. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/ray_utils.py +0 -180
  23. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/runtime_configuration.py +0 -38
  24. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_file_processor.py +0 -46
  25. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_launcher.py +0 -124
  26. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_orchestrator.py +0 -143
  27. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_runtime.py +0 -53
  28. data_prep_toolkit-0.1.1/src/data_processing/runtime/ray/transform_statistics.py +0 -66
  29. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/launcher_test.py +0 -189
  30. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/multi_launcher_test.py +0 -80
  31. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/ray_util_test.py +0 -105
  32. data_prep_toolkit-0.1.1/test/data_processing_tests/launch/ray/test_noop_launch.py +0 -41
  33. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/README.md +0 -0
  34. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/setup.cfg +0 -0
  35. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  36. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  37. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/__init__.py +0 -0
  38. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/__init__.py +0 -0
  39. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/arrow_s3.py +0 -0
  40. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access.py +0 -0
  41. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory.py +0 -0
  42. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  43. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/__init__.py +0 -0
  44. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/execution_configuration.py +0 -0
  45. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/__init__.py +0 -0
  46. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/runtime_configuration.py +0 -0
  47. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_launcher.py +0 -0
  48. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -0
  49. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/runtime_configuration.py +0 -0
  50. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/runtime/transform_launcher.py +0 -0
  51. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/__init__.py +0 -0
  52. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/abstract_test.py +0 -0
  53. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/__init__.py +0 -0
  54. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  55. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/launch/__init__.py +0 -0
  56. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/test_support/transform/transform_test.py +0 -0
  57. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/__init__.py +0 -0
  58. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/transform/transform_statistics.py +0 -0
  59. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/__init__.py +0 -0
  60. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/cli_utils.py +0 -0
  61. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/config.py +0 -0
  62. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/log.py +0 -0
  63. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/params_utils.py +0 -0
  64. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/src/data_processing/utils/transform_utils.py +0 -0
  65. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  66. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  67. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  68. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  69. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  70. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  71. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/transform/test_noop.py +0 -0
  72. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  73. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  74. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  75. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  76. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  77. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input/sample1.parquet +0 -0
  78. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  79. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  80. {data_prep_toolkit-0.1.1 → data_prep_toolkit-0.2.0.dev2}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  81. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/metadata.json +0 -0
  82. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/sample1.parquet +0 -0
  83. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/expected/subdir/test1.parquet +0 -0
  84. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/sample1.parquet +0 -0
  85. {data_prep_toolkit-0.1.1/test-data/data_processing/ray → data_prep_toolkit-0.2.0.dev2/test-data/data_processing/python}/noop/input/subdir/test1.parquet +0 -0
Makefile

@@ -13,27 +13,19 @@ clean::
 .check-env::
 	@echo "Checks passed"
 
-update-toml:: .check-env
-	@# Help: Copy the Makefile distribution version into the pyproject.toml
-	sed -e 's/^version[ ]*=.*/version = "'${DPK_LIB_VERSION}'"/' pyproject.toml > tt.toml
-	mv tt.toml pyproject.toml
-
 setup::
 
-build:: update-toml venv
-	@# Help: Build the distribution for publishing to a pypi
-	rm -r dist || true
-	rm -rf src/*egg-info || true
-	${PIP} install --upgrade build
-	${PYTHON} -m build
+set-versions:: .check-env
+	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
+
+build:: build-dist
 
-publish:: .check-env update-toml
-	@# Help: Publish project to pypi
-	${PYTHON} -m twine check dist/*
-	${PYTHON} -m twine upload --verbose --non-interactive dist/*
-	#@echo "create a git tag to reference published version"
-	#@git tag ${TAG}
-	#@git push origin ${TAG}
+#build:: update-toml .defaults.build-dist
+build-dist :: set-versions .defaults.build-dist
+
+publish:: publish-dist
+
+publish-dist :: .check-env .defaults.publish-dist
 
 venv:: pyproject.toml
 	@# Help: Create the virtual environment using pyproject.toml

@@ -46,10 +38,14 @@ venv:: pyproject.toml
 	pip install -e .; \
 	pip install pytest pytest-cov moto==5.0.5 markupsafe==2.0.1
 
+image::
+	@# Help: Placeholder does nothing for now.
+	@echo "Image building for ray is in the works (comming soon)."
 
 # Here we run each test directory of tests and each ray launched test separately, because
 # it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
 # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
+# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) .
 test::
 	@# Help: Use the already-built virtual environment to run pytest on the test directory.
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/data_access;

@@ -57,8 +53,5 @@ test::
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/launcher_test.py;
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/multi_launcher_test.py;
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/pure_python/test_noop_launch.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/ray_util_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/multi_launcher_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/launcher_test.py;
-	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) data_processing_tests/launch/ray/test_noop_launch.py;
+
 
PKG-INFO

@@ -1,18 +1,15 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.1.1
+Version: 0.2.0.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: ray[default]==2.9.3
 Requires-Dist: pyarrow==15.0.2
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
-Requires-Dist: fastapi>=0.109.1
-Requires-Dist: pillow>=10.2.0
 Provides-Extra: dev
 Requires-Dist: twine; extra == "dev"
 Requires-Dist: pytest>=7.3.2; extra == "dev"
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.1.1"
+version = "0.2.0.dev2"
 requires-python = ">=3.10"
 description = "Data Preparation Toolkit Library"
 license = {text = "Apache-2.0"}

@@ -10,14 +10,10 @@ authors = [
     { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
 ]
 dependencies = [
-    "ray[default]==2.9.3",
     "pyarrow==15.0.2",
     "boto3==1.34.69",
     "argparse",
     "mmh3",
-    # These two are to fix security issue
-    "fastapi>=0.109.1",
-    "pillow>=10.2.0",
 ]
 
 [build-system]
src/data_prep_toolkit.egg-info/PKG-INFO

@@ -1,18 +1,15 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.1.1
+Version: 0.2.0.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: ray[default]==2.9.3
 Requires-Dist: pyarrow==15.0.2
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
-Requires-Dist: fastapi>=0.109.1
-Requires-Dist: pillow>=10.2.0
 Provides-Extra: dev
 Requires-Dist: twine; extra == "dev"
 Requires-Dist: pytest>=7.3.2; extra == "dev"
src/data_prep_toolkit.egg-info/SOURCES.txt

@@ -1,4 +1,3 @@
-.gitignore
 Makefile
 README.md
 pyproject.toml

@@ -25,15 +24,6 @@ src/data_processing/runtime/pure_python/runtime_configuration.py
 src/data_processing/runtime/pure_python/transform_file_processor.py
 src/data_processing/runtime/pure_python/transform_launcher.py
 src/data_processing/runtime/pure_python/transform_orchestrator.py
-src/data_processing/runtime/ray/__init__.py
-src/data_processing/runtime/ray/execution_configuration.py
-src/data_processing/runtime/ray/ray_utils.py
-src/data_processing/runtime/ray/runtime_configuration.py
-src/data_processing/runtime/ray/transform_file_processor.py
-src/data_processing/runtime/ray/transform_launcher.py
-src/data_processing/runtime/ray/transform_orchestrator.py
-src/data_processing/runtime/ray/transform_runtime.py
-src/data_processing/runtime/ray/transform_statistics.py
 src/data_processing/test_support/__init__.py
 src/data_processing/test_support/abstract_test.py
 src/data_processing/test_support/data_access/__init__.py

@@ -44,6 +34,7 @@ src/data_processing/test_support/transform/__init__.py
 src/data_processing/test_support/transform/noop_transform.py
 src/data_processing/test_support/transform/transform_test.py
 src/data_processing/transform/__init__.py
+src/data_processing/transform/abstract_transform.py
 src/data_processing/transform/binary_transform.py
 src/data_processing/transform/table_transform.py
 src/data_processing/transform/transform_configuration.py

@@ -62,11 +53,11 @@ test-data/data_processing/input/sample1.parquet
 test-data/data_processing/input_multiple/sample1.parquet
 test-data/data_processing/input_multiple/sample2.parquet
 test-data/data_processing/input_multiple/sample3.parquet
-test-data/data_processing/ray/noop/expected/metadata.json
-test-data/data_processing/ray/noop/expected/sample1.parquet
-test-data/data_processing/ray/noop/expected/subdir/test1.parquet
-test-data/data_processing/ray/noop/input/sample1.parquet
-test-data/data_processing/ray/noop/input/subdir/test1.parquet
+test-data/data_processing/python/noop/expected/metadata.json
+test-data/data_processing/python/noop/expected/sample1.parquet
+test-data/data_processing/python/noop/expected/subdir/test1.parquet
+test-data/data_processing/python/noop/input/sample1.parquet
+test-data/data_processing/python/noop/input/subdir/test1.parquet
 test/data_processing_tests/data_access/daf_local_test.py
 test/data_processing_tests/data_access/data_access_local_test.py
 test/data_processing_tests/data_access/data_access_s3_test.py

@@ -74,9 +65,5 @@ test/data_processing_tests/data_access/sample_input_data_test.py
 test/data_processing_tests/launch/pure_python/launcher_test.py
 test/data_processing_tests/launch/pure_python/multi_launcher_test.py
 test/data_processing_tests/launch/pure_python/test_noop_launch.py
-test/data_processing_tests/launch/ray/launcher_test.py
-test/data_processing_tests/launch/ray/multi_launcher_test.py
-test/data_processing_tests/launch/ray/ray_util_test.py
-test/data_processing_tests/launch/ray/test_noop_launch.py
 test/data_processing_tests/transform/test_noop.py
 test/data_processing_tests/util/transform_utils_test.py
src/data_prep_toolkit.egg-info/requires.txt

@@ -1,10 +1,7 @@
-ray[default]==2.9.3
 pyarrow==15.0.2
 boto3==1.34.69
 argparse
 mmh3
-fastapi>=0.109.1
-pillow>=10.2.0
 
 [dev]
 twine
src/data_processing/data_access/data_access_local.py

@@ -55,6 +55,14 @@ class DataAccessLocal(DataAccess):
         self.n_samples = n_samples
         self.files_to_use = files_to_use
 
+        logger.debug(f"Local input folder: {self.input_folder}")
+        logger.debug(f"Local output folder: {self.output_folder}")
+        logger.debug(f"Local data sets: {self.d_sets}")
+        logger.debug(f"Local checkpoint: {self.checkpoint}")
+        logger.debug(f"Local m_files: {self.m_files}")
+        logger.debug(f"Local n_samples: {self.n_samples}")
+        logger.debug(f"Local files_to_use: {self.files_to_use}")
+
     def get_num_samples(self) -> int:
         """
         Get number of samples for input
src/data_processing/data_access/data_access_s3.py

@@ -48,15 +48,18 @@ class DataAccessS3(DataAccess):
         :param n_samples: amount of files to randomly sample
         :param files_to_use: files extensions of files to include
         """
-        self.arrS3 = ArrowS3(
-            access_key=s3_credentials.get("access_key", ""),
-            secret_key=s3_credentials.get("secret_key", ""),
-            endpoint=s3_credentials.get("url", None),
-            region=s3_credentials.get("region", None),
-        )
+        self.s3_credentials = {} | s3_credentials
+        access_key = self.get_access_key()
+        if access_key is None:
+            raise ValueError("S3 access key not provided")
+        secret_key = self.get_secret_key()
+        if secret_key is None:
+            raise ValueError("S3 secret key not provided")
+        endpoint = self.get_endpoint()
+        region = self.get_region()
         if s3_config is None:
             self.input_folder = None
-            self.input_folder = None
+            self.output_folder = None
         else:
             self.input_folder = TransformUtils.clean_path(s3_config["input_folder"])
             self.output_folder = TransformUtils.clean_path(s3_config["output_folder"])

@@ -66,6 +69,32 @@
         self.n_samples = n_samples
         self.files_to_use = files_to_use
 
+        logger.debug(f"S3 access key provided: {access_key}")
+        logger.debug(f"S3 secret key provided: no soup for you!")
+        logger.debug(f"S3 region {region}")
+        logger.debug(f"S3 endpoint/url: {endpoint}")
+        logger.debug(f"S input folder: {self.input_folder}")
+        logger.debug(f"S3 output folder: {self.output_folder}")
+        logger.debug(f"S3 data sets: {self.d_sets}")
+        logger.debug(f"S3 checkpoint: {self.checkpoint}")
+        logger.debug(f"S3 m_files: {self.m_files}")
+        logger.debug(f"S3 n_samples: {self.n_samples}")
+        logger.debug(f"S3 files_to_use: {self.files_to_use}")
+
+        self.arrS3 = ArrowS3(access_key, secret_key, endpoint=endpoint, region=region)
+
+    def get_access_key(self):
+        return self.s3_credentials.get("access_key", None)
+
+    def get_secret_key(self):
+        return self.s3_credentials.get("secret_key", None)
+
+    def get_endpoint(self):
+        return self.s3_credentials.get("url", None)
+
+    def get_region(self):
+        return self.s3_credentials.get("region", None)
+
     def get_num_samples(self) -> int:
         """
         Get number of samples for input
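For orientation, here is a minimal sketch of how the reworked DataAccessS3 constructor behaves, assuming DataAccessS3 is importable from data_processing.data_access as in 0.1.1; all credential and bucket values below are illustrative:

    from data_processing.data_access import DataAccessS3

    # Illustrative values only. In 0.2.0.dev2 a missing access or secret
    # key raises ValueError instead of silently defaulting to "".
    s3_credentials = {
        "access_key": "my-access-key",    # required
        "secret_key": "my-secret-key",    # required
        "url": "https://s3.example.com",  # optional endpoint
        "region": "us-east-1",            # optional
    }
    s3_config = {
        "input_folder": "my-bucket/input",
        "output_folder": "my-bucket/output",
    }

    data_access = DataAccessS3(s3_credentials=s3_credentials, s3_config=s3_config)
    print(data_access.get_endpoint())  # accessor added in this release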
src/data_processing/runtime/pure_python/transform_file_processor.py

@@ -13,11 +13,9 @@
 from typing import Any
 
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.runtime import (
-    AbstractTransformFileProcessor,
-)
-from data_processing.transform import TransformStatistics
+from data_processing.runtime import AbstractTransformFileProcessor
 from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
+from data_processing.transform import TransformStatistics
 
 
 class PythonTransformFileProcessor(AbstractTransformFileProcessor):
src/data_processing/runtime/transform_file_processor.py

@@ -58,7 +58,7 @@ class AbstractTransformFileProcessor:
             name_extension = TransformUtils.get_file_extension(f_name)
             self.logger.debug(f"Begin transforming file {f_name}")
             out_files, stats = self.transform.transform_binary(byte_array=filedata, ext=name_extension[1])
-            self.logger.debug(f"Done transforming file {f_name}")
+            self.logger.debug(f"Done transforming file {f_name}, got {len(out_files)} files")
             self.last_file_name = name_extension[0]
             self.last_file_name_next_index = None
             self.last_extension = name_extension[1]

@@ -83,7 +83,9 @@
         try:
             t_start = time.time()
             # get flush results
-            self.logger.debug(f"Begin flushing transform")
+            self.logger.debug(
+                f"Begin flushing transform, last file name {self.last_file_name}, last index {self.last_file_name_next_index}"
+            )
             out_files, stats = self.transform.flush_binary()
             self.logger.debug(f"Done flushing transform, got {len(out_files)} files")
             # Here we are using the name of the last file, that we were processing

@@ -113,9 +115,12 @@
             case 1:
                 # we have exactly 1 output file
                 file_ext = out_files[0]
-                output_name = self.data_access.get_output_location(path=f"{self.last_file_name}{file_ext[1]}")
+                lfn = self.last_file_name
+                if self.last_file_name_next_index is not None:
+                    lfn = f"{lfn}_{self.last_file_name_next_index}"
+                output_name = self.data_access.get_output_location(path=f"{lfn}{file_ext[1]}")
                 self.logger.debug(
-                    f"Writing transformed file {self.last_file_name}{self.last_extension} " f"to {output_name}"
+                    f"Writing transformed file {self.last_file_name}{self.last_extension} to {output_name}"
                 )
                 save_res = self.data_access.save_file(path=output_name, data=file_ext[0])
                 if save_res is not None:

@@ -130,7 +135,10 @@
                 else:
                     self.logger.warning(f"Failed to write file {output_name}")
                     self._publish_stats({"failed_writes": 1})
-                self.last_file_name_next_index = 0
+                if self.last_file_name_next_index is None:
+                    self.last_file_name_next_index = 0
+                else:
+                    self.last_file_name_next_index += 1
             case _:
                 # we have more then 1 file
                 file_sizes = 0
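The last_file_name_next_index changes above alter how single-output files are named when a transform later emits more data for the same base name. A standalone sketch of the resulting naming rule (not the class itself), derived directly from the lfn logic in the diff above:

    # First write for a base name keeps it unchanged; each subsequent
    # write appends an incrementing _<index> suffix so repeated outputs
    # for the same base name do not collide.
    def output_name(base: str, ext: str, next_index: int | None) -> str:
        if next_index is not None:
            base = f"{base}_{next_index}"
        return f"{base}{ext}"

    assert output_name("sample1", ".parquet", None) == "sample1.parquet"
    assert output_name("sample1", ".parquet", 0) == "sample1_0.parquet"
    assert output_name("sample1", ".parquet", 1) == "sample1_1.parquet"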
src/data_processing/test_support/launch/transform_test.py

@@ -14,7 +14,6 @@ import sys
 import tempfile
 from typing import Any
 
-from data_processing.runtime.ray import RayTransformLauncher
 from data_processing.runtime.transform_launcher import AbstractTransformLauncher
 from data_processing.test_support.abstract_test import AbstractTest
 from data_processing.utils import ParamsUtils

@@ -22,7 +21,7 @@ from data_processing.utils import ParamsUtils
 
 class AbstractTransformLauncherTest(AbstractTest):
     """
-    The Ray-based test class for all/most AbstractTransform implementations.
+    The launcher test class for all/most AbstractTransformLauncher implementations.
     Generic tests are provided here, and sub-classes must implement the _get*_fixture() method(s)
     to provide the test data for a given test method. For example, get_test_transform_fixtures()
     provides the test data for the test_transform() test method.

@@ -36,8 +35,8 @@ class AbstractTransformLauncherTest(AbstractTest):
         args = {} | cli_params
         local_ast = {"input_folder": in_table_path, "output_folder": out_table_path}
         args["data_local_config"] = local_ast
-        if isinstance(launcher, RayTransformLauncher):
-            args["run_locally"] = "True"
+        # if isinstance(launcher, RayTransformLauncher):
+        #     args["run_locally"] = "True"
         argv = ParamsUtils.dict_to_req(args)
         return argv

@@ -52,7 +51,7 @@ class AbstractTransformLauncherTest(AbstractTest):
         Test the given transform and its runtime using the given CLI arguments, input directory of data files and expected output directory.
         Data is processed into a temporary output directory which is then compared with the directory of expected output.
         :param launcher: launcher configured to run the transform being tested
-        :param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and the Ray launching args.
+        :param cli_params: a map of the simulated CLI arguments (w/o --). This includes both the transform-specific CLI parameters and the launching args.
         :param in_table_path: a directory containing the input parquet files to be processed and results compared against the expected output table path.
         :param expected_out_table_path: directory contain parquet and metadata.json that is expected to match the processed input directory.
         :return:

@@ -62,7 +61,14 @@ class AbstractTransformLauncherTest(AbstractTest):
             print(f"Using temporary output path {temp_dir}")
             sys.argv = self._get_argv(launcher, cli_params, in_table_path, temp_dir)
             launcher.launch()
-            AbstractTest.validate_directory_contents(temp_dir, expected_out_table_path)
+            self._validate_directory_contents_match(temp_dir, expected_out_table_path)
+
+    def _validate_directory_contents_match(self, dir: str, expected: str):
+        """
+        Confirm that the two directories contains the same files.
+        Stubbed out like this to allow spark tests to override this since spark tends to rename the files.
+        """
+        AbstractTest.validate_directory_contents(dir, expected)
 
     def _install_test_fixtures(self, metafunc):
         # Apply the fixtures for the method with these input names (i.e. test_transform()).
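The new _validate_directory_contents_match hook exists, per its docstring, so that runtimes which rename output files can relax the comparison. A hypothetical override, for illustration only:

    # Hypothetical subclass; a Spark-style engine tends to rename its
    # output files, so such a test could compare contents rather than names.
    class MySparkTransformLauncherTest(AbstractTransformLauncherTest):
        def _validate_directory_contents_match(self, dir: str, expected: str):
            # e.g. compare file counts and row totals instead of file names
            ...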
src/data_processing/test_support/transform/__init__.py

@@ -1,6 +1,5 @@
 from .noop_transform import (
     NOOPTransform,
     NOOPPythonTransformConfiguration,
-    NOOPRayTransformConfiguration,
 )
 from .transform_test import AbstractTransformTest
src/data_processing/test_support/transform/noop_transform.py

@@ -15,13 +15,10 @@ from argparse import ArgumentParser, Namespace
 from typing import Any
 
 import pyarrow as pa
+from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.runtime.pure_python.runtime_configuration import (
     PythonTransformRuntimeConfiguration,
 )
-from data_processing.runtime.ray import RayTransformLauncher
-from data_processing.runtime.ray.runtime_configuration import (
-    RayTransformRuntimeConfiguration,
-)
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import CLIArgumentProvider, get_logger
 

@@ -138,22 +135,8 @@ class NOOPPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
         super().__init__(transform_config=NOOPTransformConfiguration())
 
 
-class NOOPRayTransformConfiguration(RayTransformRuntimeConfiguration):
-    """
-    Implements the RayTransformConfiguration for NOOP as required by the RayTransformLauncher.
-    NOOP does not use a RayRuntime class so the superclass only needs the base
-    python-only configuration.
-    """
-
-    def __init__(self):
-        """
-        Initialization
-        """
-        super().__init__(transform_config=NOOPTransformConfiguration())
-
-
 if __name__ == "__main__":
     # launcher = NOOPRayLauncher()
-    launcher = RayTransformLauncher(NOOPRayTransformConfiguration())
+    launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
     logger.info("Launching noop transform")
     launcher.launch()
src/data_processing/transform/abstract_transform.py (new)

@@ -0,0 +1,16 @@
+from typing import Any, Generic, TypeVar
+
+
+DATA = TypeVar("DATA")
+
+
+class AbstractTransform(Generic[DATA]):
+    def transform(self, data: DATA) -> tuple[list[DATA], dict[str, Any]]:
+        """
+        Converts input table into an output table.
+        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
+        :param table: input table
+        :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
+        propagated to metadata
+        """
+        raise NotImplemented()
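This new AbstractTransform base factors the transform() contract out of AbstractBinaryTransform so that binary and table transforms share one generic interface. A hypothetical subclass, just to show how the DATA parameter is bound:

    from typing import Any

    # Illustrative only: a transform whose payload type is plain str.
    class UpperCaseTransform(AbstractTransform[str]):
        def transform(self, data: str) -> tuple[list[str], dict[str, Any]]:
            # Return the converted payloads plus statistics for the metadata.
            return [data.upper()], {"chars": len(data)}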
src/data_processing/transform/binary_transform.py

@@ -10,10 +10,15 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any
+from typing import Any, TypeVar
 
+from data_processing.transform.abstract_transform import AbstractTransform
 
-class AbstractBinaryTransform:
+
+DATA = TypeVar("DATA")
+
+
+class AbstractBinaryTransform(AbstractTransform[DATA]):
     """
     Converts input binary file to output file(s) (binary)
     Sub-classes must provide the transform() method to provide the conversion of one binary files to 0 or
src/data_processing/transform/table_transform.py

@@ -10,7 +10,7 @@
 # limitations under the License.
 ################################################################################
 
-from typing import Any
+from typing import Any, TypeVar
 
 import pyarrow as pa
 from data_processing.transform import AbstractBinaryTransform

@@ -20,7 +20,7 @@ from data_processing.utils import TransformUtils, get_logger
 logger = get_logger(__name__)
 
 
-class AbstractTableTransform(AbstractBinaryTransform):
+class AbstractTableTransform(AbstractBinaryTransform[pa.Table]):
     """
     Extends AbstractBinaryTransform to expect the byte arrays from to contain a pyarrow Table.
     Sub-classes are expected to implement transform() on the parsed Table instances.

@@ -59,17 +59,19 @@ class AbstractTableTransform(AbstractBinaryTransform):
             # Add number of rows to stats
             stats = stats | {"source_doc_count": table.num_rows}
             # convert tables to files
-            return self._check_and_convert_tables(out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows})
+            return self._check_and_convert_tables(
+                out_tables=out_tables, stats=stats | {"source_doc_count": table.num_rows}
+            )
 
-    def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
-        """
-        Converts input table into an output table.
-        If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
-        :param table: input table
-        :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
-        propagated to metadata
-        """
-        raise NotImplemented()
+    # def transform(self, table: pa.Table) -> tuple[list[pa.Table], dict[str, Any]]:
+    #     """
+    #     Converts input table into an output table.
+    #     If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray.
+    #     :param table: input table
+    #     :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be
+    #     propagated to metadata
+    #     """
+    #     raise NotImplemented()
 
     def flush_binary(self) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
         """
src/data_processing/transform/transform_configuration.py

@@ -14,6 +14,7 @@ from argparse import ArgumentParser
 from typing import Any
 
 from data_processing.transform import AbstractBinaryTransform
+from data_processing.transform.abstract_transform import AbstractTransform
 from data_processing.utils import CLIArgumentProvider
 
 

@@ -22,7 +23,7 @@ class TransformConfiguration(CLIArgumentProvider):
     This is a base transform configuration class defining transform's input/output parameter
     """
 
-    def __init__(self, name: str, transform_class: type[AbstractBinaryTransform], remove_from_metadata: list[str] = []):
+    def __init__(self, name: str, transform_class: type[AbstractTransform], remove_from_metadata: list[str] = []):
         """
         Initialization
         :param name: transformer name

@@ -34,7 +35,7 @@ class TransformConfiguration(CLIArgumentProvider):
         self.remove_from_metadata = remove_from_metadata
         self.params = {}
 
-    def get_transform_class(self) -> type[AbstractBinaryTransform]:
+    def get_transform_class(self) -> type[AbstractTransform]:
         """
         Get the class extending AbstractTransform which implements a specific transformation.
         The class will generally be instantiated with a dictionary of configuration produced by
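Since TransformConfiguration now accepts any type[AbstractTransform], a configuration can name either a binary or a table transform. A minimal sketch following the NOOPTransformConfiguration pattern in this package (MyTransform is a hypothetical AbstractTransform subclass):

    # Illustrative only; mirrors how this package's NOOP configuration
    # wires a name and transform class into the base TransformConfiguration.
    class MyTransformConfiguration(TransformConfiguration):
        def __init__(self):
            super().__init__(name="my_transform", transform_class=MyTransform)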
test/data_processing_tests/launch/pure_python/test_noop_launch.py

@@ -32,7 +32,7 @@ class TestRayNOOPTransform(AbstractTransformLauncherTest):
     """
 
     def get_test_transform_fixtures(self) -> list[tuple]:
-        basedir = "../../../../test-data/data_processing/ray/noop/"
+        basedir = "../../../../test-data/data_processing/python/noop/"
         basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
         launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
         fixtures = [(launcher, {"noop_sleep_sec": 0}, basedir + "/input", basedir + "/expected")]
@@ -1,35 +0,0 @@
1
-
2
-
3
-
4
- # Byte-compiled / optimized / DLL files
5
- __pycache__/
6
- *.py[cod]
7
- *$py.class
8
-
9
-
10
- # Distribution / packaging
11
- bin/
12
- build/
13
- develop-eggs/
14
- dist/
15
- eggs/
16
- lib/
17
- lib64/
18
- parts/
19
- sdist/
20
- var/
21
- *.egg-info/
22
- .installed.cfg
23
- *.egg
24
-
25
- # Installer logs
26
- pip-log.txt
27
- pip-delete-this-directory.txt
28
-
29
- # Unit test / coverage reports
30
- .tox/
31
- htmlcov
32
- .coverage
33
- .cache
34
- nosetests.xml
35
- coverage.xml
@@ -1,8 +0,0 @@
1
- from data_processing.runtime.ray.ray_utils import RayUtils
2
- from data_processing.runtime.ray.transform_statistics import TransformStatisticsRay
3
- from data_processing.runtime.ray.transform_runtime import DefaultRayTransformRuntime
4
- from data_processing.runtime.ray.runtime_configuration import RayTransformRuntimeConfiguration
5
- from data_processing.runtime.ray.transform_file_processor import RayTransformFileProcessor
6
- from data_processing.runtime.ray.execution_configuration import RayTransformExecutionConfiguration
7
- from data_processing.runtime.ray.transform_orchestrator import orchestrate
8
- from data_processing.runtime.ray.transform_launcher import RayTransformLauncher