PyPI - data-prep-toolkit - Versions diffs - 0.2.0.dev5__tar.gz → 0.2.1__tar.gz - Mend

data-prep-toolkit 0.2.0.dev5__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/Makefile RENAMED Viewed

@@ -15,13 +15,13 @@ clean::
 setup::
-set-versions:: .check-env
+set-versions: .check-env
 	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
 build:: build-dist
 #build:: update-toml .defaults.build-dist
-build-dist :: set-versions .defaults.build-dist
+build-dist :: .defaults.build-dist
 publish:: publish-dist
@@ -46,7 +46,8 @@ image::
 # it seems when running multiple ray launch tests in a single pytest run there is some sort of ray.init() duplication.
 # pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped..
 # TODO: the following fails.  Why?  source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST)  .
-test::
+.PHONY: test
+test::   venv
 	@# Help: Use the already-built virtual environment to run pytest on the test directory.
 	source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST)  data_processing_tests/data_access;
 	source venv/bin/activate; export PYTHONPATH=../src;  cd test; $(PYTEST)  data_processing_tests/transform;

{data_prep_toolkit-0.2.0.dev5/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,14 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.0.dev5
+Version: 0.2.1
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
+Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: pyarrow==15.0.2
+Requires-Dist: numpy<1.29.0
+Requires-Dist: pyarrow==16.1.0
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
@@ -41,12 +43,14 @@ source venv/bin/activate
 or set up your IDE to use the venv directory when developing in this project
 ## Library Artifact Build and Publish
-To test, build and publish the library to artifactory
+To test, build and publish the library
 ```shell
 make test build publish
 ```
 To up the version number, edit the Makefile to change VERSION and rerun
 the above.  This will require committing both the `Makefile` and the
 automatically updated `pyproject.toml` file.

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/README.md RENAMED Viewed

@@ -18,12 +18,14 @@ source venv/bin/activate
 or set up your IDE to use the venv directory when developing in this project
 ## Library Artifact Build and Publish
-To test, build and publish the library to artifactory
+To test, build and publish the library
 ```shell
 make test build publish
 ```
 To up the version number, edit the Makefile to change VERSION and rerun
 the above.  This will require committing both the `Makefile` and the
 automatically updated `pyproject.toml` file.

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,8 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.0.dev5"
+version = "0.2.1"
 requires-python = ">=3.10"
+keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Library"
 license = {text = "Apache-2.0"}
 readme = {file = "README.md", content-type = "text/markdown"}
@@ -10,12 +11,18 @@ authors = [
     { name = "Boris Lublinsky", email = "blublinsky@ibm.com" },
 ]
 dependencies = [
-    "pyarrow==15.0.2",
+    "numpy < 1.29.0",
+    "pyarrow==16.1.0",
     "boto3==1.34.69",
     "argparse",
     "mmh3",
 ]
+[project_urls]
+Repository = "https://github.com/IBM/data-prep-kit"
+Issues = "https://github.com/IBM/data-prep-kit/issues"
+Documentation = "https://ibm.github.io/data-prep-kit/"
 [build-system]
 requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
 build-backend = "setuptools.build_meta"
@@ -34,7 +41,7 @@ dev = [
 ]
 [options]
-package_dir = ["src","test"]
+package_dir = ["src"]
 [options.packages.find]
 where = ["src/data_processing"]

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1/src/data_prep_toolkit.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,12 +1,14 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.0.dev5
+Version: 0.2.1
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
+Keywords: data,data preprocessing,data preparation,llm,generative,ai,fine-tuning,llmapps
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-Requires-Dist: pyarrow==15.0.2
+Requires-Dist: numpy<1.29.0
+Requires-Dist: pyarrow==16.1.0
 Requires-Dist: boto3==1.34.69
 Requires-Dist: argparse
 Requires-Dist: mmh3
@@ -41,12 +43,14 @@ source venv/bin/activate
 or set up your IDE to use the venv directory when developing in this project
 ## Library Artifact Build and Publish
-To test, build and publish the library to artifactory
+To test, build and publish the library
 ```shell
 make test build publish
 ```
 To up the version number, edit the Makefile to change VERSION and rerun
 the above.  This will require committing both the `Makefile` and the
 automatically updated `pyproject.toml` file.

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/SOURCES.txt RENAMED Viewed

@@ -14,16 +14,20 @@ src/data_processing/data_access/data_access_factory.py
 src/data_processing/data_access/data_access_factory_base.py
 src/data_processing/data_access/data_access_local.py
 src/data_processing/data_access/data_access_s3.py
+src/data_processing/data_access/snapshot_utils.py
 src/data_processing/runtime/__init__.py
 src/data_processing/runtime/execution_configuration.py
 src/data_processing/runtime/runtime_configuration.py
 src/data_processing/runtime/transform_file_processor.py
 src/data_processing/runtime/transform_launcher.py
 src/data_processing/runtime/pure_python/__init__.py
+src/data_processing/runtime/pure_python/execution_configuration.py
 src/data_processing/runtime/pure_python/runtime_configuration.py
 src/data_processing/runtime/pure_python/transform_file_processor.py
+src/data_processing/runtime/pure_python/transform_invoker.py
 src/data_processing/runtime/pure_python/transform_launcher.py
 src/data_processing/runtime/pure_python/transform_orchestrator.py
+src/data_processing/runtime/pure_python/transform_runtime.py
 src/data_processing/test_support/__init__.py
 src/data_processing/test_support/abstract_test.py
 src/data_processing/test_support/data_access/__init__.py
@@ -31,10 +35,10 @@ src/data_processing/test_support/data_access/data_access_factory_test.py
 src/data_processing/test_support/launch/__init__.py
 src/data_processing/test_support/launch/transform_test.py
 src/data_processing/test_support/transform/__init__.py
+src/data_processing/test_support/transform/binary_transform_test.py
 src/data_processing/test_support/transform/noop_transform.py
-src/data_processing/test_support/transform/transform_test.py
+src/data_processing/test_support/transform/table_transform_test.py
 src/data_processing/transform/__init__.py
-src/data_processing/transform/abstract_transform.py
 src/data_processing/transform/binary_transform.py
 src/data_processing/transform/table_transform.py
 src/data_processing/transform/transform_configuration.py
@@ -44,7 +48,11 @@ src/data_processing/utils/cli_utils.py
 src/data_processing/utils/config.py
 src/data_processing/utils/log.py
 src/data_processing/utils/params_utils.py
+src/data_processing/utils/pipinstaller.py
+src/data_processing/utils/transform_configuration.json
+src/data_processing/utils/transform_configurator.py
 src/data_processing/utils/transform_utils.py
+src/data_processing/utils/unrecoverable.py
 test-data/data_processing/daf/input/ds1/sample1.parquet
 test-data/data_processing/daf/input/ds1/sample2.parquet
 test-data/data_processing/daf/input/ds2/sample3.parquet
@@ -55,15 +63,19 @@ test-data/data_processing/input_multiple/sample2.parquet
 test-data/data_processing/input_multiple/sample3.parquet
 test-data/data_processing/python/noop/expected/metadata.json
 test-data/data_processing/python/noop/expected/sample1.parquet
+test-data/data_processing/python/noop/expected/test1.parquet
 test-data/data_processing/python/noop/expected/subdir/test1.parquet
 test-data/data_processing/python/noop/input/sample1.parquet
+test-data/data_processing/python/noop/input/test1.parquet
 test-data/data_processing/python/noop/input/subdir/test1.parquet
 test/data_processing_tests/data_access/daf_local_test.py
 test/data_processing_tests/data_access/data_access_local_test.py
 test/data_processing_tests/data_access/data_access_s3_test.py
 test/data_processing_tests/data_access/sample_input_data_test.py
+test/data_processing_tests/invoker/python_invoker_test.py
 test/data_processing_tests/launch/pure_python/launcher_test.py
 test/data_processing_tests/launch/pure_python/multi_launcher_test.py
 test/data_processing_tests/launch/pure_python/test_noop_launch.py
+test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
 test/data_processing_tests/transform/test_noop.py
 test/data_processing_tests/util/transform_utils_test.py

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_prep_toolkit.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,5 @@
-pyarrow==15.0.2
+numpy<1.29.0
+pyarrow==16.1.0
 boto3==1.34.69
 argparse
 mmh3

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/__init__.py RENAMED Viewed

@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
 from data_processing.data_access.data_access_s3 import DataAccessS3
 from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
 from data_processing.data_access.data_access_factory import DataAccessFactory
+from data_processing.data_access.snapshot_utils import SnapshotUtils

{data_prep_toolkit-0.2.0.dev5 → data_prep_toolkit-0.2.1}/src/data_processing/data_access/arrow_s3.py RENAMED Viewed

@@ -56,6 +56,7 @@ class ArrowS3:
             config=Config(retries={"max_attempts": s3_max_attempts, "mode": "standard"}),
         )
         self.retries = s3_retries
+        self.s3_max_attempts = s3_max_attempts
     @staticmethod
     def _get_bucket_key(key: str) -> tuple[str, str]:
@@ -68,144 +69,162 @@ class ArrowS3:
         return prefixes[0], "/".join(prefixes[1:])
     # get list of the files (names and sizes) for a given prefix (including bucket name)
-    def list_files(self, key: str) -> list[dict[str, Any]]:
+    def list_files(self, key: str) -> tuple[list[dict[str, Any]], int]:
         """
         List files in the folder (hierarchically going through all sub-folders)
         :param key: complete folder name
-        :return: list of dictionaries, containing file names and length
+        :return: list of dictionaries, containing file names and length and number of retries
         """
         bucket, prefix = self._get_bucket_key(key)
         # Use paginator here to get all the files rather then 1 page
         paginator = self.s3_client.get_paginator("list_objects_v2")
         pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
         files = []
+        retries = 0
         for page in pages:
             # For every page
+            retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
             for obj in page.get("Contents", []):
                 # Get both file name and size
                 files.append({"name": f"{bucket}/{obj['Key']}", "size": obj["Size"]})
-        return files
+        return files, retries
-    def list_folders(self, key: str) -> list[str]:
+    def list_folders(self, key: str) -> tuple[list[str], int]:
         """
         Get list of folders for folder
         :param key: complete folder
-        :return: list of folders within a given folder
+        :return: list of folders within a given folder and number of retries
         """
-        bucket, prefix = self._get_bucket_key(key)
-        def _get_sub_folders(bck: str, p: str) -> list[str]:
+        def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]:
+            sub_folders = []
             # use paginator
             paginator = self.s3_client.get_paginator("list_objects_v2")
             # use Delimiter to get folders just folders
             page_iterator = paginator.paginate(Bucket=bck, Prefix=p, Delimiter="/")
-            sub_folders = []
+            internal_retries = 0
             for page in page_iterator:
                 # for every page
+                internal_retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
                 for p in page.get("CommonPrefixes", []):
-                    sub_folders.append(p["Prefix"])
+                    sf = p["Prefix"]
+                    sub_folders.append(sf)
                     # apply recursively
-                    sub_folders.extend(_get_sub_folders(bck, p["Prefix"]))
-            return sub_folders
-        prefixes = _get_sub_folders(bck=bucket, p=prefix)
-        # remove base prefix
-        return [p.removeprefix(prefix) for p in prefixes]
+                    sf, r = _get_sub_folders(bck=bck, p=sf)
+                    internal_retries += r
+                    sub_folders.extend(sf)
+            return sub_folders, internal_retries
+        bucket, prefix = self._get_bucket_key(key)
+        subs, retries = _get_sub_folders(bck=bucket, p=prefix)
+        return [f"{bucket}/{f}" for f in subs], retries
-    def read_file(self, key: str) -> bytes:
+    def read_file(self, key: str) -> tuple[bytes, int]:
         """
         Read an s3 file by name
         :param key: complete path
-        :return: byte array of file content or None if the file does not exist
+        :return: byte array of file content or None if the file does not exist and a number of retries
         """
         bucket, prefix = self._get_bucket_key(key)
+        retries = 0
         for n in range(self.retries):
             try:
                 obj = self.s3_client.get_object(Bucket=bucket, Key=prefix)
-                return obj["Body"].read()
+                retries += obj.get("ResponseMetadata", {}).get("RetryAttempts", 0)
+                return obj["Body"].read(), retries
             except Exception as e:
                 logger.error(f"failed to read file {key}, exception {e}, attempt {n}")
+                retries += self.s3_max_attempts
         logger.error(f"failed to read file {key} in {self.retries} attempts. Skipping it")
-        return None
+        return None, retries
-    def save_file(self, key: str, data: bytes) -> dict[str, Any]:
+    def save_file(self, key: str, data: bytes) -> tuple[dict[str, Any], int]:
         """
         Save file to S3
         :param key: complete path
         :param data: byte array of the file content
         :return: dictionary as
         defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
-        in the case of failure dict is None
+        in the case of failure dict is None and the number of retries
         """
         bucket, prefix = self._get_bucket_key(key)
+        retries = 0
         for n in range(self.retries):
             try:
-                return self.s3_client.put_object(Bucket=bucket, Key=prefix, Body=data)
+                res = self.s3_client.put_object(Bucket=bucket, Key=prefix, Body=data)
+                retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
+                return res, retries
             except Exception as e:
                 logger.error(f"Failed to upload file to to key {key}, exception {e}")
+                retries += self.s3_max_attempts
         logger.error(f"Failed to upload file {key}, skipping it")
-        return None
+        return None, retries
-    def read_table(self, key: str, schema: pa.schema = None) -> pa.Table:
+    def read_table(self, key: str, schema: pa.schema = None) -> tuple[pa.Table, int]:
         """
         Get an arrow table from a file with a given name
         :param key: complete path
         :param schema: Schema used for reading table, default None
-        :return: table or None if the read failed
+        :return: table or None if the read failed and the number of retries
         """
         # Read file as bytes
-        data = self.read_file(key)
+        data, retries = self.read_file(key)
         if data is None:
-            return None
-        return TransformUtils.convert_binary_to_arrow(data=data, schema=schema)
+            return None, retries
+        return TransformUtils.convert_binary_to_arrow(data=data, schema=schema), retries
-    def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any]]:
+    def save_table(self, key: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
         """
         Save an arrow table to a file with a name
         :param key: complete path
         :param table: table to save
         :return: table size and a dictionary as
         defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
-        in the case of failure len is -1 and dict is None
+        in the case of failure len is -1 and dict is None and the number of retries
         """
         # convert to bytes
         data = TransformUtils.convert_arrow_to_binary(table=table)
         if data is None:
             return -1, None
         # save bytes
-        return len(data), self.save_file(key, data)
+        res, retries = self.save_file(key, data)
+        return len(data), res, retries
-    def delete_file(self, key: str) -> None:
+    def delete_file(self, key: str) -> int:
         """
         Delete file from S3
         :param key: complete path
-        :return: None
+        :return: the number of retries
         """
         bucket, prefix = self._get_bucket_key(key)
+        retries = 0
         for n in range(self.retries):
             try:
-                self.s3_client.delete_object(Bucket=bucket, Key=prefix)
-                return None
+                res = self.s3_client.delete_object(Bucket=bucket, Key=prefix)
+                retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
+                return retries
             except Exception as e:
                 logger.error(f"failed to delete file {key}, exception {e}")
-        return None
+                retries += self.s3_max_attempts
+        return retries
-    def move_file(self, source: str, dest: str) -> None:
+    def move_file(self, source: str, dest: str) -> int:
         """
         move file from source to destination
         :param source: complete source path
         :param dest: complete destination path
-        :return: None
+        :return: number of retries
         """
         s_bucket, s_prefix = self._get_bucket_key(source)
         d_bucket, d_prefix = self._get_bucket_key(dest)
         # copy source to destination and then delete source
         copy_source = {"Bucket": s_bucket, "Key": s_prefix}
+        retries = 0
         for n in range(self.retries):
             try:
-                self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
-                self.delete_file(source)
-                return None
+                res = self.s3_client.copy_object(CopySource=copy_source, Bucket=d_bucket, Key=d_prefix)
+                retries += res.get("ResponseMetadata", {}).get("RetryAttempts", 0)
+                retries += self.delete_file(source)
+                return retries
             except Exception as e:
                 logger.error(f"failed to copy file {source} to {dest}, exception {e}")
-        return None
+                retries += self.s3_max_attempts
+        return retries

data-prep-toolkit 0.2.0.dev5__tar.gz → 0.2.1__tar.gz

data-prep-toolkit 0.2.0.dev5__tar.gz → 0.2.1__tar.gz