data-prep-toolkit 0.2.1.dev2__tar.gz → 0.2.2.dev0__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (87)
  1. {data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.2.dev0}/PKG-INFO +3 -1
  2. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/README.md +2 -0
  3. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/pyproject.toml +2 -2
  4. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0/src/data_prep_toolkit.egg-info}/PKG-INFO +3 -1
  5. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_prep_toolkit.egg-info/SOURCES.txt +1 -1
  6. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/data_access/__init__.py +1 -0
  7. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/data_access/arrow_s3.py +7 -9
  8. data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/data_access.py +457 -0
  9. data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/data_access_local.py +249 -0
  10. data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/data_access_s3.py +207 -0
  11. data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/snapshot_utils.py +31 -0
  12. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/__init__.py +1 -1
  13. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/execution_configuration.py +5 -5
  14. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/__init__.py +4 -2
  15. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/execution_configuration.py +1 -3
  16. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/runtime_configuration.py +6 -5
  17. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/transform_file_processor.py +16 -9
  18. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/transform_launcher.py +1 -34
  19. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/transform_orchestrator.py +53 -30
  20. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/transform_runtime.py +1 -1
  21. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/transform_file_processor.py +19 -14
  22. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/transform_launcher.py +52 -1
  23. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/abstract_test.py +22 -22
  24. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/launch/transform_test.py +1 -1
  25. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/transform/__init__.py +0 -1
  26. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/transform/binary_transform.py +1 -6
  27. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/transform/table_transform.py +1 -1
  28. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/transform/transform_configuration.py +7 -5
  29. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/transform/transform_statistics.py +1 -2
  30. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/data_access/data_access_local_test.py +15 -17
  31. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/data_access/data_access_s3_test.py +1 -1
  32. data_prep_toolkit-0.2.1.dev2/src/data_processing/data_access/data_access.py +0 -245
  33. data_prep_toolkit-0.2.1.dev2/src/data_processing/data_access/data_access_local.py +0 -420
  34. data_prep_toolkit-0.2.1.dev2/src/data_processing/data_access/data_access_s3.py +0 -386
  35. data_prep_toolkit-0.2.1.dev2/src/data_processing/transform/abstract_transform.py +0 -17
  36. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/Makefile +0 -0
  37. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/setup.cfg +0 -0
  38. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  39. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  40. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  41. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/__init__.py +0 -0
  42. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/data_access/data_access_factory.py +0 -0
  43. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  44. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
  45. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/runtime/runtime_configuration.py +0 -0
  46. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/__init__.py +0 -0
  47. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/data_access/__init__.py +0 -0
  48. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  49. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/launch/__init__.py +0 -0
  50. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/transform/__init__.py +0 -0
  51. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
  52. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/transform/noop_transform.py +0 -0
  53. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
  54. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/__init__.py +0 -0
  55. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/cli_utils.py +0 -0
  56. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/config.py +0 -0
  57. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/log.py +0 -0
  58. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/params_utils.py +0 -0
  59. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/pipinstaller.py +0 -0
  60. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/transform_configuration.json +0 -0
  61. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/transform_configurator.py +0 -0
  62. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/transform_utils.py +0 -0
  63. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/src/data_processing/utils/unrecoverable.py +0 -0
  64. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  65. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  66. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
  67. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  68. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  69. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -0
  70. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +0 -0
  71. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/transform/test_noop.py +0 -0
  72. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  73. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  74. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  75. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  76. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  77. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/input/sample1.parquet +0 -0
  78. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  79. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  80. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  81. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
  82. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
  83. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
  84. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
  85. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
  86. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
  87. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.2.dev0}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
--- data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info/PKG-INFO
+++ data_prep_toolkit-0.2.2.dev0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.1.dev2
+Version: 0.2.2.dev0
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
@@ -47,8 +47,10 @@ To test, build and publish the library
 ```shell
 make test build publish
 ```
+
 To up the version number, edit the Makefile to change VERSION and rerun
 the above. This will require committing both the `Makefile` and the
 autotmatically updated `pyproject.toml` file.
 
 
+
--- data_prep_toolkit-0.2.1.dev2/README.md
+++ data_prep_toolkit-0.2.2.dev0/README.md
@@ -22,8 +22,10 @@ To test, build and publish the library
 ```shell
 make test build publish
 ```
+
 To up the version number, edit the Makefile to change VERSION and rerun
 the above. This will require committing both the `Makefile` and the
 autotmatically updated `pyproject.toml` file.
 
 
+
--- data_prep_toolkit-0.2.1.dev2/pyproject.toml
+++ data_prep_toolkit-0.2.2.dev0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.1.dev2"
+version = "0.2.2.dev0"
 requires-python = ">=3.10"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Library"
@@ -41,7 +41,7 @@ dev = [
 ]
 
 [options]
-package_dir = ["src"]
+package_dir = ["src","test"]
 
 [options.packages.find]
 where = ["src/data_processing"]
--- data_prep_toolkit-0.2.1.dev2/PKG-INFO
+++ data_prep_toolkit-0.2.2.dev0/src/data_prep_toolkit.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.1.dev2
+Version: 0.2.2.dev0
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
@@ -47,8 +47,10 @@ To test, build and publish the library
 ```shell
 make test build publish
 ```
+
 To up the version number, edit the Makefile to change VERSION and rerun
 the above. This will require committing both the `Makefile` and the
 autotmatically updated `pyproject.toml` file.
 
 
+
--- data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info/SOURCES.txt
+++ data_prep_toolkit-0.2.2.dev0/src/data_prep_toolkit.egg-info/SOURCES.txt
@@ -14,6 +14,7 @@ src/data_processing/data_access/data_access_factory.py
 src/data_processing/data_access/data_access_factory_base.py
 src/data_processing/data_access/data_access_local.py
 src/data_processing/data_access/data_access_s3.py
+src/data_processing/data_access/snapshot_utils.py
 src/data_processing/runtime/__init__.py
 src/data_processing/runtime/execution_configuration.py
 src/data_processing/runtime/runtime_configuration.py
@@ -38,7 +39,6 @@ src/data_processing/test_support/transform/binary_transform_test.py
 src/data_processing/test_support/transform/noop_transform.py
 src/data_processing/test_support/transform/table_transform_test.py
 src/data_processing/transform/__init__.py
-src/data_processing/transform/abstract_transform.py
 src/data_processing/transform/binary_transform.py
 src/data_processing/transform/table_transform.py
 src/data_processing/transform/transform_configuration.py
--- data_prep_toolkit-0.2.1.dev2/src/data_processing/data_access/__init__.py
+++ data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/__init__.py
@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
 from data_processing.data_access.data_access_s3 import DataAccessS3
 from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
 from data_processing.data_access.data_access_factory import DataAccessFactory
+from data_processing.data_access.snapshot_utils import SnapshotUtils
--- data_prep_toolkit-0.2.1.dev2/src/data_processing/data_access/arrow_s3.py
+++ data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/arrow_s3.py
@@ -95,29 +95,27 @@ class ArrowS3:
         :param key: complete folder
         :return: list of folders within a given folder and number of retries
         """
-        bucket, prefix = self._get_bucket_key(key)
-
         def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]:
+            sub_folders = []
             # use paginator
             paginator = self.s3_client.get_paginator("list_objects_v2")
             # use Delimiter to get folders just folders
             page_iterator = paginator.paginate(Bucket=bck, Prefix=p, Delimiter="/")
-            sub_folders = []
             internal_retries = 0
             for page in page_iterator:
                 # for every page
                 internal_retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
                 for p in page.get("CommonPrefixes", []):
-                    sub_folders.append(p["Prefix"])
+                    sf = p["Prefix"]
+                    sub_folders.append(sf)
                     # apply recursively
-                    sf, r = _get_sub_folders(bck, p["Prefix"])
+                    sf, r = _get_sub_folders(bck=bck, p=sf)
                     internal_retries += r
                     sub_folders.extend(sf)
             return sub_folders, internal_retries
-
-        prefixes, retries = _get_sub_folders(bck=bucket, p=prefix)
-        # remove base prefix
-        return [p.removeprefix(prefix) for p in prefixes], retries
+        bucket, prefix = self._get_bucket_key(key)
+        subs, retries = _get_sub_folders(bck=bucket, p=prefix)
+        return [f"{bucket}/{f}" for f in subs], retries
 
     def read_file(self, key: str) -> tuple[bytes, int]:
         """
--- /dev/null
+++ data_prep_toolkit-0.2.2.dev0/src/data_processing/data_access/data_access.py
@@ -0,0 +1,457 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import random
+from typing import Any
+
+import pyarrow as pa
+from data_processing.utils import KB, MB, GB, TransformUtils, get_logger
+
+
+class DataAccess:
+    """
+    Base class for data access (interface), defining all the methods
+    """
+    def __init__(
+        self,
+        d_sets: list[str],
+        checkpoint: bool,
+        m_files: int,
+        n_samples: int,
+        files_to_use: list[str],
+        files_to_checkpoint: list[str],
+    ):
+        """
+        Create data access class for folder based configuration
+        :param d_sets list of the data sets to use
+        :param checkpoint: flag to return only files that do not exist in the output directory
+        :param m_files: max amount of files to return
+        :param n_samples: amount of files to randomly sample
+        :param files_to_use: files extensions of files to include
+        :param files_to_checkpoint: files extensions of files to use for checkpointing
+        """
+        self.d_sets = d_sets
+        self.checkpoint = checkpoint
+        self.m_files = m_files
+        self.n_samples = n_samples
+        self.files_to_use = files_to_use
+        self.files_to_checkpoint = files_to_checkpoint
+        self.logger = get_logger(__name__)
+
+    def get_output_folder(self) -> str:
+        """
+        Get output folder as a string
+        :return: output_folder
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_input_folder(self) -> str:
+        """
+        Get input folder as a string
+        :return: input_folder
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_random_file_set(self, n_samples: int, files: list[str]) -> list[str]:
+        """
+        Get random set of files
+        :param n_samples: set size
+        :param files: list of original files
+        :return: set of randomly selected files
+        """
+        # Pick files to include
+        if len(files) > n_samples:
+            # Pick files at random
+            files_set = [int(random.random() * len(files)) for _ in range(n_samples)]
+        else:
+            # use all existing files
+            files_set = range(len(files))
+        result = [""] * len(files_set)
+        index = 0
+        for f in files_set:
+            result[index] = files[f]
+            index += 1
+        self.logger.info(f"Using files {result} to sample data")
+        return result
+
+    def get_files_to_process(self) -> tuple[list[str], dict[str, float], int]:
+        """
+        Get files to process
+        :return: list of files and a dictionary of the files profile:
+            "max_file_size_MB",
+            "min_file_size_MB",
+            "avg_file_size_MB",
+            "total_file_size_MB"
+        and the number of operation retries.
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        if self.get_output_folder() is None:
+            self.logger.warning("Input/Output are not defined, returning empty list")
+            return [], {}, 0
+        path_list, path_profile, retries = self._get_files_to_process_internal()
+        if self.n_samples > 0:
+            files = self.get_random_file_set(n_samples=self.n_samples, files=path_list)
+            return files, path_profile, retries
+        return path_list, path_profile, retries
+
+    def _get_files_to_process_internal(self) -> tuple[list[str], dict[str, float], int]:
+        """
+        Get files to process
+        :return: list of files and a dictionary of the files profile:
+            "max_file_size_MB",
+            "min_file_size_MB",
+            "avg_file_size_MB",
+            "total_file_size_MB"
+        and number of operation retries.
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        # Check if we are using data sets
+        if self.d_sets is not None:
+            # get folders for the input
+            folders_to_use, retries = self._get_folders_to_use()
+            profile = {"max_file_size": 0.0, "min_file_size": 0.0, "total_file_size": 0.0}
+            if len(folders_to_use) > 0:
+                # if we have valid folders
+                path_list = []
+                max_file_size = 0
+                min_file_size = MB * GB
+                total_file_size = 0
+                cm_files = self.m_files
+                for folder in folders_to_use:
+                    plist, profile, retries1 = self._get_input_files(
+                        input_path=folder,
+                        output_path=self.get_output_location(folder),
+                        cm_files=cm_files,
+                        min_file_size=min_file_size,
+                        max_file_size=max_file_size,
+                    )
+                    retries += retries1
+                    path_list += plist
+                    total_file_size += profile["total_file_size"]
+                    if len(path_list) >= cm_files > 0:
+                        break
+                    max_file_size = profile["max_file_size"] * MB
+                    min_file_size = profile["min_file_size"] * MB
+                    if cm_files > 0:
+                        cm_files -= len(plist)
+                profile["total_file_size"] = total_file_size
+            else:
+                path_list = []
+        else:
+            # Get input files list
+            path_list, profile, retries = self._get_input_files(
+                input_path=self.get_input_folder(),
+                output_path=self.get_output_folder(),
+                cm_files=self.m_files,
+            )
+        return path_list, profile, retries
+
+    def _get_folders_to_use(self) -> tuple[list[str], int]:
+        """
+        convert data sets to a list of folders to use
+        :return: list of folders and retries
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def _get_files_folder(
+        self,
+        path: str,
+        files_to_use: list[str],
+        cm_files: int,
+        max_file_size: int = 0,
+        min_file_size: int = MB * GB
+    ) -> tuple[list[dict[str, Any]], dict[str, float], int]:
+        """
+        Support method to get list input files and their profile
+        :param path: input path
+        :param files_to_use: file extensions to use
+        :param max_file_size: max file size
+        :param min_file_size: min file size
+        :param cm_files: overwrite for the m_files in the class
+        :return: tuple of file list, profile and number of retries
+        """
+        # Get files list.
+        p_list = []
+        total_input_file_size = 0
+        i = 0
+        files, retries = self._list_files_folder(path=path)
+        for file in files:
+            if i >= cm_files > 0:
+                break
+            # Only use specified files
+            f_name = str(file["name"])
+            if files_to_use is not None:
+                name_extension = TransformUtils.get_file_extension(f_name)
+                if name_extension[1] not in files_to_use:
+                    continue
+            p_list.append(file)
+            size = file["size"]
+            total_input_file_size += size
+            if min_file_size > size:
+                min_file_size = size
+            if max_file_size < size:
+                max_file_size = size
+            i += 1
+        return (
+            p_list,
+            {
+                "max_file_size": max_file_size / MB,
+                "min_file_size": min_file_size / MB,
+                "total_file_size": total_input_file_size / MB,
+            },
+            retries,
+        )
+
+    def _get_input_files(
+        self,
+        input_path: str,
+        output_path: str,
+        cm_files: int,
+        max_file_size: int = 0,
+        min_file_size: int = MB * GB,
+    ) -> tuple[list[str], dict[str, float], int]:
+        """
+        Get list and size of files from input path, that do not exist in the output path
+        :param input_path: input path
+        :param output_path: output path
+        :param cm_files: max files to get
+        :return: tuple of file list, profile and number of retries
+        """
+        if not self.checkpoint:
+            file_sizes, profile, retries = self._get_files_folder(
+                path=input_path,
+                files_to_use=self.files_to_use,
+                cm_files=cm_files,
+                min_file_size=min_file_size,
+                max_file_size=max_file_size,
+            )
+            files = [fs["name"] for fs in file_sizes]
+            return files, profile, retries
+
+        pout_list, _, retries1 = self._get_files_folder(
+            path=output_path, files_to_use=self.files_to_checkpoint, cm_files=-1
+        )
+        output_base_names_ext = [file["name"].replace(self.get_output_folder(), self.get_input_folder())
+                                 for file in pout_list]
+        # In the case of binary transforms, an extension can be different, so just use the file names.
+        # Also remove duplicates
+        output_base_names = list(set([TransformUtils.get_file_extension(file)[0] for file in output_base_names_ext]))
+        p_list = []
+        total_input_file_size = 0
+        i = 0
+        files, _, retries = self._get_files_folder(
+            path=input_path, files_to_use=self.files_to_use, cm_files=-1
+        )
+        retries += retries1
+        for file in files:
+            if i >= cm_files > 0:
+                break
+            f_name = file["name"]
+            name_extension = TransformUtils.get_file_extension(f_name)
+            if self.files_to_use is not None:
+                if name_extension[1] not in self.files_to_use:
+                    continue
+            if name_extension[0] not in output_base_names:
+                p_list.append(f_name)
+                size = file["size"]
+                total_input_file_size += size
+                if min_file_size > size:
+                    min_file_size = size
+                if max_file_size < size:
+                    max_file_size = size
+                i += 1
+        return (
+            p_list,
+            {
+                "max_file_size": max_file_size / MB,
+                "min_file_size": min_file_size / MB,
+                "total_file_size": total_input_file_size / MB,
+            },
+            retries,
+        )
+
+    def _list_files_folder(self, path: str) -> tuple[list[dict[str, Any]], int]:
+        """
+        Get files for a given folder and all sub folders
+        :param path: path
+        :return: List of files
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_table(self, path: str) -> tuple[pa.table, int]:
+        """
+        Get pyArrow table for a given path
+        :param path - file path
+        :return: pyArrow table or None, if the table read failed and number of operation retries.
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_file(self, path: str) -> tuple[bytes, int]:
+        """
+        Get file as a byte array
+        :param path: file path
+        :return: bytes array of file content and number of operation retries
+        Retries are performed on operation failures and are typically due to the resource overload.
+
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_folder_files(
+        self, path: str, extensions: list[str] = None, return_data: bool = True
+    ) -> tuple[dict[str, bytes], int]:
+        """
+        Get a list of byte content of files. The path here is an absolute path and can be anywhere.
+        :param path: file path
+        :param extensions: a list of file extensions to include. If None, then all files from this and
+                           child ones will be returned
+        :param return_data: flag specifying whether the actual content of files is returned (True), or just
+                            directory is returned (False)
+        :return: A dictionary of file names/binary content will be returned
+        """
+        def _get_file_content(name: str, dt: bool) -> tuple[bytes, int]:
+            """
+            return file content
+            :param name: file name
+            :param dt: flag to return data or None
+            :return: file content, number of retries
+            """
+            if dt:
+                return self.get_file(name)
+            return None, 0
+
+        result = {}
+        files, _, retries = self._get_files_folder(
+            path=path, files_to_use=extensions, cm_files=-1
+        )
+        for file in files:
+            f_name = str(file["name"])
+            b, retries1 = _get_file_content(f_name, return_data)
+            retries += retries1
+            result[f_name] = b
+        return result, retries
+
+    def save_file(self, path: str, data: bytes) -> tuple[dict[str, Any], int]:
+        """
+        Save byte array to the file
+        :param path: file path
+        :param data: byte array
+        :return: a dictionary as
+        defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
+        in the case of failure dict is None and number of operation retries
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def get_output_location(self, path: str) -> str:
+        """
+        Get output location based on input
+        :param path: input file location
+        :return: output file location
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
+        """
+        Save table to a given location
+        :param path: location to save table
+        :param table: table
+        :return: size of table in memory and a dictionary as
+        defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
+        in the case of failure dict is None and number of operation retries.
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def save_job_metadata(self, metadata: dict[str, Any]) -> tuple[dict[str, Any], int]:
+        """
+        Save job metadata
+        :param metadata: a dictionary, containing the following keys:
+            "pipeline",
+            "job details",
+            "code",
+            "job_input_params",
+            "execution_stats",
+            "job_output_stats"
+        two additional elements:
+            "source"
+            "target"
+        are filled bu implementation
+        :return: a dictionary as
+        defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
+        in the case of failure dict is None and number of operation retries.
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        raise NotImplementedError("Subclasses should implement this!")
+
+    def sample_input_data(self, n_samples: int = 10) -> tuple[dict[str, Any], int]:
+        """
+        Sample input data set to get average table size, average doc size, number of docs, etc.
+        Note that here we are not reading all of the input documents, but rather randomly pick
+        their subset. It gives more precise answer as subset grows, but it takes longer
+        :param n_samples: number of samples to use - default 10
+        :return: a dictionary of the files profile:
+            "max_file_size_MB",
+            "min_file_size_MB",
+            "avg_file_size_MB",
+            "total_file_size_MB"
+        average table size MB,
+        average doc size KB,
+        estimated number of docs
+        and number of operation retries
+        Retries are performed on operation failures and are typically due to the resource overload.
+        """
+        # get files to process
+        path_list, path_profile, retries = self._get_files_to_process_internal()
+        # Pick files to sample
+        files = self.get_random_file_set(n_samples=n_samples, files=path_list)
+        # Read table and compute number of docs and sizes
+        number_of_docs = []
+        table_sizes = []
+        n_tables = 0
+        for f in files:
+            table, r = self.get_table(path=f)
+            retries += r
+            if table is not None:
+                n_tables += 1
+                number_of_docs.append(table.num_rows)
+                # As a table size is mostly document, we can consider them roughly the same
+                table_sizes.append(table.nbytes)
+        # compute averages
+        if n_tables == 0:
+            av_number_docs = 0
+            av_table_size = 0
+            av_doc_size = 0
+        else:
+            av_number_docs = sum(number_of_docs) / n_tables
+            av_table_size = sum(table_sizes) / n_tables / MB
+            if av_number_docs == 0:
+                av_doc_size = 0
+            else:
+                av_doc_size = av_table_size * MB / av_number_docs / KB
+        self.logger.info(
+            f"average number of docs {av_number_docs}, average table size {av_table_size} MB, "
+            f"average doc size {av_doc_size} kB"
+        )
+
+        # compute number of docs
+        number_of_docs = av_number_docs * len(path_list)
+        self.logger.info(f"Estimated number of docs {number_of_docs}")
+        return (
+            path_profile
+            | {
+                "average table size MB": av_table_size,
+                "average doc size KB": av_doc_size,
+                "estimated number of docs": number_of_docs,
+            },
+            retries,
+        )
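
The new `data_access.py` replaces the earlier 245-line version with an abstract base class: the storage primitives (`get_input_folder`, `_list_files_folder`, `get_table`, `save_file`, and so on) raise `NotImplementedError`, while file selection (`get_files_to_process`), checkpointing (`_get_input_files`), and sampling (`sample_input_data`) are implemented once on top of them. Per the file list above, `data_access_local.py` and `data_access_s3.py` supply the concrete implementations. Below is a hypothetical in-memory subclass, shown only to illustrate the contract; `InMemoryDataAccess` and its fixed listing are not part of the package, and the sketch assumes `data_prep_toolkit` is installed:

```python
# Hypothetical minimal subclass of the new DataAccess base class (illustrative only).
from typing import Any

from data_processing.data_access.data_access import DataAccess


class InMemoryDataAccess(DataAccess):
    """Serves a fixed file listing; only the primitives are overridden."""

    def __init__(self):
        # no datasets, no checkpointing, no file limit, no sampling, no filtering
        super().__init__(
            d_sets=None,
            checkpoint=False,
            m_files=-1,
            n_samples=-1,
            files_to_use=None,
            files_to_checkpoint=None,
        )
        self.files = [
            {"name": "in/a.parquet", "size": 1024},
            {"name": "in/b.parquet", "size": 2048},
        ]

    def get_input_folder(self) -> str:
        return "in"

    def get_output_folder(self) -> str:
        return "out"

    def _list_files_folder(self, path: str) -> tuple[list[dict[str, Any]], int]:
        # second element of the tuple is the retry count
        return self.files, 0


dac = InMemoryDataAccess()
files, profile, retries = dac.get_files_to_process()
print(files)    # ['in/a.parquet', 'in/b.parquet']
print(profile)  # max_file_size / min_file_size / total_file_size, in MB
```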