data-prep-toolkit 0.2.1__tar.gz → 0.2.1.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {data_prep_toolkit-0.2.1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1.dev1}/PKG-INFO +1 -3
  2. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/README.md +0 -2
  3. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/pyproject.toml +1 -1
  4. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -3
  5. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_prep_toolkit.egg-info/SOURCES.txt +1 -4
  6. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/data_access/__init__.py +0 -1
  7. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/data_access/arrow_s3.py +9 -7
  8. data_prep_toolkit-0.2.1.dev1/src/data_processing/data_access/data_access.py +245 -0
  9. data_prep_toolkit-0.2.1.dev1/src/data_processing/data_access/data_access_local.py +420 -0
  10. data_prep_toolkit-0.2.1.dev1/src/data_processing/data_access/data_access_s3.py +386 -0
  11. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/__init__.py +1 -1
  12. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/execution_configuration.py +5 -5
  13. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/pure_python/__init__.py +1 -6
  14. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/pure_python/runtime_configuration.py +2 -15
  15. data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_file_processor.py +51 -0
  16. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/pure_python/transform_launcher.py +36 -3
  17. data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_orchestrator.py +116 -0
  18. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/transform_file_processor.py +17 -37
  19. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/transform_launcher.py +1 -52
  20. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/abstract_test.py +22 -22
  21. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/launch/transform_test.py +1 -1
  22. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/transform/__init__.py +1 -0
  23. data_prep_toolkit-0.2.1.dev1/src/data_processing/transform/abstract_transform.py +17 -0
  24. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/transform/binary_transform.py +6 -1
  25. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/transform/table_transform.py +1 -1
  26. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/transform/transform_configuration.py +5 -7
  27. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/transform/transform_statistics.py +2 -1
  28. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/log.py +0 -7
  29. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/data_access/data_access_local_test.py +17 -15
  30. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/data_access/data_access_s3_test.py +1 -1
  31. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +5 -0
  32. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access.py +0 -457
  33. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_local.py +0 -249
  34. data_prep_toolkit-0.2.1/src/data_processing/data_access/data_access_s3.py +0 -207
  35. data_prep_toolkit-0.2.1/src/data_processing/data_access/snapshot_utils.py +0 -31
  36. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/execution_configuration.py +0 -70
  37. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_file_processor.py +0 -107
  38. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -224
  39. data_prep_toolkit-0.2.1/src/data_processing/runtime/pure_python/transform_runtime.py +0 -53
  40. data_prep_toolkit-0.2.1/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +0 -37
  41. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/Makefile +0 -0
  42. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/setup.cfg +0 -0
  43. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  44. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  45. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  46. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/__init__.py +0 -0
  47. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/data_access/data_access_factory.py +0 -0
  48. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  49. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
  50. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/runtime/runtime_configuration.py +0 -0
  51. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/__init__.py +0 -0
  52. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/data_access/__init__.py +0 -0
  53. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  54. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/launch/__init__.py +0 -0
  55. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/transform/__init__.py +0 -0
  56. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
  57. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/transform/noop_transform.py +0 -0
  58. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
  59. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/__init__.py +0 -0
  60. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/cli_utils.py +0 -0
  61. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/config.py +0 -0
  62. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/params_utils.py +0 -0
  63. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/pipinstaller.py +0 -0
  64. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/transform_configuration.json +0 -0
  65. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/transform_configurator.py +0 -0
  66. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/transform_utils.py +0 -0
  67. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/src/data_processing/utils/unrecoverable.py +0 -0
  68. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  69. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  70. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
  71. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  72. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  73. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/transform/test_noop.py +0 -0
  74. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  75. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  76. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  77. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  78. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  79. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/input/sample1.parquet +0 -0
  80. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  81. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  82. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  83. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
  84. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
  85. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
  86. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
  87. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
  88. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
  89. {data_prep_toolkit-0.2.1 → data_prep_toolkit-0.2.1.dev1}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.1
3
+ Version: 0.2.1.dev1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -47,10 +47,8 @@ To test, build and publish the library
47
47
  ```shell
48
48
  make test build publish
49
49
  ```
50
-
51
50
  To up the version number, edit the Makefile to change VERSION and rerun
52
51
  the above. This will require committing both the `Makefile` and the
53
52
  autotmatically updated `pyproject.toml` file.
54
53
 
55
54
 
56
-
@@ -22,10 +22,8 @@ To test, build and publish the library
22
22
  ```shell
23
23
  make test build publish
24
24
  ```
25
-
26
25
  To up the version number, edit the Makefile to change VERSION and rerun
27
26
  the above. This will require committing both the `Makefile` and the
28
27
  autotmatically updated `pyproject.toml` file.
29
28
 
30
29
 
31
-
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit"
3
- version = "0.2.1"
3
+ version = "0.2.1.dev1"
4
4
  requires-python = ">=3.10"
5
5
  keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
6
6
  description = "Data Preparation Toolkit Library"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.1
3
+ Version: 0.2.1.dev1
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -47,10 +47,8 @@ To test, build and publish the library
47
47
  ```shell
48
48
  make test build publish
49
49
  ```
50
-
51
50
  To up the version number, edit the Makefile to change VERSION and rerun
52
51
  the above. This will require committing both the `Makefile` and the
53
52
  autotmatically updated `pyproject.toml` file.
54
53
 
55
54
 
56
-
@@ -14,20 +14,17 @@ src/data_processing/data_access/data_access_factory.py
14
14
  src/data_processing/data_access/data_access_factory_base.py
15
15
  src/data_processing/data_access/data_access_local.py
16
16
  src/data_processing/data_access/data_access_s3.py
17
- src/data_processing/data_access/snapshot_utils.py
18
17
  src/data_processing/runtime/__init__.py
19
18
  src/data_processing/runtime/execution_configuration.py
20
19
  src/data_processing/runtime/runtime_configuration.py
21
20
  src/data_processing/runtime/transform_file_processor.py
22
21
  src/data_processing/runtime/transform_launcher.py
23
22
  src/data_processing/runtime/pure_python/__init__.py
24
- src/data_processing/runtime/pure_python/execution_configuration.py
25
23
  src/data_processing/runtime/pure_python/runtime_configuration.py
26
24
  src/data_processing/runtime/pure_python/transform_file_processor.py
27
25
  src/data_processing/runtime/pure_python/transform_invoker.py
28
26
  src/data_processing/runtime/pure_python/transform_launcher.py
29
27
  src/data_processing/runtime/pure_python/transform_orchestrator.py
30
- src/data_processing/runtime/pure_python/transform_runtime.py
31
28
  src/data_processing/test_support/__init__.py
32
29
  src/data_processing/test_support/abstract_test.py
33
30
  src/data_processing/test_support/data_access/__init__.py
@@ -39,6 +36,7 @@ src/data_processing/test_support/transform/binary_transform_test.py
39
36
  src/data_processing/test_support/transform/noop_transform.py
40
37
  src/data_processing/test_support/transform/table_transform_test.py
41
38
  src/data_processing/transform/__init__.py
39
+ src/data_processing/transform/abstract_transform.py
42
40
  src/data_processing/transform/binary_transform.py
43
41
  src/data_processing/transform/table_transform.py
44
42
  src/data_processing/transform/transform_configuration.py
@@ -76,6 +74,5 @@ test/data_processing_tests/invoker/python_invoker_test.py
76
74
  test/data_processing_tests/launch/pure_python/launcher_test.py
77
75
  test/data_processing_tests/launch/pure_python/multi_launcher_test.py
78
76
  test/data_processing_tests/launch/pure_python/test_noop_launch.py
79
- test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
80
77
  test/data_processing_tests/transform/test_noop.py
81
78
  test/data_processing_tests/util/transform_utils_test.py
@@ -4,4 +4,3 @@ from data_processing.data_access.data_access_local import DataAccessLocal
4
4
  from data_processing.data_access.data_access_s3 import DataAccessS3
5
5
  from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
6
6
  from data_processing.data_access.data_access_factory import DataAccessFactory
7
- from data_processing.data_access.snapshot_utils import SnapshotUtils
@@ -95,27 +95,29 @@ class ArrowS3:
95
95
  :param key: complete folder
96
96
  :return: list of folders within a given folder and number of retries
97
97
  """
98
+ bucket, prefix = self._get_bucket_key(key)
99
+
98
100
  def _get_sub_folders(bck: str, p: str) -> tuple[list[str], int]:
99
- sub_folders = []
100
101
  # use paginator
101
102
  paginator = self.s3_client.get_paginator("list_objects_v2")
102
103
  # use Delimiter to get folders just folders
103
104
  page_iterator = paginator.paginate(Bucket=bck, Prefix=p, Delimiter="/")
105
+ sub_folders = []
104
106
  internal_retries = 0
105
107
  for page in page_iterator:
106
108
  # for every page
107
109
  internal_retries += page.get("ResponseMetadata", {}).get("RetryAttempts", 0)
108
110
  for p in page.get("CommonPrefixes", []):
109
- sf = p["Prefix"]
110
- sub_folders.append(sf)
111
+ sub_folders.append(p["Prefix"])
111
112
  # apply recursively
112
- sf, r = _get_sub_folders(bck=bck, p=sf)
113
+ sf, r = _get_sub_folders(bck, p["Prefix"])
113
114
  internal_retries += r
114
115
  sub_folders.extend(sf)
115
116
  return sub_folders, internal_retries
116
- bucket, prefix = self._get_bucket_key(key)
117
- subs, retries = _get_sub_folders(bck=bucket, p=prefix)
118
- return [f"{bucket}/{f}" for f in subs], retries
117
+
118
+ prefixes, retries = _get_sub_folders(bck=bucket, p=prefix)
119
+ # remove base prefix
120
+ return [p.removeprefix(prefix) for p in prefixes], retries
119
121
 
120
122
  def read_file(self, key: str) -> tuple[bytes, int]:
121
123
  """
@@ -0,0 +1,245 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ import random
14
+ from typing import Any
15
+
16
+ import pyarrow as pa
17
+ from data_processing.utils import KB, MB, get_logger
18
+
19
+
20
+ logger = get_logger(__name__)
21
+
22
+
23
+ class DataAccess:
24
+ """
25
+ Base class for data access (interface), defining all the methods
26
+ """
27
+
28
+ def get_num_samples(self) -> int:
29
+ """
30
+ Get number of samples for input
31
+ :return: Number of samples
32
+ """
33
+ pass
34
+
35
+ def get_output_folder(self) -> str:
36
+ """
37
+ Get output folder as a string
38
+ :return: output_folder
39
+ """
40
+ pass
41
+
42
+ @staticmethod
43
+ def get_random_file_set(n_samples: int, files: list[str]) -> list[str]:
44
+ """
45
+ Get random set of files
46
+ :param n_samples: set size
47
+ :param files: list of original files
48
+ :return: set of randomly selected files
49
+ """
50
+ # Pick files to include
51
+ if len(files) > n_samples:
52
+ # Pick files at random
53
+ files_set = [int(random.random() * len(files)) for _ in range(n_samples)]
54
+ else:
55
+ # use all existing files
56
+ files_set = range(len(files))
57
+ result = [""] * len(files_set)
58
+ index = 0
59
+ for f in files_set:
60
+ result[index] = files[f]
61
+ index += 1
62
+ logger.info(f"Using files {result} to sample data")
63
+ return result
64
+
65
+ def get_files_to_process(self) -> tuple[list[str], dict[str, float], int]:
66
+ """
67
+ Get files to process
68
+ :return: list of files and a dictionary of the files profile:
69
+ "max_file_size_MB",
70
+ "min_file_size_MB",
71
+ "avg_file_size_MB",
72
+ "total_file_size_MB"
73
+ and the number of operation retries.
74
+ Retries are performed on operation failures and are typically due to the resource overload.
75
+ """
76
+ if self.get_output_folder() is None:
77
+ logger.warning("Input/Output are not defined, returning empty list")
78
+ return [], {}, 0
79
+ path_list, path_profile, retries = self.get_files_to_process_internal()
80
+ n_samples = self.get_num_samples()
81
+ if n_samples > 0:
82
+ files = self.get_random_file_set(n_samples=n_samples, files=path_list)
83
+ return files, path_profile, retries
84
+ return path_list, path_profile, retries
85
+
86
+ def get_files_to_process_internal(self) -> tuple[list[str], dict[str, float], int]:
87
+ """
88
+ Get files to process
89
+ :return: list of files and a dictionary of the files profile:
90
+ "max_file_size_MB",
91
+ "min_file_size_MB",
92
+ "avg_file_size_MB",
93
+ "total_file_size_MB"
94
+ and number of operation retries.
95
+ Retries are performed on operation failures and are typically due to the resource overload.
96
+ """
97
+ pass
98
+
99
+ def get_table(self, path: str) -> tuple[pa.table, int]:
100
+ """
101
+ Get pyArrow table for a given path
102
+ :param path - file path
103
+ :return: pyArrow table or None, if the table read failed and number of operation retries.
104
+ Retries are performed on operation failures and are typically due to the resource overload.
105
+ """
106
+ pass
107
+
108
+ def get_file(self, path: str) -> tuple[bytes, int]:
109
+ """
110
+ Get file as a byte array
111
+ :param path: file path
112
+ :return: bytes array of file content and number of operation retries
113
+ Retries are performed on operation failures and are typically due to the resource overload.
114
+
115
+ """
116
+ pass
117
+
118
+ def get_folder_files(
119
+ self, path: str, extensions: list[str] = None, return_data: bool = True
120
+ ) -> tuple[dict[str, bytes], int]:
121
+ """
122
+ Get a list of byte content of files. The path here is an absolute path and can be anywhere.
123
+ :param path: file path
124
+ :param extensions: a list of file extensions to include. If None, then all files from this and
125
+ child ones will be returned
126
+ :param return_data: flag specifying whether the actual content of files is returned (True), or just
127
+ directory is returned (False)
128
+ :return: A dictionary of file names/binary content will be returned
129
+ """
130
+ pass
131
+
132
+ def save_file(self, path: str, data: bytes) -> tuple[dict[str, Any], int]:
133
+ """
134
+ Save byte array to the file
135
+ :param path: file path
136
+ :param data: byte array
137
+ :return: a dictionary as
138
+ defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
139
+ in the case of failure dict is None and number of operation retries
140
+ Retries are performed on operation failures and are typically due to the resource overload.
141
+ """
142
+
143
+ def get_output_location(self, path: str) -> str:
144
+ """
145
+ Get output location based on input
146
+ :param path: input file location
147
+ :return: output file location
148
+ """
149
+ return ""
150
+
151
+ def save_table(self, path: str, table: pa.Table) -> tuple[int, dict[str, Any], int]:
152
+ """
153
+ Save table to a given location
154
+ :param path: location to save table
155
+ :param table: table
156
+ :return: size of table in memory and a dictionary as
157
+ defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
158
+ in the case of failure dict is None and number of operation retries.
159
+ Retries are performed on operation failures and are typically due to the resource overload.
160
+ """
161
+ pass
162
+
163
+ def save_job_metadata(self, metadata: dict[str, Any]) -> tuple[dict[str, Any], int]:
164
+ """
165
+ Save job metadata
166
+ :param metadata: a dictionary, containing the following keys:
167
+ "pipeline",
168
+ "job details",
169
+ "code",
170
+ "job_input_params",
171
+ "execution_stats",
172
+ "job_output_stats"
173
+ two additional elements:
174
+ "source"
175
+ "target"
176
+ are filled bu implementation
177
+ :return: a dictionary as
178
+ defined https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/put_object.html
179
+ in the case of failure dict is None and number of operation retries.
180
+ Retries are performed on operation failures and are typically due to the resource overload.
181
+ """
182
+ pass
183
+
184
+ def sample_input_data(self, n_samples: int = 10) -> tuple[dict[str, Any], int]:
185
+ """
186
+ Sample input data set to get average table size, average doc size, number of docs, etc.
187
+ Note that here we are not reading all of the input documents, but rather randomly pick
188
+ their subset. It gives more precise answer as subset grows, but it takes longer
189
+ :param n_samples: number of samples to use - default 10
190
+ :return: a dictionary of the files profile:
191
+ "max_file_size_MB",
192
+ "min_file_size_MB",
193
+ "avg_file_size_MB",
194
+ "total_file_size_MB"
195
+ average table size MB,
196
+ average doc size KB,
197
+ estimated number of docs
198
+ and number of operation retries
199
+ Retries are performed on operation failures and are typically due to the resource overload.
200
+ """
201
+ # get files to process
202
+ path_list, path_profile, retries = self.get_files_to_process_internal()
203
+ # Pick files to sample
204
+ files = self.get_random_file_set(n_samples=n_samples, files=path_list)
205
+ # Read table and compute number of docs and sizes
206
+ number_of_docs = []
207
+ table_sizes = []
208
+ n_tables = 0
209
+ for f in files:
210
+ table, r = self.get_table(path=f)
211
+ retries += r
212
+ if table is not None:
213
+ n_tables += 1
214
+ number_of_docs.append(table.num_rows)
215
+ # As a table size is mostly document, we can consider them roughly the same
216
+ table_sizes.append(table.nbytes)
217
+ # compute averages
218
+ if n_tables == 0:
219
+ av_number_docs = 0
220
+ av_table_size = 0
221
+ av_doc_size = 0
222
+ else:
223
+ av_number_docs = sum(number_of_docs) / n_tables
224
+ av_table_size = sum(table_sizes) / n_tables / MB
225
+ if av_number_docs == 0:
226
+ av_doc_size = 0
227
+ else:
228
+ av_doc_size = av_table_size * MB / av_number_docs / KB
229
+ logger.info(
230
+ f"average number of docs {av_number_docs}, average table size {av_table_size} MB, "
231
+ f"average doc size {av_doc_size} kB"
232
+ )
233
+
234
+ # compute number of docs
235
+ number_of_docs = av_number_docs * len(path_list)
236
+ logger.info(f"Estimated number of docs {number_of_docs}")
237
+ return (
238
+ path_profile
239
+ | {
240
+ "average table size MB": av_table_size,
241
+ "average doc size KB": av_doc_size,
242
+ "estimated number of docs": number_of_docs,
243
+ },
244
+ retries,
245
+ )