data-prep-toolkit 0.2.1.dev2__tar.gz → 0.2.1.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1.dev3}/PKG-INFO +1 -1
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/pyproject.toml +2 -2
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/SOURCES.txt +1 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/__init__.py +1 -0
- data_prep_toolkit-0.2.1.dev3/src/data_processing/data_access/snapshot_utils.py +31 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/__init__.py +4 -2
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/execution_configuration.py +1 -3
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/runtime_configuration.py +6 -5
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_file_processor.py +5 -7
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_orchestrator.py +53 -30
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/transform_file_processor.py +10 -4
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/abstract_test.py +22 -22
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/launch/transform_test.py +1 -1
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/transform_statistics.py +1 -2
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/Makefile +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/README.md +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/setup.cfg +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/arrow_s3.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_factory.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_local.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_s3.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/execution_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_launcher.py +1 -1
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_runtime.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/runtime_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/transform_launcher.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/noop_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/abstract_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/binary_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/table_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/transform_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/log.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/params_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/pipinstaller.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_configuration.json +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_configurator.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/unrecoverable.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.1.dev2"
+version = "0.2.1.dev3"
 requires-python = ">=3.10"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Library"
@@ -41,7 +41,7 @@ dev = [
 ]

 [options]
-package_dir = ["src"]
+package_dir = ["src","test"]

 [options.packages.find]
 where = ["src/data_processing"]

src/data_prep_toolkit.egg-info/SOURCES.txt
@@ -14,6 +14,7 @@ src/data_processing/data_access/data_access_factory.py
 src/data_processing/data_access/data_access_factory_base.py
 src/data_processing/data_access/data_access_local.py
 src/data_processing/data_access/data_access_s3.py
+src/data_processing/data_access/snapshot_utils.py
 src/data_processing/runtime/__init__.py
 src/data_processing/runtime/execution_configuration.py
 src/data_processing/runtime/runtime_configuration.py

src/data_processing/data_access/__init__.py
@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
 from data_processing.data_access.data_access_s3 import DataAccessS3
 from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
 from data_processing.data_access.data_access_factory import DataAccessFactory
+from data_processing.data_access.snapshot_utils import SnapshotUtils

src/data_processing/data_access/snapshot_utils.py
@@ -0,0 +1,31 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.data_access import DataAccess
+
+
+class SnapshotUtils:
+    """
+    Class implementing support methods for snapshotting
+    """
+
+    @staticmethod
+    def get_snapshot_folder(data_access: DataAccess) -> str:
+        """
+        Get snapshot folder from data access
+        :param data_access: data access class
+        :return: output folder
+        """
+        output_folder = data_access.get_output_folder()
+        if not output_folder.endswith("/"):
+            output_folder += "/"
+        return f"{output_folder}snapshot/"
+
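For context, a minimal usage sketch of the new helper (not part of the diff). The local_config keyword and the folder paths below are assumptions about DataAccessLocal's constructor, not taken from this release; only SnapshotUtils.get_snapshot_folder and get_output_folder are shown in the hunks above.

# Illustrative sketch, not package code; local_config and paths are assumptions.
from data_processing.data_access import DataAccessLocal, SnapshotUtils

data_access = DataAccessLocal(local_config={"input_folder": "/tmp/in", "output_folder": "/tmp/out"})
# get_snapshot_folder() normalizes the trailing slash and appends "snapshot/",
# yielding "/tmp/out/snapshot/" for this configuration.
snapshot_folder = SnapshotUtils.get_snapshot_folder(data_access)
print(snapshot_folder)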

src/data_processing/runtime/pure_python/__init__.py
@@ -1,8 +1,10 @@
 from data_processing.runtime.pure_python.transform_runtime import DefaultPythonTransformRuntime
 from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
 from data_processing.runtime.pure_python.execution_configuration import PythonTransformExecutionConfiguration
-from data_processing.runtime.pure_python.transform_file_processor import (
-
+from data_processing.runtime.pure_python.transform_file_processor import (
+    PythonTransformFileProcessor,
+    PythonPoolTransformFileProcessor,
+)
 from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
 from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
 from data_processing.runtime.pure_python.transform_invoker import invoke_transform, execute_python_transform

src/data_processing/runtime/pure_python/execution_configuration.py
@@ -67,6 +67,4 @@ class PythonTransformExecutionConfiguration(TransformExecutionConfiguration):
         get input parameters for job_input_params in metadata
         :return: dictionary of parameters
         """
-        return {
-            "num_processors": self.num_processors
-        }
+        return {"num_processors": self.num_processors}

src/data_processing/runtime/pure_python/runtime_configuration.py
@@ -11,15 +11,16 @@
 ################################################################################

 from data_processing.runtime import TransformRuntimeConfiguration
-from data_processing.transform import TransformConfiguration
 from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
+from data_processing.transform import TransformConfiguration


 class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
-    def __init__(
-
-
-
+    def __init__(
+        self,
+        transform_config: TransformConfiguration,
+        runtime_class: type[DefaultPythonTransformRuntime] = DefaultPythonTransformRuntime,
+    ):
         """
         Initialization
         :param transform_config - base configuration class

src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -57,10 +57,10 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
     """

     def __init__(
-
-
-
-
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        transform_params: dict[str, Any],
+        transform_class: type[AbstractBinaryTransform],
     ):
         """
         Init method
@@ -104,6 +104,4 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
         """
         for key, val in stats.items():
             # for all key/values
-
-            # for values greater then 0
-            self.stats[key] = self.stats.get(key, 0) + val
+            self.stats[key] = self.stats.get(key, 0) + val

src/data_processing/runtime/pure_python/transform_orchestrator.py
@@ -11,19 +11,19 @@
 ################################################################################

 import time
-from typing import Any
-from multiprocessing import Pool
 import traceback
 from datetime import datetime
+from multiprocessing import Pool
+from typing import Any

 from data_processing.data_access import DataAccessFactoryBase
 from data_processing.runtime.pure_python import (
+    PythonPoolTransformFileProcessor,
     PythonTransformExecutionConfiguration,
-    PythonTransformRuntimeConfiguration,
     PythonTransformFileProcessor,
-
+    PythonTransformRuntimeConfiguration,
 )
-from data_processing.transform import
+from data_processing.transform import AbstractBinaryTransform, TransformStatistics
 from data_processing.utils import get_logger

@@ -69,21 +69,28 @@ def orchestrate(
         logger.debug(f"{runtime_config.get_name()} Begin processing files")
         if execution_config.num_processors > 0:
             # using multiprocessor pool for execution
-            statistics = _process_transforms_multiprocessor(
-
-
-
-
-
-
+            statistics = _process_transforms_multiprocessor(
+                files=files,
+                size=execution_config.num_processors,
+                data_access_factory=data_access_factory,
+                print_interval=print_interval,
+                transform_params=runtime.get_transform_config(
+                    data_access_factory=data_access_factory, statistics=statistics, files=files
+                ),
+                transform_class=runtime_config.get_transform_class(),
+            )
         else:
             # using sequential execution
-            _process_transforms(
-
-
-
-
-
+            _process_transforms(
+                files=files,
+                data_access_factory=data_access_factory,
+                print_interval=print_interval,
+                statistics=statistics,
+                transform_params=runtime.get_transform_config(
+                    data_access_factory=data_access_factory, statistics=statistics, files=files
+                ),
+                transform_class=runtime_config.get_transform_class(),
+            )
         status = "success"
         return_code = 0
     except Exception as e:
@@ -108,8 +115,9 @@ def orchestrate(
                 "status": status,
             },
             "code": execution_config.code_location,
-            "job_input_params":
-
+            "job_input_params": input_params
+            | data_access_factory.get_input_params()
+            | execution_config.get_input_params(),
             "job_output_stats": stats,
         }
         logger.debug(f"Saving job metadata: {metadata}.")
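The new job_input_params value merges three dictionaries with Python's dict union operator (available since 3.9, within this package's >=3.10 requirement). A standalone illustration with throwaway values, independent of this package:

# Plain-Python illustration of the dict union used above; values are invented.
input_params = {"checkpointing": False, "max_files": -1}
data_access_params = {"input_folder": "in", "output_folder": "out"}
execution_params = {"num_processors": 0}

# '|' merges left to right; keys in later operands override earlier ones.
job_input_params = input_params | data_access_params | execution_params
print(job_input_params)
# {'checkpointing': False, 'max_files': -1, 'input_folder': 'in',
#  'output_folder': 'out', 'num_processors': 0}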
@@ -121,9 +129,14 @@ def orchestrate(
         return 1


-def _process_transforms(
-
-
+def _process_transforms(
+    files: list[str],
+    print_interval: int,
+    data_access_factory: DataAccessFactoryBase,
+    statistics: TransformStatistics,
+    transform_params: dict[str, Any],
+    transform_class: type[AbstractBinaryTransform],
+) -> None:
     """
     Process transforms sequentially
     :param files: list of files to process
@@ -137,8 +150,12 @@ def _process_transforms(files: list[str], print_interval: int, data_access_facto
     :return: None
     """
     # create executor
-    executor = PythonTransformFileProcessor(
-
+    executor = PythonTransformFileProcessor(
+        data_access_factory=data_access_factory,
+        statistics=statistics,
+        transform_params=transform_params,
+        transform_class=transform_class,
+    )
     # process data
     t_start = time.time()
     completed = 0
@@ -157,9 +174,14 @@ def _process_transforms(files: list[str], print_interval: int, data_access_facto
         logger.info(f"done flushing in {round(time.time() - start, 3)} sec")


-def _process_transforms_multiprocessor(
-
-
+def _process_transforms_multiprocessor(
+    files: list[str],
+    size: int,
+    print_interval: int,
+    data_access_factory: DataAccessFactoryBase,
+    transform_params: dict[str, Any],
+    transform_class: type[AbstractBinaryTransform],
+) -> TransformStatistics:
     """
     Process transforms using multiprocessing pool
     :param files: list of files to process
@@ -173,8 +195,9 @@ def _process_transforms_multiprocessor(files: list[str], size: int, print_interv
     # result statistics
     statistics = TransformStatistics()
     # create processor
-    processor = PythonPoolTransformFileProcessor(
-
+    processor = PythonPoolTransformFileProcessor(
+        data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class
+    )
     completed = 0
     t_start = time.time()
     # create multiprocessing pool
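For readers unfamiliar with the pool pattern the orchestrator now follows, here is a generic, self-contained sketch (not the package's implementation): a picklable processor object is mapped over file names and per-file stats are merged afterwards. All names below are invented for illustration.

# Generic sketch of pool-based processing, not package code.
from multiprocessing import Pool


class FileProcessor:
    def __init__(self, params: dict):
        self.params = params

    def __call__(self, f_name: str) -> dict:
        # a real processor would read, transform, and write the file here
        return {"source_files": 1, "source_size": len(f_name)}


if __name__ == "__main__":
    files = ["a.parquet", "b.parquet", "c.parquet"]
    processor = FileProcessor(params={"print_interval": 10})
    stats: dict[str, int] = {}
    with Pool(processes=2) as pool:
        # imap_unordered yields per-file results as workers finish
        for result in pool.imap_unordered(processor, files):
            for key, val in result.items():
                stats[key] = stats.get(key, 0) + val
    print(stats)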

src/data_processing/runtime/transform_file_processor.py
@@ -23,9 +23,9 @@ class AbstractTransformFileProcessor:
     """

     def __init__(
-
-
-
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        transform_parameters: dict[str, Any],
     ):
         """
         Init method
@@ -85,7 +85,7 @@ class AbstractTransformFileProcessor:
                 raise UnrecoverableException
             # Process other exceptions
             except Exception as e:
-                self.logger.warning(f"Exception
+                self.logger.warning(f"Exception processing file {f_name}: {traceback.format_exc()}")
                 self._publish_stats({"transform execution exception": 1})

     def flush(self) -> None:
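The rewritten warning folds the full traceback into the log line. A standalone example of that standard-library idiom, unrelated to this package's logger setup; the file name is a placeholder:

# Standalone illustration of logging a full traceback with the standard library.
import logging
import traceback

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

try:
    1 / 0
except Exception:
    # format_exc() captures the traceback of the exception being handled
    logger.warning(f"Exception processing file sample.parquet: {traceback.format_exc()}")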
@@ -133,6 +133,12 @@ class AbstractTransformFileProcessor:
                     self.logger.debug(
                         f"Transform did not produce a transformed file for " f"file {self.last_file_name}.parquet"
                     )
+                    self._publish_stats(
+                        {
+                            "result_files": len(out_files),
+                            "processing_time": time.time() - t_start,
+                        }
+                    )
                 case 1:
                     # we have exactly 1 output file
                     file_ext = out_files[0]

src/data_processing/test_support/abstract_test.py
@@ -75,8 +75,8 @@ class AbstractTest:
     def _install_test_fixtures(self, metafunc):
         raise NotImplemented("Sub-class must implemented this to install the fixtures for its tests.")

-    @staticmethod
-    def validate_expected_tables(table_list: list[pa.Table], expected_table_list: list[pa.Table]):
+    @classmethod
+    def validate_expected_tables(cls, table_list: list[pa.Table], expected_table_list: list[pa.Table]):
         """
         Verify with assertion messages that the two lists of Tables are equivalent.
         :param table_list:
@@ -100,10 +100,10 @@ class AbstractTest:
             r1 = t1.take([j])
             r2 = t2.take([j])
             # assert r1 == r2, f"Row {j} of table {i} are not equal\n\tTransformed: {r1}\n\tExpected : {r2}"
-
+            cls.validate_expected_row(i, j, r1, r2)

-    @staticmethod
-    def validate_expected_row(table_index: int, row_index: int, test_row: pa.Table, expected_row: pa.Table):
+    @classmethod
+    def validate_expected_row(cls, table_index: int, row_index: int, test_row: pa.Table, expected_row: pa.Table):
         """
         Compare the two rows for equality, allowing float values to be within a percentage
         of each other as defined by global _allowed_float_percent_diff.
@@ -139,8 +139,8 @@ class AbstractTest:
             diff = abs(test_value - expected_value)
             assert diff <= allowed_diff, msg

-    @staticmethod
-    def validate_expected_files(files_list: list[tuple[bytes, str]], expected_files_list: list[tuple[bytes, str]]):
+    @classmethod
+    def validate_expected_files(cls, files_list: list[tuple[bytes, str]], expected_files_list: list[tuple[bytes, str]]):
         """
         Verify with assertion messages that the two lists of Tables are equivalent.
         :param files_list:
@@ -171,15 +171,15 @@ class AbstractTest:
             diff <= diff_allowed
         ), f"produced file length {lenf1} vs expected {lenf2}, exceeds allowance of {diff_allowed}"

-    @staticmethod
-    def validate_expected_metadata_lists(metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]]):
+    @classmethod
+    def validate_expected_metadata_lists(cls, metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]]):
         elen = len(expected_metadata)
         assert len(metadata) == elen, f"Number of metadata dictionaries not the expected of {elen}"
         for index in range(elen):
-
+            cls.validate_expected_metadata(metadata[index], expected_metadata[index])

-    @staticmethod
-    def validate_expected_metadata(metadata: dict[str, float], expected_metadata: dict[str, float]):
+    @classmethod
+    def validate_expected_metadata(cls, metadata: dict[str, float], expected_metadata: dict[str, float]):
         """
         Verify with assertion messages that the two dictionaries are as expected.
         :param metadata:
@@ -194,8 +194,8 @@ class AbstractTest:
             f"Metadata not equal\n" "\tTransformed: {metadata} Expected : {expected_metadata}"
         )

-    @staticmethod
-    def validate_directory_contents(directory: str, expected_dir: str, drop_columns: list[str] = []):
+    @classmethod
+    def validate_directory_contents(cls, directory: str, expected_dir: str, drop_columns: list[str] = []):
         """
         Make sure the directory contents are the same.
         :param directory:
@@ -217,28 +217,28 @@ class AbstractTest:
         expected_diffs = 0
         failed = len(dir_cmp.diff_files) != expected_diffs
         if failed:
-
+            cls.__confirm_diffs(directory, expected_dir, dir_cmp.diff_files, "/tmp", drop_columns)

         # Traverse into the subdirs since dircmp doesn't seem to do that.
         subdirs = [f.name for f in os.scandir(expected_dir) if f.is_dir()]
         for subdir in subdirs:
             d1 = os.path.join(directory, subdir)
             d2 = os.path.join(expected_dir, subdir)
-
+            cls.validate_directory_contents(d1, d2, drop_columns)

-    @staticmethod
-    def _validate_table_files(parquet1: str, parquet2: str, drop_columns: list[str] = []):
+    @classmethod
+    def _validate_table_files(cls, parquet1: str, parquet2: str, drop_columns: list[str] = []):
         da = DataAccessLocal()
         t1, _ = da.get_table(parquet1)
         t2, _ = da.get_table(parquet2)
         if len(drop_columns) > 0:
             t1 = t1.drop_columns(drop_columns)
             t2 = t2.drop_columns(drop_columns)
-
+        cls.validate_expected_tables([t1], [t2])

-    @staticmethod
+    @classmethod
     def __confirm_diffs(
-        src_dir: str, expected_dir: str, diff_files: list, dest_dir: str, drop_columns: list[str] = []
+        cls, src_dir: str, expected_dir: str, diff_files: list, dest_dir: str, drop_columns: list[str] = []
     ):
         """
         Copy all files from the source dir to the dest dir.
@@ -256,7 +256,7 @@ class AbstractTest:
         # It seems file can be different on disk, but contain the same column/row values.
         # so for these, do the inmemory comparison.
         try:
-
+            cls._validate_table_files(expected, src, drop_columns)
         except AssertionError as e:
             logger.info(f"Copying file with difference: {src} to {dest}")
             shutil.copyfile(src, dest)
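The switch from @staticmethod to @classmethod matters because the validators now call each other through cls, so a subclass can override a single step (for example row comparison) and the inherited entry points will pick it up. A toy illustration of that dispatch, outside this package; all names are invented:

# Toy illustration of why classmethod dispatch matters here; not package code.
class Base:
    @classmethod
    def validate_all(cls, rows):
        for row in rows:
            cls.validate_row(row)  # resolves against the calling subclass

    @classmethod
    def validate_row(cls, row):
        print(f"Base check: {row}")


class SparkStyle(Base):
    @classmethod
    def validate_row(cls, row):
        print(f"Relaxed check: {row}")


SparkStyle.validate_all(["r1", "r2"])  # uses SparkStyle.validate_row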

src/data_processing/test_support/launch/transform_test.py
@@ -65,7 +65,7 @@ class AbstractTransformLauncherTest(AbstractTest):
         Confirm that the two directories contains the same files.
         Stubbed out like this to allow spark tests to override this since spark tends to rename the files.
         """
-
+        self.validate_directory_contents(dir, expected, ignore_columns)

     def _install_test_fixtures(self, metafunc):
         # Apply the fixtures for the method with these input names (i.e. test_transform()).

src/data_processing/transform/transform_statistics.py
@@ -32,8 +32,7 @@ class TransformStatistics:
         :return: None
         """
         for key, val in stats.items():
-
-            self.stats[key] = self.stats.get(key, 0) + val
+            self.stats[key] = self.stats.get(key, 0) + val

     def get_execution_stats(self) -> dict[str, Any]:
         """
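With the stale comment and blank line gone, a single statement accumulates every key. Its behavior over repeated updates, shown with throwaway values outside the package:

# Standalone illustration of the accumulation pattern; values are invented.
stats: dict[str, float] = {}
for update in ({"source_files": 1, "processing_time": 0.5},
               {"source_files": 1, "processing_time": 0.25}):
    for key, val in update.items():
        stats[key] = stats.get(key, 0) + val
print(stats)  # {'source_files': 2, 'processing_time': 0.75}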

src/data_processing/runtime/pure_python/transform_launcher.py
@@ -15,8 +15,8 @@ import time

 from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
 from data_processing.runtime.pure_python import (
-    PythonTransformRuntimeConfiguration,
     PythonTransformExecutionConfiguration,
+    PythonTransformRuntimeConfiguration,
     orchestrate,
 )
 from data_processing.runtime.transform_launcher import AbstractTransformLauncher