data-prep-toolkit 0.2.1.dev1__tar.gz → 0.2.1.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.1.dev1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1.dev2}/PKG-INFO +1 -1
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/pyproject.toml +1 -1
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt +3 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/__init__.py +4 -1
- data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/execution_configuration.py +72 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/runtime_configuration.py +14 -2
- data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_file_processor.py +109 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/transform_launcher.py +3 -3
- data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_orchestrator.py +201 -0
- data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_runtime.py +53 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/transform_file_processor.py +21 -6
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/log.py +7 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -5
- data_prep_toolkit-0.2.1.dev2/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +37 -0
- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_file_processor.py +0 -51
- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -116
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/Makefile +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/README.md +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/setup.cfg +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/arrow_s3.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_factory.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_factory_base.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_local.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_s3.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/execution_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/runtime_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/transform_launcher.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/abstract_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/data_access/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/launch/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/launch/transform_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/noop_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/abstract_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/binary_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/table_transform.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/transform_configuration.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/transform_statistics.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/__init__.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/cli_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/config.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/params_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/pipinstaller.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_configuration.json +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_configurator.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_utils.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/unrecoverable.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/transform/test_noop.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/util/transform_utils_test.py +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
- {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "data_prep_toolkit"
|
|
3
|
-
version = "0.2.1.dev1"
|
|
3
|
+
version = "0.2.1.dev2"
|
|
4
4
|
requires-python = ">=3.10"
|
|
5
5
|
keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
|
|
6
6
|
description = "Data Preparation Toolkit Library"
|
|
@@ -20,11 +20,13 @@ src/data_processing/runtime/runtime_configuration.py
|
|
|
20
20
|
src/data_processing/runtime/transform_file_processor.py
|
|
21
21
|
src/data_processing/runtime/transform_launcher.py
|
|
22
22
|
src/data_processing/runtime/pure_python/__init__.py
|
|
23
|
+
src/data_processing/runtime/pure_python/execution_configuration.py
|
|
23
24
|
src/data_processing/runtime/pure_python/runtime_configuration.py
|
|
24
25
|
src/data_processing/runtime/pure_python/transform_file_processor.py
|
|
25
26
|
src/data_processing/runtime/pure_python/transform_invoker.py
|
|
26
27
|
src/data_processing/runtime/pure_python/transform_launcher.py
|
|
27
28
|
src/data_processing/runtime/pure_python/transform_orchestrator.py
|
|
29
|
+
src/data_processing/runtime/pure_python/transform_runtime.py
|
|
28
30
|
src/data_processing/test_support/__init__.py
|
|
29
31
|
src/data_processing/test_support/abstract_test.py
|
|
30
32
|
src/data_processing/test_support/data_access/__init__.py
|
|
@@ -74,5 +76,6 @@ test/data_processing_tests/invoker/python_invoker_test.py
|
|
|
74
76
|
test/data_processing_tests/launch/pure_python/launcher_test.py
|
|
75
77
|
test/data_processing_tests/launch/pure_python/multi_launcher_test.py
|
|
76
78
|
test/data_processing_tests/launch/pure_python/test_noop_launch.py
|
|
79
|
+
test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
|
|
77
80
|
test/data_processing_tests/transform/test_noop.py
|
|
78
81
|
test/data_processing_tests/util/transform_utils_test.py
|
|
@@ -1,5 +1,8 @@
|
|
|
1
|
+
from data_processing.runtime.pure_python.transform_runtime import DefaultPythonTransformRuntime
|
|
1
2
|
from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
|
|
2
|
-
from data_processing.runtime.pure_python.transform_file_processor import PythonTransformFileProcessor
|
|
3
|
+
from data_processing.runtime.pure_python.execution_configuration import PythonTransformExecutionConfiguration
|
|
4
|
+
from data_processing.runtime.pure_python.transform_file_processor import (PythonTransformFileProcessor,
|
|
5
|
+
PythonPoolTransformFileProcessor)
|
|
3
6
|
from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
|
|
4
7
|
from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
|
|
5
8
|
from data_processing.runtime.pure_python.transform_invoker import invoke_transform, execute_python_transform
|
data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/execution_configuration.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from data_processing.runtime import TransformExecutionConfiguration
|
|
17
|
+
from data_processing.utils import CLIArgumentProvider, get_logger
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
cli_prefix = "runtime_"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PythonTransformExecutionConfiguration(TransformExecutionConfiguration):
|
|
27
|
+
"""
|
|
28
|
+
A class specifying and validating Python orchestrator configuration
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
def __init__(self, name: str):
|
|
32
|
+
"""
|
|
33
|
+
Initialization
|
|
34
|
+
"""
|
|
35
|
+
super().__init__(name=name, print_params=False)
|
|
36
|
+
self.num_processors = 0
|
|
37
|
+
|
|
38
|
+
def add_input_params(self, parser: argparse.ArgumentParser) -> None:
|
|
39
|
+
"""
|
|
40
|
+
This method adds transformer specific parameter to parser
|
|
41
|
+
:param parser: parser
|
|
42
|
+
:return:
|
|
43
|
+
"""
|
|
44
|
+
parser.add_argument(f"--{cli_prefix}num_processors", type=int, default=0, help="size of multiprocessing pool")
|
|
45
|
+
|
|
46
|
+
return TransformExecutionConfiguration.add_input_params(self, parser=parser)
|
|
47
|
+
|
|
48
|
+
def apply_input_params(self, args: argparse.Namespace) -> bool:
|
|
49
|
+
"""
|
|
50
|
+
Validate transformer specific parameters
|
|
51
|
+
:param args: user defined arguments
|
|
52
|
+
:return: True, if validate pass or False otherwise
|
|
53
|
+
"""
|
|
54
|
+
if not TransformExecutionConfiguration.apply_input_params(self, args=args):
|
|
55
|
+
return False
|
|
56
|
+
captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
|
|
57
|
+
# store parameters locally
|
|
58
|
+
self.num_processors = captured["num_processors"]
|
|
59
|
+
# print them
|
|
60
|
+
if self.num_processors > 0:
|
|
61
|
+
# we are using multiprocessing
|
|
62
|
+
logger.info(f"using multiprocessing, num processors {self.num_processors}")
|
|
63
|
+
return True
|
|
64
|
+
|
|
65
|
+
def get_input_params(self) -> dict[str, Any]:
|
|
66
|
+
"""
|
|
67
|
+
get input parameters for job_input_params in metadata
|
|
68
|
+
:return: dictionary of parameters
|
|
69
|
+
"""
|
|
70
|
+
return {
|
|
71
|
+
"num_processors": self.num_processors
|
|
72
|
+
}
|
|
@@ -12,13 +12,25 @@
|
|
|
12
12
|
|
|
13
13
|
from data_processing.runtime import TransformRuntimeConfiguration
|
|
14
14
|
from data_processing.transform import TransformConfiguration
|
|
15
|
+
from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
|
|
18
|
-
def __init__(self, transform_config: TransformConfiguration):
|
|
19
|
+
def __init__(self,
|
|
20
|
+
transform_config: TransformConfiguration,
|
|
21
|
+
runtime_class: type[DefaultPythonTransformRuntime] = DefaultPythonTransformRuntime,
|
|
22
|
+
):
|
|
19
23
|
"""
|
|
20
24
|
Initialization
|
|
21
25
|
:param transform_config - base configuration class
|
|
26
|
+
:param runtime_class: implementation of the transform runtime
|
|
22
27
|
"""
|
|
23
|
-
self.
|
|
28
|
+
self.runtime_class = runtime_class
|
|
24
29
|
super().__init__(transform_config=transform_config)
|
|
30
|
+
|
|
31
|
+
def create_transform_runtime(self) -> DefaultPythonTransformRuntime:
|
|
32
|
+
"""
|
|
33
|
+
Create transform runtime with the parameters captured during apply_input_params()
|
|
34
|
+
:return: transform runtime object
|
|
35
|
+
"""
|
|
36
|
+
return self.runtime_class(self.transform_config.get_transform_params())
|
data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_file_processor.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
+
from data_processing.runtime import AbstractTransformFileProcessor
|
|
17
|
+
from data_processing.transform import AbstractBinaryTransform, TransformStatistics
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PythonTransformFileProcessor(AbstractTransformFileProcessor):
|
|
21
|
+
"""
|
|
22
|
+
This is the class implementing the worker class processing of a single file
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
data_access_factory: DataAccessFactoryBase,
|
|
28
|
+
statistics: TransformStatistics,
|
|
29
|
+
transform_params: dict[str, Any],
|
|
30
|
+
transform_class: type[AbstractBinaryTransform],
|
|
31
|
+
):
|
|
32
|
+
"""
|
|
33
|
+
Init method
|
|
34
|
+
:param data_access_factory - data access factory
|
|
35
|
+
:param statistics - reference to statistics class
|
|
36
|
+
:param transform_params - transform parameters
|
|
37
|
+
:param transform_class: transform class
|
|
38
|
+
"""
|
|
39
|
+
# invoke superclass
|
|
40
|
+
super().__init__(
|
|
41
|
+
data_access_factory=data_access_factory,
|
|
42
|
+
transform_parameters=dict(transform_params),
|
|
43
|
+
)
|
|
44
|
+
self.transform_params["statistics"] = statistics
|
|
45
|
+
# Create local processor
|
|
46
|
+
self.transform = transform_class(self.transform_params)
|
|
47
|
+
# Create statistics
|
|
48
|
+
self.stats = statistics
|
|
49
|
+
|
|
50
|
+
def _publish_stats(self, stats: dict[str, Any]) -> None:
|
|
51
|
+
self.stats.add_stats(stats)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
|
|
55
|
+
"""
|
|
56
|
+
This is the class implementing the worker class processing of a single file
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
def __init__(
|
|
60
|
+
self,
|
|
61
|
+
data_access_factory: DataAccessFactoryBase,
|
|
62
|
+
transform_params: dict[str, Any],
|
|
63
|
+
transform_class: type[AbstractBinaryTransform],
|
|
64
|
+
):
|
|
65
|
+
"""
|
|
66
|
+
Init method
|
|
67
|
+
:param data_access_factory - data access factory
|
|
68
|
+
:param transform_params - transform parameters
|
|
69
|
+
:param transform_class: transform class
|
|
70
|
+
"""
|
|
71
|
+
super().__init__(
|
|
72
|
+
data_access_factory=data_access_factory,
|
|
73
|
+
transform_parameters=dict(transform_params),
|
|
74
|
+
)
|
|
75
|
+
# Add data access and statistics to the processor parameters
|
|
76
|
+
self.transform_params["data_access"] = self.data_access
|
|
77
|
+
self.transform_class = transform_class
|
|
78
|
+
self.transform = None
|
|
79
|
+
|
|
80
|
+
def process_file(self, f_name: str) -> dict[str, Any]:
|
|
81
|
+
# re initialize statistics
|
|
82
|
+
self.stats = {}
|
|
83
|
+
if self.transform is None:
|
|
84
|
+
# create transform. Make sure to do this locally
|
|
85
|
+
self.transform = self.transform_class(self.transform_params)
|
|
86
|
+
# Invoke superclass method
|
|
87
|
+
super().process_file(f_name=f_name)
|
|
88
|
+
# return collected statistics
|
|
89
|
+
return self.stats
|
|
90
|
+
|
|
91
|
+
def flush(self) -> dict[str, Any]:
|
|
92
|
+
# re initialize statistics
|
|
93
|
+
self.stats = {}
|
|
94
|
+
# Invoke superclass method
|
|
95
|
+
super().flush()
|
|
96
|
+
# return collected statistics
|
|
97
|
+
return self.stats
|
|
98
|
+
|
|
99
|
+
def _publish_stats(self, stats: dict[str, Any]) -> None:
|
|
100
|
+
"""
|
|
101
|
+
Publish statistics (to the local dictionary)
|
|
102
|
+
:param stats: statistics dictionary
|
|
103
|
+
:return: None
|
|
104
|
+
"""
|
|
105
|
+
for key, val in stats.items():
|
|
106
|
+
# for all key/values
|
|
107
|
+
if val > 0:
|
|
108
|
+
# for values greater then 0
|
|
109
|
+
self.stats[key] = self.stats.get(key, 0) + val
|
|
@@ -14,9 +14,9 @@ import argparse
|
|
|
14
14
|
import time
|
|
15
15
|
|
|
16
16
|
from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
|
|
17
|
-
from data_processing.runtime import TransformExecutionConfiguration
|
|
18
17
|
from data_processing.runtime.pure_python import (
|
|
19
18
|
PythonTransformRuntimeConfiguration,
|
|
19
|
+
PythonTransformExecutionConfiguration,
|
|
20
20
|
orchestrate,
|
|
21
21
|
)
|
|
22
22
|
from data_processing.runtime.transform_launcher import AbstractTransformLauncher
|
|
@@ -42,7 +42,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
42
42
|
:param data_access_factory: the factory to create DataAccess instances.
|
|
43
43
|
"""
|
|
44
44
|
super().__init__(runtime_config, data_access_factory)
|
|
45
|
-
self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
|
|
45
|
+
self.execution_config = PythonTransformExecutionConfiguration(name=runtime_config.get_name())
|
|
46
46
|
|
|
47
47
|
def __get_parameters(self) -> bool:
|
|
48
48
|
"""
|
|
@@ -85,7 +85,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
|
|
|
85
85
|
except Exception as e:
|
|
86
86
|
logger.info(f"Exception running orchestration\n{e}")
|
|
87
87
|
finally:
|
|
88
|
-
logger.info(f"Completed execution in {(time.time() - start)/60
|
|
88
|
+
logger.info(f"Completed execution in {round((time.time() - start)/60., 3)} min, execution result {res}")
|
|
89
89
|
return res
|
|
90
90
|
|
|
91
91
|
def launch(self) -> int:
|
data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_orchestrator.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import time
|
|
14
|
+
from typing import Any
|
|
15
|
+
from multiprocessing import Pool
|
|
16
|
+
import traceback
|
|
17
|
+
from datetime import datetime
|
|
18
|
+
|
|
19
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
20
|
+
from data_processing.runtime.pure_python import (
|
|
21
|
+
PythonTransformExecutionConfiguration,
|
|
22
|
+
PythonTransformRuntimeConfiguration,
|
|
23
|
+
PythonTransformFileProcessor,
|
|
24
|
+
PythonPoolTransformFileProcessor,
|
|
25
|
+
)
|
|
26
|
+
from data_processing.transform import TransformStatistics, AbstractBinaryTransform
|
|
27
|
+
from data_processing.utils import get_logger
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
logger = get_logger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def orchestrate(
|
|
34
|
+
data_access_factory: DataAccessFactoryBase,
|
|
35
|
+
runtime_config: PythonTransformRuntimeConfiguration,
|
|
36
|
+
execution_config: PythonTransformExecutionConfiguration,
|
|
37
|
+
) -> int:
|
|
38
|
+
"""
|
|
39
|
+
orchestrator for transformer execution
|
|
40
|
+
:param data_access_factory: data access factory
|
|
41
|
+
:param runtime_config: transformer configuration
|
|
42
|
+
:param execution_config: execution configuration
|
|
43
|
+
:return: 0 - success or 1 - failure
|
|
44
|
+
"""
|
|
45
|
+
start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
46
|
+
logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
|
|
47
|
+
# create statistics
|
|
48
|
+
statistics = TransformStatistics()
|
|
49
|
+
# create data access
|
|
50
|
+
data_access = data_access_factory.create_data_access()
|
|
51
|
+
if data_access is None:
|
|
52
|
+
logger.error("No DataAccess instance provided - exiting")
|
|
53
|
+
return 1
|
|
54
|
+
# create additional execution parameters
|
|
55
|
+
runtime = runtime_config.create_transform_runtime()
|
|
56
|
+
try:
|
|
57
|
+
# Get files to process
|
|
58
|
+
files, profile, retries = data_access.get_files_to_process()
|
|
59
|
+
if len(files) == 0:
|
|
60
|
+
logger.error("No input files to process - exiting")
|
|
61
|
+
return 0
|
|
62
|
+
if retries > 0:
|
|
63
|
+
statistics.add_stats({"data access retries": retries})
|
|
64
|
+
logger.info(f"Number of files is {len(files)}, source profile {profile}")
|
|
65
|
+
# Print interval
|
|
66
|
+
print_interval = int(len(files) / 100)
|
|
67
|
+
if print_interval == 0:
|
|
68
|
+
print_interval = 1
|
|
69
|
+
logger.debug(f"{runtime_config.get_name()} Begin processing files")
|
|
70
|
+
if execution_config.num_processors > 0:
|
|
71
|
+
# using multiprocessor pool for execution
|
|
72
|
+
statistics = _process_transforms_multiprocessor(files=files, size=execution_config.num_processors,
|
|
73
|
+
data_access_factory=data_access_factory,
|
|
74
|
+
print_interval=print_interval,
|
|
75
|
+
transform_params=runtime.get_transform_config(
|
|
76
|
+
data_access_factory=data_access_factory,
|
|
77
|
+
statistics=statistics, files=files),
|
|
78
|
+
transform_class=runtime_config.get_transform_class())
|
|
79
|
+
else:
|
|
80
|
+
# using sequential execution
|
|
81
|
+
_process_transforms(files=files, data_access_factory=data_access_factory,
|
|
82
|
+
print_interval=print_interval, statistics=statistics,
|
|
83
|
+
transform_params=runtime.get_transform_config(
|
|
84
|
+
data_access_factory=data_access_factory,
|
|
85
|
+
statistics=statistics, files=files),
|
|
86
|
+
transform_class=runtime_config.get_transform_class())
|
|
87
|
+
status = "success"
|
|
88
|
+
return_code = 0
|
|
89
|
+
except Exception as e:
|
|
90
|
+
logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
|
|
91
|
+
return_code = 1
|
|
92
|
+
status = "failure"
|
|
93
|
+
try:
|
|
94
|
+
# Compute execution statistics
|
|
95
|
+
logger.debug("Computing execution stats")
|
|
96
|
+
stats = statistics.get_execution_stats()
|
|
97
|
+
stats["processing_time"] = round(stats["processing_time"], 3)
|
|
98
|
+
# build and save metadata
|
|
99
|
+
logger.debug("Building job metadata")
|
|
100
|
+
input_params = runtime_config.get_transform_metadata()
|
|
101
|
+
runtime.compute_execution_stats(stats=statistics)
|
|
102
|
+
metadata = {
|
|
103
|
+
"pipeline": execution_config.pipeline_id,
|
|
104
|
+
"job details": execution_config.job_details
|
|
105
|
+
| {
|
|
106
|
+
"start_time": start_ts,
|
|
107
|
+
"end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
|
108
|
+
"status": status,
|
|
109
|
+
},
|
|
110
|
+
"code": execution_config.code_location,
|
|
111
|
+
"job_input_params":
|
|
112
|
+
input_params | data_access_factory.get_input_params() | execution_config.get_input_params(),
|
|
113
|
+
"job_output_stats": stats,
|
|
114
|
+
}
|
|
115
|
+
logger.debug(f"Saving job metadata: {metadata}.")
|
|
116
|
+
data_access.save_job_metadata(metadata)
|
|
117
|
+
logger.debug("Saved job metadata.")
|
|
118
|
+
return return_code
|
|
119
|
+
except Exception as e:
|
|
120
|
+
logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
|
|
121
|
+
return 1
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _process_transforms(files: list[str], print_interval: int, data_access_factory: DataAccessFactoryBase,
|
|
125
|
+
statistics: TransformStatistics, transform_params: dict[str, Any],
|
|
126
|
+
transform_class: type[AbstractBinaryTransform]) -> None:
|
|
127
|
+
"""
|
|
128
|
+
Process transforms sequentially
|
|
129
|
+
:param files: list of files to process
|
|
130
|
+
:param statistics: statistics class
|
|
131
|
+
:param print_interval: print interval
|
|
132
|
+
:param data_access_factory: data access factory
|
|
133
|
+
:param transform_params - transform parameters
|
|
134
|
+
:param transform_class: transform class
|
|
135
|
+
:return: metadata for the execution
|
|
136
|
+
|
|
137
|
+
:return: None
|
|
138
|
+
"""
|
|
139
|
+
# create executor
|
|
140
|
+
executor = PythonTransformFileProcessor(data_access_factory=data_access_factory, statistics=statistics,
|
|
141
|
+
transform_params=transform_params, transform_class=transform_class)
|
|
142
|
+
# process data
|
|
143
|
+
t_start = time.time()
|
|
144
|
+
completed = 0
|
|
145
|
+
for path in files:
|
|
146
|
+
executor.process_file(path)
|
|
147
|
+
completed += 1
|
|
148
|
+
if completed % print_interval == 0:
|
|
149
|
+
logger.info(
|
|
150
|
+
f"Completed {completed} files ({round(100 * completed / len(files), 2)}%) "
|
|
151
|
+
f"in {round((time.time() - t_start)/60., 3)} min"
|
|
152
|
+
)
|
|
153
|
+
logger.info(f"Done processing {completed} files, waiting for flush() completion.")
|
|
154
|
+
# invoke flush to ensure that all results are returned
|
|
155
|
+
start = time.time()
|
|
156
|
+
executor.flush()
|
|
157
|
+
logger.info(f"done flushing in {round(time.time() - start, 3)} sec")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _process_transforms_multiprocessor(files: list[str], size: int, print_interval: int,
                                       data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any],
                                       transform_class: type[AbstractBinaryTransform]) -> TransformStatistics:
    """
    Process transforms using multiprocessing pool
    :param files: list of files to process
    :param size: pool size
    :param print_interval: number of completed files between two progress log messages
    :param data_access_factory: data access factory
    :param transform_params: transform parameters
    :param transform_class: transform class
    :return: statistics accumulated from the results of all process_file and flush calls
    """
    # result statistics
    statistics = TransformStatistics()
    # create processor
    processor = PythonPoolTransformFileProcessor(data_access_factory=data_access_factory,
                                                 transform_params=transform_params, transform_class=transform_class)
    completed = 0
    t_start = time.time()
    # create multiprocessing pool
    with Pool(processes=size) as pool:
        # execute for every input file; results arrive in completion order
        for result in pool.imap_unordered(processor.process_file, files):
            completed += 1
            # accumulate statistics
            statistics.add_stats(result)
            if completed % print_interval == 0:
                # print intermediate statistics
                logger.info(
                    f"Completed {completed} files ({round(100 * completed / len(files), 2)}%) "
                    f"in {round((time.time() - t_start)/60., 3)} min"
                )
        logger.info(f"Done processing {completed} files, waiting for flush() completion.")
        # time the flush separately, so the message below does not include the file processing time
        flush_start = time.time()
        # Submit `size` flush calls; the pool decides which worker runs each of them.
        # This has to happen inside the `with` block: leaving it terminates the pool.
        pending = [pool.apply_async(processor.flush) for _ in range(size)]
        for flush_result in pending:
            statistics.add_stats(flush_result.get())
    logger.info(f"done flushing in {round(time.time() - flush_start, 3)} sec")
    return statistics
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
+
from data_processing.transform import TransformStatistics
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DefaultPythonTransformRuntime:
    """
    Transformer runtime used by processor to create Transform specific environment
    """

    def __init__(self, params: dict[str, Any]):
        """
        Create/config this runtime.
        :param params: parameters, often provided by the CLI arguments as defined by the transform configuration.
        """
        self.params = params

    def get_transform_config(
        self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
    ) -> dict[str, Any]:
        """
        Get the dictionary of configuration that will be provided to the transform's initializer.
        This is the opportunity for a runtime to create a new set of configuration based on the
        config/params provided to this instance's initializer. This default implementation ignores
        its arguments and returns the params given to the initializer unchanged (the same dict object).
        :param data_access_factory: data access factory class being used by the orchestrator.
        :param statistics: reference to the statistics object
        :param files: list of files to process
        :return: dictionary of transform init params
        """
        return self.params

    def compute_execution_stats(self, stats: TransformStatistics) -> TransformStatistics:
        """
        Update/augment the given statistics object with runtime-specific additions/modifications.
        This default implementation makes no changes.
        :param stats: output of statistics as aggregated across all calls to all transforms.
        :return: the statistics object that was passed in
        """
        return stats
|
|
@@ -13,6 +13,7 @@ import time
|
|
|
13
13
|
import traceback
|
|
14
14
|
from typing import Any
|
|
15
15
|
|
|
16
|
+
from data_processing.data_access import DataAccessFactoryBase
|
|
16
17
|
from data_processing.utils import TransformUtils, UnrecoverableException, get_logger
|
|
17
18
|
|
|
18
19
|
|
|
@@ -21,17 +22,30 @@ class AbstractTransformFileProcessor:
|
|
|
21
22
|
This is the base class implementing processing of a single binary file
|
|
22
23
|
"""
|
|
23
24
|
|
|
24
|
-
def __init__(
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
data_access_factory: DataAccessFactoryBase,
|
|
28
|
+
transform_parameters: dict[str, Any],
|
|
29
|
+
):
|
|
25
30
|
"""
|
|
26
31
|
Init method
|
|
32
|
+
:param data_access_factory: Data Access Factory
|
|
33
|
+
:param transform_parameters: Transform parameters
|
|
27
34
|
"""
|
|
28
|
-
self.
|
|
35
|
+
self.logger = get_logger(__name__)
|
|
36
|
+
# validate parameters
|
|
37
|
+
if data_access_factory is None:
|
|
38
|
+
self.logger.error("Transform file processor: data access factory is not specified")
|
|
39
|
+
raise UnrecoverableException("data access factory is None")
|
|
29
40
|
self.transform = None
|
|
30
41
|
self.stats = None
|
|
31
42
|
self.last_file_name = None
|
|
32
43
|
self.last_extension = None
|
|
33
44
|
self.last_file_name_next_index = None
|
|
34
|
-
self.
|
|
45
|
+
self.data_access = data_access_factory.create_data_access()
|
|
46
|
+
# Add data access and statistics to the processor parameters
|
|
47
|
+
self.transform_params = transform_parameters
|
|
48
|
+
self.transform_params["data_access"] = self.data_access
|
|
35
49
|
|
|
36
50
|
def process_file(self, f_name: str) -> None:
|
|
37
51
|
"""
|
|
@@ -83,14 +97,15 @@ class AbstractTransformFileProcessor:
|
|
|
83
97
|
"""
|
|
84
98
|
if self.last_file_name is None:
|
|
85
99
|
# for some reason a given worker never processed anything. Happens in testing
|
|
86
|
-
# when the amount of workers is greater
|
|
100
|
+
# when the amount of workers is greater than the amount of files
|
|
87
101
|
self.logger.debug("skipping flush, no name for file is defined")
|
|
88
102
|
return
|
|
89
103
|
try:
|
|
90
104
|
t_start = time.time()
|
|
91
105
|
# get flush results
|
|
92
106
|
self.logger.debug(
|
|
93
|
-
f"Begin flushing transform, last file name {self.last_file_name},
|
|
107
|
+
f"Begin flushing transform, last file name {self.last_file_name}, "
|
|
108
|
+
f"last index {self.last_file_name_next_index}"
|
|
94
109
|
)
|
|
95
110
|
out_files, stats = self.transform.flush_binary()
|
|
96
111
|
self.logger.debug(f"Done flushing transform, got {len(out_files)} files")
|
|
@@ -148,7 +163,7 @@ class AbstractTransformFileProcessor:
|
|
|
148
163
|
else:
|
|
149
164
|
self.last_file_name_next_index += 1
|
|
150
165
|
case _:
|
|
151
|
-
# we have more
|
|
166
|
+
# we have more than 1 file
|
|
152
167
|
file_sizes = 0
|
|
153
168
|
output_file_name = self.data_access.get_output_location(path=self.last_file_name)
|
|
154
169
|
start_index = self.last_file_name_next_index
|
{data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/log.py
RENAMED
|
@@ -26,7 +26,13 @@ def get_log_level(name: str = None) -> str:
|
|
|
26
26
|
return level_name
|
|
27
27
|
|
|
28
28
|
|
|
29
|
+
__logger_cache = {}
|
|
30
|
+
|
|
31
|
+
|
|
29
32
|
def get_logger(name: str, level=None, file=None) -> logging.Logger:
|
|
33
|
+
logger = __logger_cache.get(name, None)
|
|
34
|
+
if logger is not None:
|
|
35
|
+
return logger
|
|
30
36
|
logger = logging.getLogger(name)
|
|
31
37
|
if level is None:
|
|
32
38
|
level = get_log_level(name)
|
|
@@ -50,6 +56,7 @@ def get_logger(name: str, level=None, file=None) -> logging.Logger:
|
|
|
50
56
|
logger.addHandler(f_handler)
|
|
51
57
|
|
|
52
58
|
# Add handlers to the logger
|
|
59
|
+
__logger_cache[name] = logger
|
|
53
60
|
return logger
|
|
54
61
|
|
|
55
62
|
|
|
@@ -20,11 +20,6 @@ from data_processing.test_support.launch.transform_test import (
|
|
|
20
20
|
from data_processing.test_support.transform import NOOPPythonTransformConfiguration
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])})
|
|
24
|
-
expected_table = table # We're a noop after all.
|
|
25
|
-
expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}] # transform() result # flush() result
|
|
26
|
-
|
|
27
|
-
|
|
28
23
|
class TestRayNOOPTransform(AbstractTransformLauncherTest):
|
|
29
24
|
"""
|
|
30
25
|
Extends the super-class to define the test data for the tests defined there.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# (C) Copyright IBM Corp. 2024.
|
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
+
# you may not use this file except in compliance with the License.
|
|
4
|
+
# You may obtain a copy of the License at
|
|
5
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
+
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
+
# See the License for the specific language governing permissions and
|
|
10
|
+
# limitations under the License.
|
|
11
|
+
################################################################################
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
from data_processing.runtime.pure_python import PythonTransformLauncher
|
|
16
|
+
from data_processing.test_support.launch.transform_test import (
|
|
17
|
+
AbstractTransformLauncherTest,
|
|
18
|
+
)
|
|
19
|
+
from data_processing.test_support.transform import NOOPPythonTransformConfiguration
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TestPythonNOOPTransform(AbstractTransformLauncherTest):
    """
    Supplies the NOOP transform fixtures used by the tests defined in the super-class.
    The class name has to start with the word Test, otherwise pytest does not recognize it as a test class.
    """

    def get_test_transform_fixtures(self) -> list[tuple]:
        # test data directory, resolved relative to the location of this file
        relative_data_dir = "../../../../test-data/data_processing/python/noop/"
        data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), relative_data_dir))
        launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
        params = {"noop_sleep_sec": 0, "runtime_num_processors": 2}
        # one fixture: (launcher, parameters, input folder, expected output folder)
        fixture = (
            launcher,
            params,
            data_dir + "/input",
            data_dir + "/expected",
        )
        return [fixture]
|
data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_file_processor.py
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
# (C) Copyright IBM Corp. 2024.
|
|
2
|
-
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
-
# you may not use this file except in compliance with the License.
|
|
4
|
-
# You may obtain a copy of the License at
|
|
5
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
-
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
-
# See the License for the specific language governing permissions and
|
|
10
|
-
# limitations under the License.
|
|
11
|
-
################################################################################
|
|
12
|
-
|
|
13
|
-
from typing import Any
|
|
14
|
-
|
|
15
|
-
from data_processing.data_access import DataAccessFactoryBase
|
|
16
|
-
from data_processing.runtime import AbstractTransformFileProcessor
|
|
17
|
-
from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
|
|
18
|
-
from data_processing.transform import TransformStatistics
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class PythonTransformFileProcessor(AbstractTransformFileProcessor):
    """
    Worker-side file processor: builds the transform and processes one file at a time
    """

    def __init__(
        self,
        data_access_factory: DataAccessFactoryBase,
        statistics: TransformStatistics,
        runtime_configuration: PythonTransformRuntimeConfiguration,
    ):
        """
        Init method
        :param data_access_factory: factory used to create the data access of this processor
        :param statistics: statistics object; stored here and also handed to the transform
        :param runtime_configuration: supplies the transform parameters and the transform class
        """
        super().__init__()
        # data access owned by this processor
        self.data_access = data_access_factory.create_data_access()
        # copy of the configured parameters, extended with data access and statistics
        extended_params = dict(runtime_configuration.get_transform_params()) | {
            "data_access": self.data_access,
            "statistics": statistics,
        }
        # instantiate the transform with the extended parameters
        transform_cls = runtime_configuration.get_transform_class()
        self.transform = transform_cls(extended_params)
        # keep the statistics for publishing
        self.stats = statistics

    def _publish_stats(self, stats: dict[str, Any]) -> None:
        """
        Add the given stats to the statistics object held by this processor
        :param stats: dictionary of statistics to add
        :return: None
        """
        self.stats.add_stats(stats)
|
data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_orchestrator.py
DELETED
|
@@ -1,116 +0,0 @@
|
|
|
1
|
-
# (C) Copyright IBM Corp. 2024.
|
|
2
|
-
# Licensed under the Apache License, Version 2.0 (the “License”);
|
|
3
|
-
# you may not use this file except in compliance with the License.
|
|
4
|
-
# You may obtain a copy of the License at
|
|
5
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
7
|
-
# distributed under the License is distributed on an “AS IS” BASIS,
|
|
8
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
9
|
-
# See the License for the specific language governing permissions and
|
|
10
|
-
# limitations under the License.
|
|
11
|
-
################################################################################
|
|
12
|
-
|
|
13
|
-
import time
|
|
14
|
-
import traceback
|
|
15
|
-
from datetime import datetime
|
|
16
|
-
|
|
17
|
-
from data_processing.data_access import DataAccessFactoryBase
|
|
18
|
-
from data_processing.runtime import (
|
|
19
|
-
TransformExecutionConfiguration,
|
|
20
|
-
TransformRuntimeConfiguration,
|
|
21
|
-
)
|
|
22
|
-
from data_processing.runtime.pure_python import PythonTransformFileProcessor
|
|
23
|
-
from data_processing.transform import TransformStatistics
|
|
24
|
-
from data_processing.utils import get_logger
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
logger = get_logger(__name__)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def orchestrate(
    data_access_factory: DataAccessFactoryBase,
    runtime_config: TransformRuntimeConfiguration,
    execution_config: TransformExecutionConfiguration,
) -> int:
    """
    orchestrator for transformer execution
    :param data_access_factory: data access factory
    :param runtime_config: transformer configuration
    :param execution_config: execution configuration
    :return: 0 - success (also when there are no input files) or 1 - failure
    """
    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
    # create statistics
    statistics = TransformStatistics()
    # create data access
    data_access = data_access_factory.create_data_access()
    if data_access is None:
        logger.error("No DataAccess instance provided - exiting")
        return 1
    try:
        # Get files to process
        files, profile, retries = data_access.get_files_to_process()
        if not files:
            logger.error("No input files to process - exiting")
            return 0
        logger.info(f"Number of files is {len(files)}, source profile {profile}")
        # Print interval: roughly every 1% of the files, but at least every file
        print_interval = max(1, len(files) // 100)
        if retries > 0:
            statistics.add_stats({"data access retries": retries})
        # create executor
        executor = PythonTransformFileProcessor(
            data_access_factory=data_access_factory, statistics=statistics, runtime_configuration=runtime_config
        )
        # process data
        logger.debug(f"{runtime_config.get_name()} Begin processing files")
        t_start = time.time()
        completed = 0
        for path in files:
            executor.process_file(path)
            completed += 1
            if completed % print_interval == 0:
                logger.info(
                    f"Completed {completed} files ({100 * completed / len(files)}%) "
                    f"in {(time.time() - t_start)/60} min"
                )
        logger.debug(f"Done processing {completed} files, waiting for flush() completion.")
        # invoke flush to ensure that all results are returned
        start = time.time()
        executor.flush()
        logger.info(f"done flushing in {time.time() - start} sec")
        status = "success"
        return_code = 0
    except Exception as e:
        # format_exc() returns the traceback text; print_exc() would print it and put "None" in the message
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        return_code = 1
        status = "failure"
    # metadata is built and saved for both the success and the failure outcome
    try:
        # Compute execution statistics
        logger.debug("Computing execution stats")
        stats = statistics.get_execution_stats()
        # build and save metadata
        logger.debug("Building job metadata")
        input_params = runtime_config.get_transform_metadata()
        metadata = {
            "pipeline": execution_config.pipeline_id,
            "job details": execution_config.job_details
            | {
                "start_time": start_ts,
                "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "status": status,
            },
            "code": execution_config.code_location,
            "job_input_params": input_params | data_access_factory.get_input_params(),
            "job_output_stats": stats,
        }
        logger.debug(f"Saving job metadata: {metadata}.")
        data_access.save_job_metadata(metadata)
        logger.debug("Saved job metadata.")
        return return_code
    except Exception as e:
        logger.error(f"Exception during execution {e}: {traceback.format_exc()}")
        return 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/__init__.py
RENAMED
|
File without changes
|
{data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/cli_utils.py
RENAMED
|
File without changes
|
{data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/config.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|