data-prep-toolkit 0.2.1.dev1__tar.gz → 0.2.1.dev2__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (85)
  1. {data_prep_toolkit-0.2.1.dev1/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1.dev2}/PKG-INFO +1 -1
  2. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/pyproject.toml +1 -1
  3. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
  4. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/SOURCES.txt +3 -0
  5. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/__init__.py +4 -1
  6. data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/execution_configuration.py +72 -0
  7. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/runtime_configuration.py +14 -2
  8. data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_file_processor.py +109 -0
  9. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/transform_launcher.py +3 -3
  10. data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_orchestrator.py +201 -0
  11. data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_runtime.py +53 -0
  12. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/transform_file_processor.py +21 -6
  13. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/log.py +7 -0
  14. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -5
  15. data_prep_toolkit-0.2.1.dev2/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +37 -0
  16. data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_file_processor.py +0 -51
  17. data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_orchestrator.py +0 -116
  18. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/Makefile +0 -0
  19. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/README.md +0 -0
  20. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/setup.cfg +0 -0
  21. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  22. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  23. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  24. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/__init__.py +0 -0
  25. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/__init__.py +0 -0
  26. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/arrow_s3.py +0 -0
  27. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access.py +0 -0
  28. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_factory.py +0 -0
  29. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  30. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_local.py +0 -0
  31. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/data_access/data_access_s3.py +0 -0
  32. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/__init__.py +0 -0
  33. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/execution_configuration.py +0 -0
  34. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
  35. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/runtime_configuration.py +0 -0
  36. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/runtime/transform_launcher.py +0 -0
  37. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/__init__.py +0 -0
  38. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/abstract_test.py +0 -0
  39. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/data_access/__init__.py +0 -0
  40. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  41. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/launch/__init__.py +0 -0
  42. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/launch/transform_test.py +0 -0
  43. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/__init__.py +0 -0
  44. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
  45. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/noop_transform.py +0 -0
  46. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
  47. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/__init__.py +0 -0
  48. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/abstract_transform.py +0 -0
  49. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/binary_transform.py +0 -0
  50. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/table_transform.py +0 -0
  51. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/transform_configuration.py +0 -0
  52. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/transform/transform_statistics.py +0 -0
  53. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/__init__.py +0 -0
  54. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/cli_utils.py +0 -0
  55. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/config.py +0 -0
  56. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/params_utils.py +0 -0
  57. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/pipinstaller.py +0 -0
  58. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_configuration.json +0 -0
  59. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_configurator.py +0 -0
  60. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/transform_utils.py +0 -0
  61. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/src/data_processing/utils/unrecoverable.py +0 -0
  62. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  63. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  64. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  65. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  66. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
  67. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  68. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  69. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/transform/test_noop.py +0 -0
  70. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  71. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  72. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  73. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  74. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  75. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input/sample1.parquet +0 -0
  76. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  77. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  78. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  79. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
  80. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
  81. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
  82. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
  83. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
  84. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
  85. {data_prep_toolkit-0.2.1.dev1 → data_prep_toolkit-0.2.1.dev2}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
--- data_prep_toolkit-0.2.1.dev1/src/data_prep_toolkit.egg-info/PKG-INFO
+++ data_prep_toolkit-0.2.1.dev2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.1.dev1
+Version: 0.2.1.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
--- data_prep_toolkit-0.2.1.dev1/pyproject.toml
+++ data_prep_toolkit-0.2.1.dev2/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.1.dev1"
+version = "0.2.1.dev2"
 requires-python = ">=3.10"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 description = "Data Preparation Toolkit Library"
--- data_prep_toolkit-0.2.1.dev1/PKG-INFO
+++ data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: data_prep_toolkit
-Version: 0.2.1.dev1
+Version: 0.2.1.dev2
 Summary: Data Preparation Toolkit Library
 Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
 License: Apache-2.0
--- data_prep_toolkit-0.2.1.dev1/src/data_prep_toolkit.egg-info/SOURCES.txt
+++ data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info/SOURCES.txt
@@ -20,11 +20,13 @@ src/data_processing/runtime/runtime_configuration.py
 src/data_processing/runtime/transform_file_processor.py
 src/data_processing/runtime/transform_launcher.py
 src/data_processing/runtime/pure_python/__init__.py
+src/data_processing/runtime/pure_python/execution_configuration.py
 src/data_processing/runtime/pure_python/runtime_configuration.py
 src/data_processing/runtime/pure_python/transform_file_processor.py
 src/data_processing/runtime/pure_python/transform_invoker.py
 src/data_processing/runtime/pure_python/transform_launcher.py
 src/data_processing/runtime/pure_python/transform_orchestrator.py
+src/data_processing/runtime/pure_python/transform_runtime.py
 src/data_processing/test_support/__init__.py
 src/data_processing/test_support/abstract_test.py
 src/data_processing/test_support/data_access/__init__.py
@@ -74,5 +76,6 @@ test/data_processing_tests/invoker/python_invoker_test.py
 test/data_processing_tests/launch/pure_python/launcher_test.py
 test/data_processing_tests/launch/pure_python/multi_launcher_test.py
 test/data_processing_tests/launch/pure_python/test_noop_launch.py
+test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
 test/data_processing_tests/transform/test_noop.py
 test/data_processing_tests/util/transform_utils_test.py
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/__init__.py
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/__init__.py
@@ -1,5 +1,8 @@
+from data_processing.runtime.pure_python.transform_runtime import DefaultPythonTransformRuntime
 from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
-from data_processing.runtime.pure_python.transform_file_processor import PythonTransformFileProcessor
+from data_processing.runtime.pure_python.execution_configuration import PythonTransformExecutionConfiguration
+from data_processing.runtime.pure_python.transform_file_processor import (PythonTransformFileProcessor,
+                                                                          PythonPoolTransformFileProcessor)
 from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
 from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
 from data_processing.runtime.pure_python.transform_invoker import invoke_transform, execute_python_transform
--- /dev/null
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/execution_configuration.py
@@ -0,0 +1,72 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import argparse
+from typing import Any
+
+from data_processing.runtime import TransformExecutionConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger
+
+
+logger = get_logger(__name__)
+
+
+cli_prefix = "runtime_"
+
+
+class PythonTransformExecutionConfiguration(TransformExecutionConfiguration):
+    """
+    A class specifying and validating Python orchestrator configuration
+    """
+
+    def __init__(self, name: str):
+        """
+        Initialization
+        """
+        super().__init__(name=name, print_params=False)
+        self.num_processors = 0
+
+    def add_input_params(self, parser: argparse.ArgumentParser) -> None:
+        """
+        Add transformer-specific parameters to the parser
+        :param parser: parser
+        :return: None
+        """
+        parser.add_argument(f"--{cli_prefix}num_processors", type=int, default=0, help="size of multiprocessing pool")
+
+        return TransformExecutionConfiguration.add_input_params(self, parser=parser)
+
+    def apply_input_params(self, args: argparse.Namespace) -> bool:
+        """
+        Validate transformer-specific parameters
+        :param args: user-defined arguments
+        :return: True if validation passes, False otherwise
+        """
+        if not TransformExecutionConfiguration.apply_input_params(self, args=args):
+            return False
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        # store parameters locally
+        self.num_processors = captured["num_processors"]
+        # print them
+        if self.num_processors > 0:
+            # we are using multiprocessing
+            logger.info(f"using multiprocessing, num processors {self.num_processors}")
+        return True
+
+    def get_input_params(self) -> dict[str, Any]:
+        """
+        Get input parameters for job_input_params in metadata
+        :return: dictionary of parameters
+        """
+        return {
+            "num_processors": self.num_processors
+        }
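With the runtime_ cli_prefix above, the new parameter surfaces on the command line as --runtime_num_processors. A minimal sketch of how the flag is registered and captured, assuming the flags added by the base TransformExecutionConfiguration all carry defaults:

    import argparse

    from data_processing.runtime.pure_python import PythonTransformExecutionConfiguration

    parser = argparse.ArgumentParser()
    config = PythonTransformExecutionConfiguration(name="noop")
    config.add_input_params(parser)  # registers --runtime_num_processors (default 0)
    args = parser.parse_args(["--runtime_num_processors", "2"])
    if config.apply_input_params(args):  # validates and stores the captured value
        assert config.num_processors == 2  # > 0 switches the orchestrator to a multiprocessing pool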
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/runtime_configuration.py
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/runtime_configuration.py
@@ -12,13 +12,25 @@
 
 from data_processing.runtime import TransformRuntimeConfiguration
 from data_processing.transform import TransformConfiguration
+from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
 
 
 class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
-    def __init__(self, transform_config: TransformConfiguration):
+    def __init__(self,
+                 transform_config: TransformConfiguration,
+                 runtime_class: type[DefaultPythonTransformRuntime] = DefaultPythonTransformRuntime,
+                 ):
         """
         Initialization
         :param transform_config - base configuration class
+        :param runtime_class: implementation of the transform runtime
         """
-        self.transform_config = transform_config
+        self.runtime_class = runtime_class
         super().__init__(transform_config=transform_config)
+
+    def create_transform_runtime(self) -> DefaultPythonTransformRuntime:
+        """
+        Create transform runtime with the parameters captured during apply_input_params()
+        :return: transform runtime object
+        """
+        return self.runtime_class(self.transform_config.get_transform_params())
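The new runtime_class argument makes the runtime pluggable: a transform can subclass DefaultPythonTransformRuntime (added below in transform_runtime.py) to enrich the parameters handed to the transform's initializer. A hypothetical sketch; MyPythonTransformRuntime and its file_count parameter are illustrative, not part of this diff:

    from typing import Any

    from data_processing.data_access import DataAccessFactoryBase
    from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
    from data_processing.transform import TransformStatistics


    class MyPythonTransformRuntime(DefaultPythonTransformRuntime):
        # Hypothetical runtime: add the number of input files to the transform's init params.
        def get_transform_config(
            self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
        ) -> dict[str, Any]:
            return self.params | {"file_count": len(files)}


    # plugged in through the new constructor argument:
    # PythonTransformRuntimeConfiguration(transform_config=cfg, runtime_class=MyPythonTransformRuntime)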
--- /dev/null
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_file_processor.py
@@ -0,0 +1,109 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.runtime import AbstractTransformFileProcessor
+from data_processing.transform import AbstractBinaryTransform, TransformStatistics
+
+
+class PythonTransformFileProcessor(AbstractTransformFileProcessor):
+    """
+    This class implements worker processing of a single file
+    """
+
+    def __init__(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        statistics: TransformStatistics,
+        transform_params: dict[str, Any],
+        transform_class: type[AbstractBinaryTransform],
+    ):
+        """
+        Init method
+        :param data_access_factory: data access factory
+        :param statistics: reference to statistics class
+        :param transform_params: transform parameters
+        :param transform_class: transform class
+        """
+        # invoke superclass
+        super().__init__(
+            data_access_factory=data_access_factory,
+            transform_parameters=dict(transform_params),
+        )
+        self.transform_params["statistics"] = statistics
+        # Create local processor
+        self.transform = transform_class(self.transform_params)
+        # Create statistics
+        self.stats = statistics
+
+    def _publish_stats(self, stats: dict[str, Any]) -> None:
+        self.stats.add_stats(stats)
+
+
+class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
+    """
+    This class implements worker processing of a single file within a multiprocessing pool
+    """
+
+    def __init__(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        transform_params: dict[str, Any],
+        transform_class: type[AbstractBinaryTransform],
+    ):
+        """
+        Init method
+        :param data_access_factory: data access factory
+        :param transform_params: transform parameters
+        :param transform_class: transform class
+        """
+        super().__init__(
+            data_access_factory=data_access_factory,
+            transform_parameters=dict(transform_params),
+        )
+        # Add data access to the processor parameters
+        self.transform_params["data_access"] = self.data_access
+        self.transform_class = transform_class
+        self.transform = None
+
+    def process_file(self, f_name: str) -> dict[str, Any]:
+        # re-initialize statistics
+        self.stats = {}
+        if self.transform is None:
+            # create transform. Make sure to do this locally
+            self.transform = self.transform_class(self.transform_params)
+        # Invoke superclass method
+        super().process_file(f_name=f_name)
+        # return collected statistics
+        return self.stats
+
+    def flush(self) -> dict[str, Any]:
+        # re-initialize statistics
+        self.stats = {}
+        # Invoke superclass method
+        super().flush()
+        # return collected statistics
+        return self.stats
+
+    def _publish_stats(self, stats: dict[str, Any]) -> None:
+        """
+        Publish statistics (to the local dictionary)
+        :param stats: statistics dictionary
+        :return: None
+        """
+        for key, val in stats.items():
+            # for all key/values
+            if val > 0:
+                # for values greater than 0
+                self.stats[key] = self.stats.get(key, 0) + val
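Unlike PythonTransformFileProcessor, which publishes into a shared TransformStatistics object, the pool variant returns a plain dict of per-file deltas from process_file() and flush(), and the orchestrator merges those deltas as each worker result arrives. A small sketch of that accumulation contract; the stat keys are illustrative, not taken from this diff:

    from data_processing.transform import TransformStatistics

    statistics = TransformStatistics()
    # per-file delta dicts, as returned by PythonPoolTransformFileProcessor.process_file()
    for delta in [{"source_files": 1, "source_size": 1024}, {"source_files": 1}]:
        statistics.add_stats(delta)  # merge counters across workers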
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_launcher.py
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_launcher.py
@@ -14,9 +14,9 @@ import argparse
 import time
 
 from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
-from data_processing.runtime import TransformExecutionConfiguration
 from data_processing.runtime.pure_python import (
     PythonTransformRuntimeConfiguration,
+    PythonTransformExecutionConfiguration,
     orchestrate,
 )
 from data_processing.runtime.transform_launcher import AbstractTransformLauncher
@@ -42,7 +42,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
         :param data_access_factory: the factory to create DataAccess instances.
         """
         super().__init__(runtime_config, data_access_factory)
-        self.execution_config = TransformExecutionConfiguration(name=runtime_config.get_name())
+        self.execution_config = PythonTransformExecutionConfiguration(name=runtime_config.get_name())
 
     def __get_parameters(self) -> bool:
         """
@@ -85,7 +85,7 @@ class PythonTransformLauncher(AbstractTransformLauncher):
         except Exception as e:
             logger.info(f"Exception running orchestration\n{e}")
         finally:
-            logger.info(f"Completed execution in {(time.time() - start)/60.} min, execution result {res}")
+            logger.info(f"Completed execution in {round((time.time() - start)/60., 3)} min, execution result {res}")
         return res
 
     def launch(self) -> int:
--- /dev/null
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_orchestrator.py
@@ -0,0 +1,201 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import time
+from typing import Any
+from multiprocessing import Pool
+import traceback
+from datetime import datetime
+
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.runtime.pure_python import (
+    PythonTransformExecutionConfiguration,
+    PythonTransformRuntimeConfiguration,
+    PythonTransformFileProcessor,
+    PythonPoolTransformFileProcessor,
+)
+from data_processing.transform import TransformStatistics, AbstractBinaryTransform
+from data_processing.utils import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def orchestrate(
+    data_access_factory: DataAccessFactoryBase,
+    runtime_config: PythonTransformRuntimeConfiguration,
+    execution_config: PythonTransformExecutionConfiguration,
+) -> int:
+    """
+    Orchestrator for transformer execution
+    :param data_access_factory: data access factory
+    :param runtime_config: transformer configuration
+    :param execution_config: execution configuration
+    :return: 0 - success or 1 - failure
+    """
+    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
+    # create statistics
+    statistics = TransformStatistics()
+    # create data access
+    data_access = data_access_factory.create_data_access()
+    if data_access is None:
+        logger.error("No DataAccess instance provided - exiting")
+        return 1
+    # create additional execution parameters
+    runtime = runtime_config.create_transform_runtime()
+    try:
+        # Get files to process
+        files, profile, retries = data_access.get_files_to_process()
+        if len(files) == 0:
+            logger.error("No input files to process - exiting")
+            return 0
+        if retries > 0:
+            statistics.add_stats({"data access retries": retries})
+        logger.info(f"Number of files is {len(files)}, source profile {profile}")
+        # Print interval
+        print_interval = int(len(files) / 100)
+        if print_interval == 0:
+            print_interval = 1
+        logger.debug(f"{runtime_config.get_name()} Begin processing files")
+        if execution_config.num_processors > 0:
+            # using multiprocessing pool for execution
+            statistics = _process_transforms_multiprocessor(files=files, size=execution_config.num_processors,
+                                                            data_access_factory=data_access_factory,
+                                                            print_interval=print_interval,
+                                                            transform_params=runtime.get_transform_config(
+                                                                data_access_factory=data_access_factory,
+                                                                statistics=statistics, files=files),
+                                                            transform_class=runtime_config.get_transform_class())
+        else:
+            # using sequential execution
+            _process_transforms(files=files, data_access_factory=data_access_factory,
+                                print_interval=print_interval, statistics=statistics,
+                                transform_params=runtime.get_transform_config(
+                                    data_access_factory=data_access_factory,
+                                    statistics=statistics, files=files),
+                                transform_class=runtime_config.get_transform_class())
+        status = "success"
+        return_code = 0
+    except Exception as e:
+        logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
+        return_code = 1
+        status = "failure"
+    try:
+        # Compute execution statistics
+        logger.debug("Computing execution stats")
+        stats = statistics.get_execution_stats()
+        stats["processing_time"] = round(stats["processing_time"], 3)
+        # build and save metadata
+        logger.debug("Building job metadata")
+        input_params = runtime_config.get_transform_metadata()
+        runtime.compute_execution_stats(stats=statistics)
+        metadata = {
+            "pipeline": execution_config.pipeline_id,
+            "job details": execution_config.job_details
+            | {
+                "start_time": start_ts,
+                "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+                "status": status,
+            },
+            "code": execution_config.code_location,
+            "job_input_params":
+                input_params | data_access_factory.get_input_params() | execution_config.get_input_params(),
+            "job_output_stats": stats,
+        }
+        logger.debug(f"Saving job metadata: {metadata}.")
+        data_access.save_job_metadata(metadata)
+        logger.debug("Saved job metadata.")
+        return return_code
+    except Exception as e:
+        logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
+        return 1
+
+
+def _process_transforms(files: list[str], print_interval: int, data_access_factory: DataAccessFactoryBase,
+                        statistics: TransformStatistics, transform_params: dict[str, Any],
+                        transform_class: type[AbstractBinaryTransform]) -> None:
+    """
+    Process transforms sequentially
+    :param files: list of files to process
+    :param statistics: statistics class
+    :param print_interval: print interval
+    :param data_access_factory: data access factory
+    :param transform_params: transform parameters
+    :param transform_class: transform class
+    :return: None
+    """
+    # create executor
+    executor = PythonTransformFileProcessor(data_access_factory=data_access_factory, statistics=statistics,
+                                            transform_params=transform_params, transform_class=transform_class)
+    # process data
+    t_start = time.time()
+    completed = 0
+    for path in files:
+        executor.process_file(path)
+        completed += 1
+        if completed % print_interval == 0:
+            logger.info(
+                f"Completed {completed} files ({round(100 * completed / len(files), 2)}%) "
+                f"in {round((time.time() - t_start)/60., 3)} min"
+            )
+    logger.info(f"Done processing {completed} files, waiting for flush() completion.")
+    # invoke flush to ensure that all results are returned
+    start = time.time()
+    executor.flush()
+    logger.info(f"done flushing in {round(time.time() - start, 3)} sec")
+
+
+def _process_transforms_multiprocessor(files: list[str], size: int, print_interval: int,
+                                       data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any],
+                                       transform_class: type[AbstractBinaryTransform]) -> TransformStatistics:
+    """
+    Process transforms using a multiprocessing pool
+    :param files: list of files to process
+    :param size: pool size
+    :param print_interval: print interval
+    :param data_access_factory: data access factory
+    :param transform_params: transform parameters
+    :param transform_class: transform class
+    :return: execution statistics
+    """
+    # result statistics
+    statistics = TransformStatistics()
+    # create processor
+    processor = PythonPoolTransformFileProcessor(data_access_factory=data_access_factory,
+                                                 transform_params=transform_params, transform_class=transform_class)
+    completed = 0
+    t_start = time.time()
+    # create multiprocessing pool
+    with Pool(processes=size) as pool:
+        # execute for every input file
+        for result in pool.imap_unordered(processor.process_file, files):
+            completed += 1
+            # accumulate statistics
+            statistics.add_stats(result)
+            if completed % print_interval == 0:
+                # print intermediate statistics
+                logger.info(
+                    f"Completed {completed} files ({round(100 * completed / len(files), 2)}%) "
+                    f"in {round((time.time() - t_start)/60., 3)} min"
+                )
+        logger.info(f"Done processing {completed} files, waiting for flush() completion.")
+        results = [{}] * size
+        # flush
+        for i in range(size):
+            results[i] = pool.apply_async(processor.flush)
+        for s in results:
+            statistics.add_stats(s.get())
+    logger.info(f"done flushing in {time.time() - t_start} sec")
+    return statistics
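Combined with the launcher change above, the pool path can be driven end to end the way the new test (further below) does. A sketch using the NOOP transform; the ParamsUtils helpers (from data_processing/utils/params_utils.py, unchanged in this diff) and the data_local_config flag come from elsewhere in the package and are assumed here:

    import sys

    from data_processing.runtime.pure_python import PythonTransformLauncher
    from data_processing.test_support.transform import NOOPPythonTransformConfiguration
    from data_processing.utils import ParamsUtils

    local_conf = {"input_folder": "input", "output_folder": "output"}  # placeholder paths
    params = {
        "data_local_config": ParamsUtils.convert_to_ast(local_conf),  # assumed DataAccessFactory flag
        "runtime_num_processors": 2,  # > 0 selects _process_transforms_multiprocessor()
        "noop_sleep_sec": 0,
    }
    sys.argv = ParamsUtils.dict_to_req(d=params)
    PythonTransformLauncher(NOOPPythonTransformConfiguration()).launch()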
--- /dev/null
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/pure_python/transform_runtime.py
@@ -0,0 +1,53 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from typing import Any
+
+from data_processing.data_access import DataAccessFactoryBase
+from data_processing.transform import TransformStatistics
+
+
+class DefaultPythonTransformRuntime:
+    """
+    Transformer runtime used by the processor to create a transform-specific environment
+    """
+
+    def __init__(self, params: dict[str, Any]):
+        """
+        Create/config this runtime.
+        :param params: parameters, often provided by the CLI arguments as defined by a TableTransformConfiguration.
+        """
+        self.params = params
+
+    def get_transform_config(
+        self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
+    ) -> dict[str, Any]:
+        """
+        Get the dictionary of configuration that will be provided to the transform's initializer.
+        This is the opportunity for this runtime to create a new set of configuration based on the
+        config/params provided to this instance's initializer. This may include the addition
+        of new configuration data such as shared memory, new actors, etc., that might be needed and
+        expected by the transform in its initializer and/or transform() methods.
+        :param data_access_factory: data access factory class being used by the orchestrator.
+        :param statistics: reference to statistics
+        :param files: list of files to process
+        :return: dictionary of transform init params
+        """
+        return self.params
+
+    def compute_execution_stats(self, stats: TransformStatistics) -> None:
+        """
+        Update/augment the given statistics object with runtime-specific additions/modifications.
+        :param stats: output of statistics as aggregated across all calls to all transforms.
+        :return: job execution statistics. These are generally reported as metadata by the orchestrator.
+        """
+        return stats
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/transform_file_processor.py
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/runtime/transform_file_processor.py
@@ -13,6 +13,7 @@ import time
 import traceback
 from typing import Any
 
+from data_processing.data_access import DataAccessFactoryBase
 from data_processing.utils import TransformUtils, UnrecoverableException, get_logger
 
 
@@ -21,17 +22,30 @@ class AbstractTransformFileProcessor:
     This is the base class implementing processing of a single binary file
     """
 
-    def __init__(self):
+    def __init__(
+        self,
+        data_access_factory: DataAccessFactoryBase,
+        transform_parameters: dict[str, Any],
+    ):
         """
         Init method
+        :param data_access_factory: Data Access Factory
+        :param transform_parameters: Transform parameters
         """
-        self.data_access = None
+        self.logger = get_logger(__name__)
+        # validate parameters
+        if data_access_factory is None:
+            self.logger.error("Transform file processor: data access factory is not specified")
+            raise UnrecoverableException("data access factory is None")
         self.transform = None
         self.stats = None
         self.last_file_name = None
         self.last_extension = None
         self.last_file_name_next_index = None
-        self.logger = get_logger(__name__)
+        self.data_access = data_access_factory.create_data_access()
+        # Add data access to the processor parameters
+        self.transform_params = transform_parameters
+        self.transform_params["data_access"] = self.data_access
 
     def process_file(self, f_name: str) -> None:
         """
@@ -83,14 +97,15 @@
         """
         if self.last_file_name is None:
             # for some reason a given worker never processed anything. Happens in testing
-            # when the amount of workers is greater then the amount of files
+            # when the amount of workers is greater than the amount of files
            self.logger.debug("skipping flush, no name for file is defined")
            return
        try:
            t_start = time.time()
            # get flush results
            self.logger.debug(
-                f"Begin flushing transform, last file name {self.last_file_name}, last index {self.last_file_name_next_index}"
+                f"Begin flushing transform, last file name {self.last_file_name}, "
+                f"last index {self.last_file_name_next_index}"
            )
            out_files, stats = self.transform.flush_binary()
            self.logger.debug(f"Done flushing transform, got {len(out_files)} files")
@@ -148,7 +163,7 @@
                 else:
                     self.last_file_name_next_index += 1
             case _:
-                # we have more then 1 file
+                # we have more than 1 file
                 file_sizes = 0
                 output_file_name = self.data_access.get_output_location(path=self.last_file_name)
                 start_index = self.last_file_name_next_index
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/utils/log.py
+++ data_prep_toolkit-0.2.1.dev2/src/data_processing/utils/log.py
@@ -26,7 +26,13 @@ def get_log_level(name: str = None) -> str:
     return level_name
 
 
+__logger_cache = {}
+
+
 def get_logger(name: str, level=None, file=None) -> logging.Logger:
+    logger = __logger_cache.get(name, None)
+    if logger is not None:
+        return logger
     logger = logging.getLogger(name)
     if level is None:
         level = get_log_level(name)
@@ -50,6 +56,7 @@ def get_logger(name: str, level=None, file=None) -> logging.Logger:
         logger.addHandler(f_handler)
 
     # Add handlers to the logger
+    __logger_cache[name] = logger
     return logger
 
 
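Because logging.getLogger() returns the same underlying logger for a given name, each previous call to get_logger() stacked another set of handlers onto it; the cache turns repeat calls into pure lookups, so handlers are attached at most once per name. A quick sketch of the resulting behavior:

    from data_processing.utils import get_logger

    log = get_logger("data_processing.demo")
    n_handlers = len(log.handlers)
    assert get_logger("data_processing.demo") is log  # served from __logger_cache
    assert len(log.handlers) == n_handlers  # no duplicate handlers accumulate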
--- data_prep_toolkit-0.2.1.dev1/test/data_processing_tests/launch/pure_python/test_noop_launch.py
+++ data_prep_toolkit-0.2.1.dev2/test/data_processing_tests/launch/pure_python/test_noop_launch.py
@@ -20,11 +20,6 @@ from data_processing.test_support.launch.transform_test import (
 from data_processing.test_support.transform import NOOPPythonTransformConfiguration
 
 
-table = pa.Table.from_pydict({"name": pa.array(["Tom"]), "age": pa.array([23])})
-expected_table = table  # We're a noop after all.
-expected_metadata_list = [{"nfiles": 1, "nrows": 1}, {}]  # transform() result # flush() result
-
-
 class TestRayNOOPTransform(AbstractTransformLauncherTest):
     """
     Extends the super-class to define the test data for the tests defined there.
--- /dev/null
+++ data_prep_toolkit-0.2.1.dev2/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py
@@ -0,0 +1,37 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+
+from data_processing.runtime.pure_python import PythonTransformLauncher
+from data_processing.test_support.launch.transform_test import (
+    AbstractTransformLauncherTest,
+)
+from data_processing.test_support.transform import NOOPPythonTransformConfiguration
+
+
+class TestPythonNOOPTransform(AbstractTransformLauncherTest):
+    """
+    Extends the super-class to define the test data for the tests defined there.
+    The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+    """
+
+    def get_test_transform_fixtures(self) -> list[tuple]:
+        basedir = "../../../../test-data/data_processing/python/noop/"
+        basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
+        fixtures = []
+        launcher = PythonTransformLauncher(NOOPPythonTransformConfiguration())
+        fixtures.append((
+            launcher,
+            {"noop_sleep_sec": 0, "runtime_num_processors": 2},
+            basedir + "/input", basedir + "/expected"))
+        return fixtures
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_file_processor.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-from typing import Any
-
-from data_processing.data_access import DataAccessFactoryBase
-from data_processing.runtime import AbstractTransformFileProcessor
-from data_processing.runtime.pure_python import PythonTransformRuntimeConfiguration
-from data_processing.transform import TransformStatistics
-
-
-class PythonTransformFileProcessor(AbstractTransformFileProcessor):
-    """
-    This is the class implementing the worker class processing of a single file
-    """
-
-    def __init__(
-        self,
-        data_access_factory: DataAccessFactoryBase,
-        statistics: TransformStatistics,
-        runtime_configuration: PythonTransformRuntimeConfiguration,
-    ):
-        """
-        Init method
-        :param data_access_factory - data access factory
-        :param statistics - reference to statistics class
-        :param runtime_configuration: transform configuration class
-        """
-        # Create data access
-        super().__init__()
-        self.data_access = data_access_factory.create_data_access()
-        # Add data access and statistics to the processor parameters
-        transform_params = dict(runtime_configuration.get_transform_params())
-        transform_params["data_access"] = self.data_access
-        transform_params["statistics"] = statistics
-        # Create local processor
-        self.transform = runtime_configuration.get_transform_class()(transform_params)
-        # Create statistics
-        self.stats = statistics
-
-    def _publish_stats(self, stats: dict[str, Any]) -> None:
-        self.stats.add_stats(stats)
--- data_prep_toolkit-0.2.1.dev1/src/data_processing/runtime/pure_python/transform_orchestrator.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# (C) Copyright IBM Corp. 2024.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-################################################################################
-
-import time
-import traceback
-from datetime import datetime
-
-from data_processing.data_access import DataAccessFactoryBase
-from data_processing.runtime import (
-    TransformExecutionConfiguration,
-    TransformRuntimeConfiguration,
-)
-from data_processing.runtime.pure_python import PythonTransformFileProcessor
-from data_processing.transform import TransformStatistics
-from data_processing.utils import get_logger
-
-
-logger = get_logger(__name__)
-
-
-def orchestrate(
-    data_access_factory: DataAccessFactoryBase,
-    runtime_config: TransformRuntimeConfiguration,
-    execution_config: TransformExecutionConfiguration,
-) -> int:
-    """
-    orchestrator for transformer execution
-    :param data_access_factory: data access factory
-    :param runtime_config: transformer configuration
-    :param execution_config: execution configuration
-    :return: 0 - success or 1 - failure
-    """
-    start_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    logger.info(f"orchestrator {runtime_config.get_name()} started at {start_ts}")
-    # create statistics
-    statistics = TransformStatistics()
-    # create data access
-    data_access = data_access_factory.create_data_access()
-    if data_access is None:
-        logger.error("No DataAccess instance provided - exiting")
-        return 1
-    try:
-        # Get files to process
-        files, profile, retries = data_access.get_files_to_process()
-        if len(files) == 0:
-            logger.error("No input files to process - exiting")
-            return 0
-        logger.info(f"Number of files is {len(files)}, source profile {profile}")
-        # Print interval
-        print_interval = int(len(files) / 100)
-        if print_interval == 0:
-            print_interval = 1
-        if retries > 0:
-            statistics.add_stats({"data access retries": retries})
-        # create executor
-        executor = PythonTransformFileProcessor(
-            data_access_factory=data_access_factory, statistics=statistics, runtime_configuration=runtime_config
-        )
-        # process data
-        logger.debug(f"{runtime_config.get_name()} Begin processing files")
-        t_start = time.time()
-        completed = 0
-        for path in files:
-            executor.process_file(path)
-            completed += 1
-            if completed % print_interval == 0:
-                logger.info(
-                    f"Completed {completed} files ({100 * completed / len(files)}%) "
-                    f"in {(time.time() - t_start)/60} min"
-                )
-        logger.debug(f"Done processing {completed} files, waiting for flush() completion.")
-        # invoke flush to ensure that all results are returned
-        start = time.time()
-        executor.flush()
-        logger.info(f"done flushing in {time.time() - start} sec")
-        status = "success"
-        return_code = 0
-    except Exception as e:
-        logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
-        return_code = 1
-        status = "failure"
-    try:
-        # Compute execution statistics
-        logger.debug("Computing execution stats")
-        stats = statistics.get_execution_stats()
-        # build and save metadata
-        logger.debug("Building job metadata")
-        input_params = runtime_config.get_transform_metadata()
-        metadata = {
-            "pipeline": execution_config.pipeline_id,
-            "job details": execution_config.job_details
-            | {
-                "start_time": start_ts,
-                "end_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                "status": status,
-            },
-            "code": execution_config.code_location,
-            "job_input_params": input_params | data_access_factory.get_input_params(),
-            "job_output_stats": stats,
-        }
-        logger.debug(f"Saving job metadata: {metadata}.")
-        data_access.save_job_metadata(metadata)
-        logger.debug("Saved job metadata.")
-        return return_code
-    except Exception as e:
-        logger.error(f"Exception during execution {e}: {traceback.print_exc()}")
-        return 1