data-prep-toolkit 0.2.1.dev2__tar.gz → 0.2.1.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. {data_prep_toolkit-0.2.1.dev2/src/data_prep_toolkit.egg-info → data_prep_toolkit-0.2.1.dev3}/PKG-INFO +1 -1
  2. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/pyproject.toml +2 -2
  3. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3/src/data_prep_toolkit.egg-info}/PKG-INFO +1 -1
  4. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/SOURCES.txt +1 -0
  5. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/__init__.py +1 -0
  6. data_prep_toolkit-0.2.1.dev3/src/data_processing/data_access/snapshot_utils.py +31 -0
  7. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/__init__.py +4 -2
  8. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/execution_configuration.py +1 -3
  9. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/runtime_configuration.py +6 -5
  10. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_file_processor.py +5 -7
  11. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_orchestrator.py +53 -30
  12. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/transform_file_processor.py +10 -4
  13. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/abstract_test.py +22 -22
  14. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/launch/transform_test.py +1 -1
  15. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/transform_statistics.py +1 -2
  16. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/Makefile +0 -0
  17. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/README.md +0 -0
  18. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/setup.cfg +0 -0
  19. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/dependency_links.txt +0 -0
  20. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/requires.txt +0 -0
  21. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_prep_toolkit.egg-info/top_level.txt +0 -0
  22. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/__init__.py +0 -0
  23. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/arrow_s3.py +0 -0
  24. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access.py +0 -0
  25. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_factory.py +0 -0
  26. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_factory_base.py +0 -0
  27. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_local.py +0 -0
  28. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/data_access/data_access_s3.py +0 -0
  29. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/__init__.py +0 -0
  30. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/execution_configuration.py +0 -0
  31. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_invoker.py +0 -0
  32. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_launcher.py +1 -1
  33. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/pure_python/transform_runtime.py +0 -0
  34. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/runtime_configuration.py +0 -0
  35. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/runtime/transform_launcher.py +0 -0
  36. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/__init__.py +0 -0
  37. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/data_access/__init__.py +0 -0
  38. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/data_access/data_access_factory_test.py +0 -0
  39. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/launch/__init__.py +0 -0
  40. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/__init__.py +0 -0
  41. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/binary_transform_test.py +0 -0
  42. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/noop_transform.py +0 -0
  43. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/test_support/transform/table_transform_test.py +0 -0
  44. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/__init__.py +0 -0
  45. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/abstract_transform.py +0 -0
  46. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/binary_transform.py +0 -0
  47. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/table_transform.py +0 -0
  48. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/transform/transform_configuration.py +0 -0
  49. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/__init__.py +0 -0
  50. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/cli_utils.py +0 -0
  51. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/config.py +0 -0
  52. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/log.py +0 -0
  53. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/params_utils.py +0 -0
  54. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/pipinstaller.py +0 -0
  55. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_configuration.json +0 -0
  56. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_configurator.py +0 -0
  57. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/transform_utils.py +0 -0
  58. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/src/data_processing/utils/unrecoverable.py +0 -0
  59. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/daf_local_test.py +0 -0
  60. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/data_access_local_test.py +0 -0
  61. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/data_access_s3_test.py +0 -0
  62. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/data_access/sample_input_data_test.py +0 -0
  63. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/invoker/python_invoker_test.py +0 -0
  64. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/launcher_test.py +0 -0
  65. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/multi_launcher_test.py +0 -0
  66. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/test_noop_launch.py +0 -0
  67. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/launch/pure_python/test_noop_python_multiprocessor.py +0 -0
  68. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/transform/test_noop.py +0 -0
  69. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test/data_processing_tests/util/transform_utils_test.py +0 -0
  70. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds1/sample1.parquet +0 -0
  71. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds1/sample2.parquet +0 -0
  72. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/input/ds2/sample3.parquet +0 -0
  73. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/daf/output/ds1/sample1.parquet +0 -0
  74. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input/sample1.parquet +0 -0
  75. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample1.parquet +0 -0
  76. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample2.parquet +0 -0
  77. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/input_multiple/sample3.parquet +0 -0
  78. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/metadata.json +0 -0
  79. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/sample1.parquet +0 -0
  80. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/subdir/test1.parquet +0 -0
  81. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/expected/test1.parquet +0 -0
  82. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/sample1.parquet +0 -0
  83. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/subdir/test1.parquet +0 -0
  84. {data_prep_toolkit-0.2.1.dev2 → data_prep_toolkit-0.2.1.dev3}/test-data/data_processing/python/noop/input/test1.parquet +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.1.dev2
3
+ Version: 0.2.1.dev3
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "data_prep_toolkit"
3
- version = "0.2.1.dev2"
3
+ version = "0.2.1.dev3"
4
4
  requires-python = ">=3.10"
5
5
  keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
6
6
  description = "Data Preparation Toolkit Library"
@@ -41,7 +41,7 @@ dev = [
41
41
  ]
42
42
 
43
43
  [options]
44
- package_dir = ["src"]
44
+ package_dir = ["src","test"]
45
45
 
46
46
  [options.packages.find]
47
47
  where = ["src/data_processing"]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: data_prep_toolkit
3
- Version: 0.2.1.dev2
3
+ Version: 0.2.1.dev3
4
4
  Summary: Data Preparation Toolkit Library
5
5
  Author-email: David Wood <dawood@us.ibm.com>, Boris Lublinsky <blublinsky@ibm.com>
6
6
  License: Apache-2.0
@@ -14,6 +14,7 @@ src/data_processing/data_access/data_access_factory.py
14
14
  src/data_processing/data_access/data_access_factory_base.py
15
15
  src/data_processing/data_access/data_access_local.py
16
16
  src/data_processing/data_access/data_access_s3.py
17
+ src/data_processing/data_access/snapshot_utils.py
17
18
  src/data_processing/runtime/__init__.py
18
19
  src/data_processing/runtime/execution_configuration.py
19
20
  src/data_processing/runtime/runtime_configuration.py
@@ -4,3 +4,4 @@ from data_processing.data_access.data_access_local import DataAccessLocal
4
4
  from data_processing.data_access.data_access_s3 import DataAccessS3
5
5
  from data_processing.data_access.data_access_factory_base import DataAccessFactoryBase
6
6
  from data_processing.data_access.data_access_factory import DataAccessFactory
7
+ from data_processing.data_access.snapshot_utils import SnapshotUtils
@@ -0,0 +1,31 @@
1
+ # (C) Copyright IBM Corp. 2024.
2
+ # Licensed under the Apache License, Version 2.0 (the “License”);
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an “AS IS” BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ ################################################################################
12
+
13
+ from data_processing.data_access import DataAccess
14
+
15
+
16
+ class SnapshotUtils:
17
+ """
18
+ Class implementing support methods for snapshotting
19
+ """
20
+
21
+ @staticmethod
22
+ def get_snapshot_folder(data_access: DataAccess) -> str:
23
+ """
24
+ Get snapshot folder from data access
25
+ :param data_access: data access class
26
+ :return: output folder
27
+ """
28
+ output_folder = data_access.get_output_folder()
29
+ if not output_folder.endswith("/"):
30
+ output_folder += "/"
31
+ return f"{output_folder}snapshot/"
@@ -1,8 +1,10 @@
1
1
  from data_processing.runtime.pure_python.transform_runtime import DefaultPythonTransformRuntime
2
2
  from data_processing.runtime.pure_python.runtime_configuration import PythonTransformRuntimeConfiguration
3
3
  from data_processing.runtime.pure_python.execution_configuration import PythonTransformExecutionConfiguration
4
- from data_processing.runtime.pure_python.transform_file_processor import (PythonTransformFileProcessor,
5
- PythonPoolTransformFileProcessor)
4
+ from data_processing.runtime.pure_python.transform_file_processor import (
5
+ PythonTransformFileProcessor,
6
+ PythonPoolTransformFileProcessor,
7
+ )
6
8
  from data_processing.runtime.pure_python.transform_orchestrator import orchestrate
7
9
  from data_processing.runtime.pure_python.transform_launcher import PythonTransformLauncher
8
10
  from data_processing.runtime.pure_python.transform_invoker import invoke_transform, execute_python_transform
@@ -67,6 +67,4 @@ class PythonTransformExecutionConfiguration(TransformExecutionConfiguration):
67
67
  get input parameters for job_input_params in metadata
68
68
  :return: dictionary of parameters
69
69
  """
70
- return {
71
- "num_processors": self.num_processors
72
- }
70
+ return {"num_processors": self.num_processors}
@@ -11,15 +11,16 @@
11
11
  ################################################################################
12
12
 
13
13
  from data_processing.runtime import TransformRuntimeConfiguration
14
- from data_processing.transform import TransformConfiguration
15
14
  from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
15
+ from data_processing.transform import TransformConfiguration
16
16
 
17
17
 
18
18
  class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
19
- def __init__(self,
20
- transform_config: TransformConfiguration,
21
- runtime_class: type[DefaultPythonTransformRuntime] = DefaultPythonTransformRuntime,
22
- ):
19
+ def __init__(
20
+ self,
21
+ transform_config: TransformConfiguration,
22
+ runtime_class: type[DefaultPythonTransformRuntime] = DefaultPythonTransformRuntime,
23
+ ):
23
24
  """
24
25
  Initialization
25
26
  :param transform_config - base configuration class
@@ -57,10 +57,10 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
57
57
  """
58
58
 
59
59
  def __init__(
60
- self,
61
- data_access_factory: DataAccessFactoryBase,
62
- transform_params: dict[str, Any],
63
- transform_class: type[AbstractBinaryTransform],
60
+ self,
61
+ data_access_factory: DataAccessFactoryBase,
62
+ transform_params: dict[str, Any],
63
+ transform_class: type[AbstractBinaryTransform],
64
64
  ):
65
65
  """
66
66
  Init method
@@ -104,6 +104,4 @@ class PythonPoolTransformFileProcessor(AbstractTransformFileProcessor):
104
104
  """
105
105
  for key, val in stats.items():
106
106
  # for all key/values
107
- if val > 0:
108
- # for values greater then 0
109
- self.stats[key] = self.stats.get(key, 0) + val
107
+ self.stats[key] = self.stats.get(key, 0) + val
@@ -11,19 +11,19 @@
11
11
  ################################################################################
12
12
 
13
13
  import time
14
- from typing import Any
15
- from multiprocessing import Pool
16
14
  import traceback
17
15
  from datetime import datetime
16
+ from multiprocessing import Pool
17
+ from typing import Any
18
18
 
19
19
  from data_processing.data_access import DataAccessFactoryBase
20
20
  from data_processing.runtime.pure_python import (
21
+ PythonPoolTransformFileProcessor,
21
22
  PythonTransformExecutionConfiguration,
22
- PythonTransformRuntimeConfiguration,
23
23
  PythonTransformFileProcessor,
24
- PythonPoolTransformFileProcessor,
24
+ PythonTransformRuntimeConfiguration,
25
25
  )
26
- from data_processing.transform import TransformStatistics, AbstractBinaryTransform
26
+ from data_processing.transform import AbstractBinaryTransform, TransformStatistics
27
27
  from data_processing.utils import get_logger
28
28
 
29
29
 
@@ -69,21 +69,28 @@ def orchestrate(
69
69
  logger.debug(f"{runtime_config.get_name()} Begin processing files")
70
70
  if execution_config.num_processors > 0:
71
71
  # using multiprocessor pool for execution
72
- statistics = _process_transforms_multiprocessor(files=files, size=execution_config.num_processors,
73
- data_access_factory=data_access_factory,
74
- print_interval=print_interval,
75
- transform_params=runtime.get_transform_config(
76
- data_access_factory=data_access_factory,
77
- statistics=statistics, files=files),
78
- transform_class=runtime_config.get_transform_class())
72
+ statistics = _process_transforms_multiprocessor(
73
+ files=files,
74
+ size=execution_config.num_processors,
75
+ data_access_factory=data_access_factory,
76
+ print_interval=print_interval,
77
+ transform_params=runtime.get_transform_config(
78
+ data_access_factory=data_access_factory, statistics=statistics, files=files
79
+ ),
80
+ transform_class=runtime_config.get_transform_class(),
81
+ )
79
82
  else:
80
83
  # using sequential execution
81
- _process_transforms(files=files, data_access_factory=data_access_factory,
82
- print_interval=print_interval, statistics=statistics,
83
- transform_params=runtime.get_transform_config(
84
- data_access_factory=data_access_factory,
85
- statistics=statistics, files=files),
86
- transform_class=runtime_config.get_transform_class())
84
+ _process_transforms(
85
+ files=files,
86
+ data_access_factory=data_access_factory,
87
+ print_interval=print_interval,
88
+ statistics=statistics,
89
+ transform_params=runtime.get_transform_config(
90
+ data_access_factory=data_access_factory, statistics=statistics, files=files
91
+ ),
92
+ transform_class=runtime_config.get_transform_class(),
93
+ )
87
94
  status = "success"
88
95
  return_code = 0
89
96
  except Exception as e:
@@ -108,8 +115,9 @@ def orchestrate(
108
115
  "status": status,
109
116
  },
110
117
  "code": execution_config.code_location,
111
- "job_input_params":
112
- input_params | data_access_factory.get_input_params() | execution_config.get_input_params(),
118
+ "job_input_params": input_params
119
+ | data_access_factory.get_input_params()
120
+ | execution_config.get_input_params(),
113
121
  "job_output_stats": stats,
114
122
  }
115
123
  logger.debug(f"Saving job metadata: {metadata}.")
@@ -121,9 +129,14 @@ def orchestrate(
121
129
  return 1
122
130
 
123
131
 
124
- def _process_transforms(files: list[str], print_interval: int, data_access_factory: DataAccessFactoryBase,
125
- statistics: TransformStatistics, transform_params: dict[str, Any],
126
- transform_class: type[AbstractBinaryTransform]) -> None:
132
+ def _process_transforms(
133
+ files: list[str],
134
+ print_interval: int,
135
+ data_access_factory: DataAccessFactoryBase,
136
+ statistics: TransformStatistics,
137
+ transform_params: dict[str, Any],
138
+ transform_class: type[AbstractBinaryTransform],
139
+ ) -> None:
127
140
  """
128
141
  Process transforms sequentially
129
142
  :param files: list of files to process
@@ -137,8 +150,12 @@ def _process_transforms(files: list[str], print_interval: int, data_access_facto
137
150
  :return: None
138
151
  """
139
152
  # create executor
140
- executor = PythonTransformFileProcessor(data_access_factory=data_access_factory, statistics=statistics,
141
- transform_params=transform_params, transform_class=transform_class)
153
+ executor = PythonTransformFileProcessor(
154
+ data_access_factory=data_access_factory,
155
+ statistics=statistics,
156
+ transform_params=transform_params,
157
+ transform_class=transform_class,
158
+ )
142
159
  # process data
143
160
  t_start = time.time()
144
161
  completed = 0
@@ -157,9 +174,14 @@ def _process_transforms(files: list[str], print_interval: int, data_access_facto
157
174
  logger.info(f"done flushing in {round(time.time() - start, 3)} sec")
158
175
 
159
176
 
160
- def _process_transforms_multiprocessor(files: list[str], size: int, print_interval: int,
161
- data_access_factory: DataAccessFactoryBase, transform_params: dict[str, Any],
162
- transform_class: type[AbstractBinaryTransform]) -> TransformStatistics:
177
+ def _process_transforms_multiprocessor(
178
+ files: list[str],
179
+ size: int,
180
+ print_interval: int,
181
+ data_access_factory: DataAccessFactoryBase,
182
+ transform_params: dict[str, Any],
183
+ transform_class: type[AbstractBinaryTransform],
184
+ ) -> TransformStatistics:
163
185
  """
164
186
  Process transforms using multiprocessing pool
165
187
  :param files: list of files to process
@@ -173,8 +195,9 @@ def _process_transforms_multiprocessor(files: list[str], size: int, print_interv
173
195
  # result statistics
174
196
  statistics = TransformStatistics()
175
197
  # create processor
176
- processor = PythonPoolTransformFileProcessor(data_access_factory=data_access_factory,
177
- transform_params=transform_params, transform_class=transform_class)
198
+ processor = PythonPoolTransformFileProcessor(
199
+ data_access_factory=data_access_factory, transform_params=transform_params, transform_class=transform_class
200
+ )
178
201
  completed = 0
179
202
  t_start = time.time()
180
203
  # create multiprocessing pool
@@ -23,9 +23,9 @@ class AbstractTransformFileProcessor:
23
23
  """
24
24
 
25
25
  def __init__(
26
- self,
27
- data_access_factory: DataAccessFactoryBase,
28
- transform_parameters: dict[str, Any],
26
+ self,
27
+ data_access_factory: DataAccessFactoryBase,
28
+ transform_parameters: dict[str, Any],
29
29
  ):
30
30
  """
31
31
  Init method
@@ -85,7 +85,7 @@ class AbstractTransformFileProcessor:
85
85
  raise UnrecoverableException
86
86
  # Process other exceptions
87
87
  except Exception as e:
88
- self.logger.warning(f"Exception {e} processing file {f_name}: {traceback.format_exc()}")
88
+ self.logger.warning(f"Exception processing file {f_name}: {traceback.format_exc()}")
89
89
  self._publish_stats({"transform execution exception": 1})
90
90
 
91
91
  def flush(self) -> None:
@@ -133,6 +133,12 @@ class AbstractTransformFileProcessor:
133
133
  self.logger.debug(
134
134
  f"Transform did not produce a transformed file for " f"file {self.last_file_name}.parquet"
135
135
  )
136
+ self._publish_stats(
137
+ {
138
+ "result_files": len(out_files),
139
+ "processing_time": time.time() - t_start,
140
+ }
141
+ )
136
142
  case 1:
137
143
  # we have exactly 1 output file
138
144
  file_ext = out_files[0]
@@ -75,8 +75,8 @@ class AbstractTest:
75
75
  def _install_test_fixtures(self, metafunc):
76
76
  raise NotImplemented("Sub-class must implemented this to install the fixtures for its tests.")
77
77
 
78
- @staticmethod
79
- def validate_expected_tables(table_list: list[pa.Table], expected_table_list: list[pa.Table]):
78
+ @classmethod
79
+ def validate_expected_tables(cls, table_list: list[pa.Table], expected_table_list: list[pa.Table]):
80
80
  """
81
81
  Verify with assertion messages that the two lists of Tables are equivalent.
82
82
  :param table_list:
@@ -100,10 +100,10 @@ class AbstractTest:
100
100
  r1 = t1.take([j])
101
101
  r2 = t2.take([j])
102
102
  # assert r1 == r2, f"Row {j} of table {i} are not equal\n\tTransformed: {r1}\n\tExpected : {r2}"
103
- AbstractTest.validate_expected_row(i, j, r1, r2)
103
+ cls.validate_expected_row(i, j, r1, r2)
104
104
 
105
- @staticmethod
106
- def validate_expected_row(table_index: int, row_index: int, test_row: pa.Table, expected_row: pa.Table):
105
+ @classmethod
106
+ def validate_expected_row(cls, table_index: int, row_index: int, test_row: pa.Table, expected_row: pa.Table):
107
107
  """
108
108
  Compare the two rows for equality, allowing float values to be within a percentage
109
109
  of each other as defined by global _allowed_float_percent_diff.
@@ -139,8 +139,8 @@ class AbstractTest:
139
139
  diff = abs(test_value - expected_value)
140
140
  assert diff <= allowed_diff, msg
141
141
 
142
- @staticmethod
143
- def validate_expected_files(files_list: list[tuple[bytes, str]], expected_files_list: list[tuple[bytes, str]]):
142
+ @classmethod
143
+ def validate_expected_files(cls, files_list: list[tuple[bytes, str]], expected_files_list: list[tuple[bytes, str]]):
144
144
  """
145
145
  Verify with assertion messages that the two lists of Tables are equivalent.
146
146
  :param files_list:
@@ -171,15 +171,15 @@ class AbstractTest:
171
171
  diff <= diff_allowed
172
172
  ), f"produced file length {lenf1} vs expected {lenf2}, exceeds allowance of {diff_allowed}"
173
173
 
174
- @staticmethod
175
- def validate_expected_metadata_lists(metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]]):
174
+ @classmethod
175
+ def validate_expected_metadata_lists(cls, metadata: list[dict[str, float]], expected_metadata: list[dict[str, float]]):
176
176
  elen = len(expected_metadata)
177
177
  assert len(metadata) == elen, f"Number of metadata dictionaries not the expected of {elen}"
178
178
  for index in range(elen):
179
- AbstractTest.validate_expected_metadata(metadata[index], expected_metadata[index])
179
+ cls.validate_expected_metadata(metadata[index], expected_metadata[index])
180
180
 
181
- @staticmethod
182
- def validate_expected_metadata(metadata: dict[str, float], expected_metadata: dict[str, float]):
181
+ @classmethod
182
+ def validate_expected_metadata(cls, metadata: dict[str, float], expected_metadata: dict[str, float]):
183
183
  """
184
184
  Verify with assertion messages that the two dictionaries are as expected.
185
185
  :param metadata:
@@ -194,8 +194,8 @@ class AbstractTest:
194
194
  f"Metadata not equal\n" "\tTransformed: {metadata} Expected : {expected_metadata}"
195
195
  )
196
196
 
197
- @staticmethod
198
- def validate_directory_contents(directory: str, expected_dir: str, drop_columns: list[str] = []):
197
+ @classmethod
198
+ def validate_directory_contents(cls, directory: str, expected_dir: str, drop_columns: list[str] = []):
199
199
  """
200
200
  Make sure the directory contents are the same.
201
201
  :param directory:
@@ -217,28 +217,28 @@ class AbstractTest:
217
217
  expected_diffs = 0
218
218
  failed = len(dir_cmp.diff_files) != expected_diffs
219
219
  if failed:
220
- AbstractTest.__confirm_diffs(directory, expected_dir, dir_cmp.diff_files, "/tmp", drop_columns)
220
+ cls.__confirm_diffs(directory, expected_dir, dir_cmp.diff_files, "/tmp", drop_columns)
221
221
 
222
222
  # Traverse into the subdirs since dircmp doesn't seem to do that.
223
223
  subdirs = [f.name for f in os.scandir(expected_dir) if f.is_dir()]
224
224
  for subdir in subdirs:
225
225
  d1 = os.path.join(directory, subdir)
226
226
  d2 = os.path.join(expected_dir, subdir)
227
- AbstractTest.validate_directory_contents(d1, d2, drop_columns)
227
+ cls.validate_directory_contents(d1, d2, drop_columns)
228
228
 
229
- @staticmethod
230
- def _validate_table_files(parquet1: str, parquet2: str, drop_columns: list[str] = []):
229
+ @classmethod
230
+ def _validate_table_files(cls, parquet1: str, parquet2: str, drop_columns: list[str] = []):
231
231
  da = DataAccessLocal()
232
232
  t1, _ = da.get_table(parquet1)
233
233
  t2, _ = da.get_table(parquet2)
234
234
  if len(drop_columns) > 0:
235
235
  t1 = t1.drop_columns(drop_columns)
236
236
  t2 = t2.drop_columns(drop_columns)
237
- AbstractTest.validate_expected_tables([t1], [t2])
237
+ cls.validate_expected_tables([t1], [t2])
238
238
 
239
- @staticmethod
239
+ @classmethod
240
240
  def __confirm_diffs(
241
- src_dir: str, expected_dir: str, diff_files: list, dest_dir: str, drop_columns: list[str] = []
241
+ cls, src_dir: str, expected_dir: str, diff_files: list, dest_dir: str, drop_columns: list[str] = []
242
242
  ):
243
243
  """
244
244
  Copy all files from the source dir to the dest dir.
@@ -256,7 +256,7 @@ class AbstractTest:
256
256
  # It seems file can be different on disk, but contain the same column/row values.
257
257
  # so for these, do the inmemory comparison.
258
258
  try:
259
- AbstractTest._validate_table_files(expected, src, drop_columns)
259
+ cls._validate_table_files(expected, src, drop_columns)
260
260
  except AssertionError as e:
261
261
  logger.info(f"Copying file with difference: {src} to {dest}")
262
262
  shutil.copyfile(src, dest)
@@ -65,7 +65,7 @@ class AbstractTransformLauncherTest(AbstractTest):
65
65
  Confirm that the two directories contains the same files.
66
66
  Stubbed out like this to allow spark tests to override this since spark tends to rename the files.
67
67
  """
68
- AbstractTest.validate_directory_contents(dir, expected, ignore_columns)
68
+ self.validate_directory_contents(dir, expected, ignore_columns)
69
69
 
70
70
  def _install_test_fixtures(self, metafunc):
71
71
  # Apply the fixtures for the method with these input names (i.e. test_transform()).
@@ -32,8 +32,7 @@ class TransformStatistics:
32
32
  :return: None
33
33
  """
34
34
  for key, val in stats.items():
35
- if val > 0:
36
- self.stats[key] = self.stats.get(key, 0) + val
35
+ self.stats[key] = self.stats.get(key, 0) + val
37
36
 
38
37
  def get_execution_stats(self) -> dict[str, Any]:
39
38
  """
@@ -15,8 +15,8 @@ import time
15
15
 
16
16
  from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
17
17
  from data_processing.runtime.pure_python import (
18
- PythonTransformRuntimeConfiguration,
19
18
  PythonTransformExecutionConfiguration,
19
+ PythonTransformRuntimeConfiguration,
20
20
  orchestrate,
21
21
  )
22
22
  from data_processing.runtime.transform_launcher import AbstractTransformLauncher