data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
  2. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
  3. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
  4. data_processing/data_access/data_access.py +4 -1
  5. data_processing/data_access/data_access_local.py +0 -11
  6. data_processing/data_access/data_access_s3.py +0 -11
  7. data_processing/runtime/pure_python/transform_file_processor.py +9 -3
  8. data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
  9. data_processing/runtime/pure_python/transform_runtime.py +9 -1
  10. data_processing/runtime/transform_file_processor.py +53 -32
  11. data_processing/test_support/data_access/data_access_factory_test.py +12 -0
  12. data_processing/test_support/transform/__init__.py +9 -4
  13. data_processing/test_support/transform/noop_folder_transform.py +105 -0
  14. data_processing/test_support/transform/noop_transform.py +3 -3
  15. data_processing/transform/__init__.py +2 -0
  16. data_processing/transform/abstract_transform.py +16 -0
  17. data_processing/transform/binary_transform.py +3 -2
  18. data_processing/transform/folder_transform.py +40 -0
  19. data_processing/transform/transform_configuration.py +3 -3
  20. data_processing/utils/multilock.py +160 -0
  21. data_processing/utils/unrecoverable.py +13 -0
  22. data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
  23. data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
  24. data_processing_ray/runtime/ray/transform_runtime.py +9 -1
  25. data_processing_ray/test_support/transform/__init__.py +1 -0
  26. data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
  27. data_processing_ray/test_support/transform/noop_transform.py +1 -3
  28. data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
  29. data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
  30. data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
  31. data_processing_spark/runtime/spark/transform_runtime.py +24 -6
  32. data_processing_spark/test_support/transform/__init__.py +1 -0
  33. data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
  34. {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
data_processing_spark/runtime/spark/transform_orchestrator.py
@@ -10,24 +10,69 @@
  # limitations under the License.
  ################################################################################

+ import os
+ import socket
  import time
  import traceback
  from datetime import datetime

+ import yaml
  from data_processing.data_access import DataAccessFactoryBase
- from data_processing.transform import TransformStatistics
+ from data_processing.transform import TransformStatistics, AbstractFolderTransform
  from data_processing.utils import GB, get_logger
  from data_processing_spark.runtime.spark import (
+     SparkTransformExecutionConfiguration,
      SparkTransformFileProcessor,
      SparkTransformRuntimeConfiguration,
-     SparkTransformExecutionConfiguration,
  )
  from pyspark import SparkConf, SparkContext
+ from pyspark.sql import SparkSession


  logger = get_logger(__name__)


+ def _init_spark(runtime_config: SparkTransformRuntimeConfiguration) -> SparkSession:
+     server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1"))
+     if server_port_https == -1:
+         # running locally
+         spark_config = {"spark.driver.host": "127.0.0.1"}
+         return SparkSession.builder.appName(runtime_config.get_name()).config(map=spark_config).getOrCreate()
+     else:
+         # running in Kubernetes, use spark_profile.yml and
+         # environment variables for configuration
+         server_port = os.environ["KUBERNETES_SERVICE_PORT"]
+         master_url = f"k8s://https://kubernetes.default:{server_port}"
+
+         # Read Spark configuration profile
+         config_filepath = os.path.abspath(
+             os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml")
+         )
+         with open(config_filepath, "r") as config_fp:
+             spark_config = yaml.safe_load(os.path.expandvars(config_fp.read()))
+         spark_config["spark.submit.deployMode"] = "client"
+
+         # configure the executor pods from template
+         executor_pod_template_file = os.path.join(
+             os.getenv("SPARK_HOME"),
+             "work-dir",
+             "src",
+             "templates",
+             "spark-executor-pod-template.yml",
+         )
+         spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file
+         spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always"
+
+         # Pass the driver IP address to the workers for callback
+         myservice_url = socket.gethostbyname(socket.gethostname())
+         spark_config["spark.driver.host"] = myservice_url
+         spark_config["spark.driver.bindAddress"] = "0.0.0.0"
+         spark_config["spark.decommission.enabled"] = True
+         logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}")
+         app_name = spark_config.get("spark.app.name", "my-spark-app")
+         return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate()
+
+
  def orchestrate(
      runtime_config: SparkTransformRuntimeConfiguration,
      execution_configuration: SparkTransformExecutionConfiguration,
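
The new `_init_spark` helper keys off `KUBERNETES_SERVICE_PORT_HTTPS`, an environment variable that Kubernetes injects into every pod, to decide between a local session and an in-cluster session submitted against the Kubernetes API server. A minimal standalone sketch of the same detection logic (illustrative, not part of the package):

    import os
    from pyspark.sql import SparkSession

    def build_session(app_name: str) -> SparkSession:
        # KUBERNETES_SERVICE_PORT_HTTPS is only defined inside a k8s pod
        if os.getenv("KUBERNETES_SERVICE_PORT_HTTPS") is None:
            # local run: pin the driver to the loopback interface
            return (
                SparkSession.builder.appName(app_name)
                .config("spark.driver.host", "127.0.0.1")
                .getOrCreate()
            )
        # in-cluster run: point the master at the Kubernetes API server
        master = f"k8s://https://kubernetes.default:{os.environ['KUBERNETES_SERVICE_PORT']}"
        return SparkSession.builder.master(master).appName(app_name).getOrCreate()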
@@ -45,14 +90,17 @@ def orchestrate(
      logger.info(f"orchestrator started at {start_ts}")
      # create data access
      data_access = data_access_factory.create_data_access()
+     bcast_params = runtime_config.get_bcast_params(data_access_factory)
      if data_access is None:
          logger.error("No DataAccess instance provided - exiting")
          return 1
      # initialize Spark
-     conf = SparkConf().setAppName(runtime_config.get_name()).set("spark.driver.host", "127.0.0.1")
-     sc = SparkContext(conf=conf)
+     spark_session = _init_spark(runtime_config)
+     sc = spark_session.sparkContext
+     # broadcast
      spark_runtime_config = sc.broadcast(runtime_config)
      daf = sc.broadcast(data_access_factory)
+     spark_bcast_params = sc.broadcast(bcast_params)

      def process_partition(iterator):
          """
@@ -63,12 +111,16 @@ def orchestrate(
          # local statistics dictionary
          statistics = TransformStatistics()
          # create transformer runtime
+         bcast_params = spark_bcast_params.value
          d_access_factory = daf.value
          runtime_conf = spark_runtime_config.value
          runtime = runtime_conf.create_transform_runtime()
          # create file processor
          file_processor = SparkTransformFileProcessor(
-             data_access_factory=d_access_factory, runtime_configuration=runtime_conf, statistics=statistics
+             data_access_factory=d_access_factory,
+             runtime_configuration=runtime_conf,
+             statistics=statistics,
+             is_folder=is_folder,
          )
          first = True
          for f in iterator:
@@ -77,8 +129,11 @@ def orchestrate(
              logger.debug(f"partition {f}")
              # add additional parameters
              transform_params = (
-                 runtime.get_transform_config(partition=int(f[1]), data_access_factory=d_access_factory,
-                                              statistics=statistics))
+                 runtime.get_transform_config(
+                     partition=int(f[1]), data_access_factory=d_access_factory, statistics=statistics
+                 )
+                 | bcast_params
+             )
              # create transform with partition number
              file_processor.create_transform(transform_params)
              first = False
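
The `| bcast_params` union relies on Python 3.9+ dict merge semantics: the broadcast parameters are folded into the per-partition transform config, with the right-hand operand winning on key collisions. For example:

    base = {"partition": 3}                                  # per-partition config
    bcast = {"partition": 0, "doc_ids_to_remove": ["d1"]}    # broadcast extras
    print(base | bcast)  # {'partition': 0, 'doc_ids_to_remove': ['d1']}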
@@ -92,13 +147,20 @@ def orchestrate(
          return list(statistics.get_execution_stats().items())

      num_partitions = 0
+     is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
      try:
-         # Get files to process
-         files, profile, retries = data_access.get_files_to_process()
-         if len(files) == 0:
-             logger.error("No input files to process - exiting")
-             return 0
-         logger.info(f"Number of files is {len(files)}, source profile {profile}")
+         if is_folder:
+             # folder transform
+             runtime = runtime_config.create_transform_runtime()
+             files = runtime.get_folders(data_access=data_access)
+             logger.info(f"Number of folders is {len(files)}")
+         else:
+             # Get files to process
+             files, profile, retries = data_access.get_files_to_process()
+             if len(files) == 0:
+                 logger.error("No input files to process - exiting")
+                 return 0
+             logger.info(f"Number of files is {len(files)}, source profile {profile}")
          # process data
          logger.debug("Begin processing files")
          # process files split by partitions
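
Folder transforms are detected up front with `issubclass` against the new `AbstractFolderTransform` base; instead of enumerating input files, the orchestrator asks the runtime for folders and parallelizes over folder names. A sketch of what such a transform might look like, modeled on the NOOP folder transform added in this release (the exact `transform` signature and the `get_folder_files` accessor are assumptions to verify against your toolkit version):

    from typing import Any
    from data_processing.transform import AbstractFolderTransform

    class MyFolderTransform(AbstractFolderTransform):
        def __init__(self, config: dict[str, Any]):
            super().__init__(config)
            # assumption: a DataAccess instance is injected via the transform params
            self.data_access = config.get("data_access")

        def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]:
            # a folder transform reads its own inputs and returns (bytes, output-name) pairs
            files, _ = self.data_access.get_folder_files(path=folder_name)
            output = [(data, name) for name, data in files.items()]
            return output, {"folders_processed": 1}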
@@ -128,7 +190,7 @@ def orchestrate(
          memory = 0.0
          for i in range(executors.size()):
              memory += executors.toList().apply(i)._2()._1()
-         resources = {"cpus": cpus, "gpus": 0, "memory": round(memory/GB, 2), "object_store": 0}
+         resources = {"cpus": cpus, "gpus": 0, "memory": round(memory / GB, 2), "object_store": 0}
          input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params()
          metadata = {
              "pipeline": execution_configuration.pipeline_id,
@@ -143,7 +205,8 @@ def orchestrate(
              "execution_stats": {
                  "num partitions": num_partitions,
                  "execution time, min": round((time.time() - start_time) / 60, 3),
-             } | resources,
+             }
+             | resources,
              "job_output_stats": stats,
          }
          logger.debug(f"Saving job metadata: {metadata}.")
data_processing_spark/runtime/spark/transform_runtime.py
@@ -12,7 +12,7 @@

  from typing import Any

- from data_processing.data_access import DataAccessFactoryBase
+ from data_processing.data_access import DataAccessFactoryBase, DataAccess
  from data_processing.transform import TransformStatistics

@@ -28,25 +28,43 @@ class DefaultSparkTransformRuntime:
          """
          self.params = params

+     def get_folders(self, data_access: DataAccess) -> list[str]:
+         """
+         Get folders to process
+         :param data_access: data access
+         :return: list of folders to process
+         """
+         raise NotImplementedError()
+
      def get_transform_config(
          self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
      ) -> dict[str, Any]:
          """
          Get the dictionary of configuration that will be provided to the transform's initializer.
          This is the opportunity for this runtime to create a new set of configuration based on the
-         config/params provided to this instance's initializer. This may include the addition
-         of new configuration data such as ray shared memory, new actors, etc, that might be needed and
-         expected by the transform in its initializer and/or transform() methods.
+         config/params provided to this instance's initializer.
+         :param partition - the partition assigned to this worker, needed by transforms like doc_id
          :param data_access_factory - data access factory class being used by the RayOrchestrator.
          :param statistics - reference to statistics actor
          :return: dictionary of transform init params
          """
          return self.params

+     def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
+         """Allows retrieving and broadcasting to all the workers very large
+         configuration parameters, like the list of document IDs to remove for
+         fuzzy dedup, or the list of blocked web domains for block listing. This
+         function is called by the Spark runtime after Spark initialization and
+         before spark_context.parallelize().
+         :param data_access_factory - creates the data_access object used to download the large config parameter
+         """
+         return {}
+
      def compute_execution_stats(self, stats: TransformStatistics) -> None:
          """
          Update/augment the given statistics object with runtime-specific additions/modifications.
+         This method does not return a value; the job execution statistics are generally reported
+         as metadata by the Spark Orchestrator.
          :param stats: output of statistics as aggregated across all calls to all transforms.
-         :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator.
          """
-         pass
+         pass
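
A concrete runtime can override both new hooks; for instance, a block-listing transform might enumerate input folders and pre-download its domain list on the driver so the orchestrator can broadcast it. A sketch under those assumptions (the class name, file path, and `get_file` accessor are illustrative, not toolkit API guarantees):

    from typing import Any
    from data_processing.data_access import DataAccess, DataAccessFactoryBase
    from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime

    class BlockListSparkRuntime(DefaultSparkTransformRuntime):
        def get_folders(self, data_access: DataAccess) -> list[str]:
            # process everything under the configured input root
            return [data_access.get_input_folder()]

        def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
            # download the large blocked-domain list once on the driver;
            # the orchestrator broadcasts the returned dict to all executors
            data_access = data_access_factory.create_data_access()
            content, _ = data_access.get_file("blocklist/domains.txt")  # assumed accessor
            return {"blocked_domains": set(content.decode("utf-8").split())}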
data_processing_spark/test_support/transform/__init__.py
@@ -11,3 +11,4 @@
  ################################################################################

  from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration
+ from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration
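
With the new export in place, tests can pick up the folder-based NOOP configuration straight from the package and drive it through the launcher, mirroring the `__main__` block of the module below:

    from data_processing_spark.runtime.spark import SparkTransformLauncher
    from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration

    launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
    launcher.launch()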
data_processing_spark/test_support/transform/noop_folder_transform.py (new file)
@@ -0,0 +1,53 @@
+ # (C) Copyright IBM Corp. 2024.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ # http://www.apache.org/licenses/LICENSE-2.0
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ ################################################################################
+
+ from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration
+ from data_processing.utils import get_logger
+ from data_processing_spark.runtime.spark import SparkTransformLauncher
+ from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
+ from data_processing.data_access import DataAccess
+
+
+ logger = get_logger(__name__)
+
+
+ class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime):
+     def get_folders(self, data_access: DataAccess) -> list[str]:
+         """
+         Get folders to process
+         :param data_access: data access
+         :return: list of folders to process
+         """
+         return [data_access.get_input_folder()]
+
+
+ class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+     """
+     Implements the SparkTransformRuntimeConfiguration for NOOP as required by the SparkTransformLauncher.
+     NOOP needs no custom runtime state beyond the folder enumeration provided by
+     NOOPFolderSparkRuntime.
+     """
+
+     def __init__(self):
+         """
+         Initialization
+         """
+         super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform),
+                          runtime_class=NOOPFolderSparkRuntime)
+
+
+ if __name__ == "__main__":
+     # create launcher
+     launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
+     logger.info("Launching noop transform")
+     # launch the Spark job to process the input
+     launcher.launch()