data-prep-toolkit 0.2.2.dev1__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/METADATA +33 -1
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/RECORD +34 -28
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/WHEEL +1 -1
- data_processing/data_access/data_access.py +4 -1
- data_processing/data_access/data_access_local.py +0 -11
- data_processing/data_access/data_access_s3.py +0 -11
- data_processing/runtime/pure_python/transform_file_processor.py +9 -3
- data_processing/runtime/pure_python/transform_orchestrator.py +30 -17
- data_processing/runtime/pure_python/transform_runtime.py +9 -1
- data_processing/runtime/transform_file_processor.py +53 -32
- data_processing/test_support/data_access/data_access_factory_test.py +12 -0
- data_processing/test_support/transform/__init__.py +9 -4
- data_processing/test_support/transform/noop_folder_transform.py +105 -0
- data_processing/test_support/transform/noop_transform.py +3 -3
- data_processing/transform/__init__.py +2 -0
- data_processing/transform/abstract_transform.py +16 -0
- data_processing/transform/binary_transform.py +3 -2
- data_processing/transform/folder_transform.py +40 -0
- data_processing/transform/transform_configuration.py +3 -3
- data_processing/utils/multilock.py +160 -0
- data_processing/utils/unrecoverable.py +13 -0
- data_processing_ray/runtime/ray/transform_file_processor.py +1 -0
- data_processing_ray/runtime/ray/transform_orchestrator.py +18 -10
- data_processing_ray/runtime/ray/transform_runtime.py +9 -1
- data_processing_ray/test_support/transform/__init__.py +1 -0
- data_processing_ray/test_support/transform/noop_folder_transform.py +56 -0
- data_processing_ray/test_support/transform/noop_transform.py +1 -3
- data_processing_spark/runtime/spark/runtime_configuration.py +13 -0
- data_processing_spark/runtime/spark/transform_file_processor.py +4 -1
- data_processing_spark/runtime/spark/transform_orchestrator.py +78 -15
- data_processing_spark/runtime/spark/transform_runtime.py +24 -6
- data_processing_spark/test_support/transform/__init__.py +1 -0
- data_processing_spark/test_support/transform/noop_folder_transform.py +53 -0
- {data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt +0 -0
data_processing_spark/runtime/spark/transform_orchestrator.py
CHANGED
@@ -10,24 +10,69 @@
 # limitations under the License.
 ################################################################################
 
+import os
+import socket
 import time
 import traceback
 from datetime import datetime
 
+import yaml
 from data_processing.data_access import DataAccessFactoryBase
-from data_processing.transform import TransformStatistics
+from data_processing.transform import TransformStatistics, AbstractFolderTransform
 from data_processing.utils import GB, get_logger
 from data_processing_spark.runtime.spark import (
+    SparkTransformExecutionConfiguration,
     SparkTransformFileProcessor,
     SparkTransformRuntimeConfiguration,
-    SparkTransformExecutionConfiguration,
 )
 from pyspark import SparkConf, SparkContext
+from pyspark.sql import SparkSession
 
 
 logger = get_logger(__name__)
 
 
+def _init_spark(runtime_config: SparkTransformRuntimeConfiguration) -> SparkSession:
+    server_port_https = int(os.getenv("KUBERNETES_SERVICE_PORT_HTTPS", "-1"))
+    if server_port_https == -1:
+        # running locally
+        spark_config = {"spark.driver.host": "127.0.0.1"}
+        return SparkSession.builder.appName(runtime_config.get_name()).config(map=spark_config).getOrCreate()
+    else:
+        # running in Kubernetes, use spark_profile.yml and
+        # environment variables for configuration
+        server_port = os.environ["KUBERNETES_SERVICE_PORT"]
+        master_url = f"k8s://https://kubernetes.default:{server_port}"
+
+        # Read Spark configuration profile
+        config_filepath = os.path.abspath(
+            os.path.join(os.getenv("SPARK_HOME"), "work-dir", "config", "spark_profile.yml")
+        )
+        with open(config_filepath, "r") as config_fp:
+            spark_config = yaml.safe_load(os.path.expandvars(config_fp.read()))
+        spark_config["spark.submit.deployMode"] = "client"
+
+        # configure the executor pods from template
+        executor_pod_template_file = os.path.join(
+            os.getenv("SPARK_HOME"),
+            "work-dir",
+            "src",
+            "templates",
+            "spark-executor-pod-template.yml",
+        )
+        spark_config["spark.kubernetes.executor.podTemplateFile"] = executor_pod_template_file
+        spark_config["spark.kubernetes.container.image.pullPolicy"] = "Always"
+
+        # Pass the driver IP address to the workers for callback
+        myservice_url = socket.gethostbyname(socket.gethostname())
+        spark_config["spark.driver.host"] = myservice_url
+        spark_config["spark.driver.bindAddress"] = "0.0.0.0"
+        spark_config["spark.decommission.enabled"] = True
+        logger.info(f"Launching Spark Session with configuration\n" f"{yaml.dump(spark_config, indent=2)}")
+        app_name = spark_config.get("spark.app.name", "my-spark-app")
+        return SparkSession.builder.master(master_url).appName(app_name).config(map=spark_config).getOrCreate()
+
+
 def orchestrate(
     runtime_config: SparkTransformRuntimeConfiguration,
     execution_configuration: SparkTransformExecutionConfiguration,
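
The new `_init_spark` helper picks local versus in-cluster mode by probing `KUBERNETES_SERVICE_PORT_HTTPS`, which Kubernetes injects into every container, and in cluster mode loads `spark_profile.yml` after expanding environment variables in its text. A minimal sketch of that expand-then-parse step; the profile content and the `NUM_EXECUTORS` variable are invented for illustration:

```python
# Sketch of the profile-loading step in _init_spark: env vars referenced
# in the YAML text are expanded *before* parsing.
import os

import yaml

profile_text = """
spark.app.name: my-transform
spark.executor.instances: ${NUM_EXECUTORS}
"""

os.environ.setdefault("NUM_EXECUTORS", "4")
spark_config = yaml.safe_load(os.path.expandvars(profile_text))
print(spark_config)  # {'spark.app.name': 'my-transform', 'spark.executor.instances': 4}
```
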
@@ -45,14 +90,17 @@ def orchestrate(
     logger.info(f"orchestrator started at {start_ts}")
     # create data access
     data_access = data_access_factory.create_data_access()
+    bcast_params = runtime_config.get_bcast_params(data_access_factory)
     if data_access is None:
         logger.error("No DataAccess instance provided - exiting")
         return 1
     # initialize Spark
-
-    sc =
+    spark_session = _init_spark(runtime_config)
+    sc = spark_session.sparkContext
+    # broadcast
     spark_runtime_config = sc.broadcast(runtime_config)
     daf = sc.broadcast(data_access_factory)
+    spark_bcast_params = sc.broadcast(bcast_params)
 
     def process_partition(iterator):
         """
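
Three read-only objects are now shipped to the executors as Spark broadcast variables: the runtime configuration, the data-access factory, and the new `bcast_params`. A self-contained sketch of the pattern, with a made-up payload and app name:

```python
# Broadcast pattern sketch: the driver serializes the value once and every
# executor task reads a local copy via .value.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").appName("bcast-demo").getOrCreate()
sc = spark.sparkContext

bcast = sc.broadcast({"blocked_domains": ["example.org"]})

def tag(n: int) -> int:
    # tasks read the broadcast copy instead of re-shipping it per task
    return n + len(bcast.value["blocked_domains"])

print(sc.parallelize([1, 2, 3]).map(tag).collect())  # [2, 3, 4]
spark.stop()
```
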
@@ -63,12 +111,16 @@ def orchestrate(
         # local statistics dictionary
         statistics = TransformStatistics()
         # create transformer runtime
+        bcast_params = spark_bcast_params.value
         d_access_factory = daf.value
         runtime_conf = spark_runtime_config.value
         runtime = runtime_conf.create_transform_runtime()
         # create file processor
         file_processor = SparkTransformFileProcessor(
-            data_access_factory=d_access_factory,
+            data_access_factory=d_access_factory,
+            runtime_configuration=runtime_conf,
+            statistics=statistics,
+            is_folder=is_folder,
         )
         first = True
         for f in iterator:
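
Note that `process_partition` constructs the file processor with `is_folder`, a name that is only assigned later in `orchestrate` (just before the `try:` block below). That is legal because Python closures resolve free variables when the inner function actually runs, not when it is defined:

```python
# Late binding in closures: the inner function looks up "flag" at call
# time, so assigning it after the def (but before the call) is fine.
def outer() -> bool:
    def inner() -> bool:
        return flag  # resolved when inner() executes
    flag = True      # assigned after inner is defined
    return inner()

print(outer())  # True
```
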
@@ -77,8 +129,11 @@ def orchestrate(
             logger.debug(f"partition {f}")
             # add additional parameters
             transform_params = (
-                runtime.get_transform_config(
-
+                runtime.get_transform_config(
+                    partition=int(f[1]), data_access_factory=d_access_factory, statistics=statistics
+                )
+                | bcast_params
+            )
             # create transform with partition number
             file_processor.create_transform(transform_params)
             first = False
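
The rebuilt `transform_params` expression merges the per-partition config with the broadcast parameters using the dict-union operator (`|`, PEP 584, Python 3.9+); on key collisions the right-hand operand wins. With invented values:

```python
# dict-union semantics used by "get_transform_config(...) | bcast_params":
per_partition = {"partition": 3, "threshold": 0.5}
bcast_params = {"threshold": 0.9, "blocked_domains": ["example.org"]}
print(per_partition | bcast_params)
# {'partition': 3, 'threshold': 0.9, 'blocked_domains': ['example.org']}
```
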
@@ -92,13 +147,20 @@ def orchestrate(
         return list(statistics.get_execution_stats().items())
 
     num_partitions = 0
+    is_folder = issubclass(runtime_config.get_transform_class(), AbstractFolderTransform)
     try:
-
-
-
-
-
-
+        if is_folder:
+            # folder transform
+            runtime = runtime_config.create_transform_runtime()
+            files = runtime.get_folders(data_access=data_access)
+            logger.info(f"Number of folders is {len(files)}")  # Get files to process
+        else:
+            # Get files to process
+            files, profile, retries = data_access.get_files_to_process()
+            if len(files) == 0:
+                logger.error("No input files to process - exiting")
+                return 0
+            logger.info(f"Number of files is {len(files)}, source profile {profile}")
         # process data
         logger.debug("Begin processing files")
         # process files split by partitions
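
Folder mode is selected once, up front, by testing whether the configured transform class derives from the new `AbstractFolderTransform`. A toy version of that `issubclass` dispatch, with invented class names:

```python
# issubclass-based dispatch, as in the orchestrator above:
class AbstractFolderTransform: ...
class MyFolderTransform(AbstractFolderTransform): ...
class MyFileTransform: ...

for clazz in (MyFolderTransform, MyFileTransform):
    mode = "folders" if issubclass(clazz, AbstractFolderTransform) else "files"
    print(f"{clazz.__name__} -> {mode}")
```
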
@@ -128,7 +190,7 @@ def orchestrate(
         memory = 0.0
         for i in range(executors.size()):
             memory += executors.toList().apply(i)._2()._1()
-        resources = {"cpus": cpus, "gpus": 0, "memory": round(memory/GB, 2), "object_store": 0}
+        resources = {"cpus": cpus, "gpus": 0, "memory": round(memory / GB, 2), "object_store": 0}
         input_params = runtime_config.get_transform_metadata() | execution_configuration.get_input_params()
         metadata = {
             "pipeline": execution_configuration.pipeline_id,
@@ -143,7 +205,8 @@ def orchestrate(
             "execution_stats": {
                 "num partitions": num_partitions,
                 "execution time, min": round((time.time() - start_time) / 60, 3),
-            }
+            }
+            | resources,
             "job_output_stats": stats,
         }
         logger.debug(f"Saving job metadata: {metadata}.")
data_processing_spark/runtime/spark/transform_runtime.py
CHANGED
@@ -12,7 +12,7 @@
 
 from typing import Any
 
-from data_processing.data_access import DataAccessFactoryBase
+from data_processing.data_access import DataAccessFactoryBase, DataAccess
 from data_processing.transform import TransformStatistics
 
 
@@ -28,25 +28,43 @@ class DefaultSparkTransformRuntime:
         """
         self.params = params
 
+    def get_folders(self, data_access: DataAccess) -> list[str]:
+        """
+        Get folders to process
+        :param data_access: data access
+        :return: list of folders to process
+        """
+        raise NotImplemented()
+
     def get_transform_config(
         self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics
     ) -> dict[str, Any]:
         """
         Get the dictionary of configuration that will be provided to the transform's initializer.
         This is the opportunity for this runtime to create a new set of configuration based on the
-        config/params provided to this instance's initializer.
-
-        expected by the transform in its initializer and/or transform() methods.
+        config/params provided to this instance's initializer.
+        :param partition - the partition assigned to this worker, needed by transforms like doc_id
         :param data_access_factory - data access factory class being used by the RayOrchestrator.
         :param statistics - reference to statistics actor
         :return: dictionary of transform init params
         """
         return self.params
 
+    def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
+        """Allows retrieving and broadcasting to all the workers very large
+        configuration parameters, like the list of document IDs to remove for
+        fuzzy dedup, or the list of blocked web domains for block listing. This
+        function is called by the spark runtime after spark initialization, and
+        before spark_context.parallelize()
+        :param data_access_factory - creates data_access object to download the large config parameter
+        """
+        return {}
+
     def compute_execution_stats(self, stats: TransformStatistics) -> None:
         """
         Update/augment the given statistics object with runtime-specific additions/modifications.
+        This method does not return a value; the job execution statistics are generally reported
+        as metadata by the Spark Orchestrator.
         :param stats: output of statistics as aggregated across all calls to all transforms.
-        :return: job execution statistics. These are generally reported as metadata by the Ray Orchestrator.
         """
-        pass
+        pass
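
A sketch of how a concrete runtime might override the two new hooks. This is not from the package: the class, the `_load_doc_ids` helper, and the payload are invented; only `get_input_folder()` (used by the NOOP folder runtime below) is taken from the diff.

```python
from typing import Any

from data_processing.data_access import DataAccess, DataAccessFactoryBase
from data_processing_spark.runtime.spark import DefaultSparkTransformRuntime


def _load_doc_ids(factory: DataAccessFactoryBase) -> list[str]:
    # stand-in for downloading a large shared artifact on the driver
    return ["doc-0001", "doc-0042"]


class MyDedupSparkRuntime(DefaultSparkTransformRuntime):
    def get_folders(self, data_access: DataAccess) -> list[str]:
        # treat the whole input folder as one unit of work
        return [data_access.get_input_folder()]

    def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]:
        # called once on the driver, after Spark init and before
        # parallelize(); the returned dict is broadcast to all workers
        return {"doc_ids_to_remove": _load_doc_ids(data_access_factory)}
```
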
data_processing_spark/test_support/transform/__init__.py
CHANGED
@@ -11,3 +11,4 @@
 ################################################################################
 
 from data_processing_spark.test_support.transform.noop_transform import NOOPSparkTransformConfiguration
+from data_processing_spark.test_support.transform.noop_folder_transform import NOOPFolderSparkTransformConfiguration
data_processing_spark/test_support/transform/noop_folder_transform.py
ADDED
@@ -0,0 +1,53 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+from data_processing.test_support.transform import NOOPFolderTransform, NOOPTransformConfiguration
+from data_processing.utils import get_logger
+from data_processing_spark.runtime.spark import SparkTransformLauncher
+from data_processing_spark.runtime.spark import SparkTransformRuntimeConfiguration, DefaultSparkTransformRuntime
+from data_processing.data_access import DataAccess
+
+
+logger = get_logger(__name__)
+
+
+class NOOPFolderSparkRuntime(DefaultSparkTransformRuntime):
+    def get_folders(self, data_access: DataAccess) -> list[str]:
+        """
+        Get folders to process
+        :param data_access: data access
+        :return: list of folders to process
+        """
+        return [data_access.get_input_folder()]
+
+
+class NOOPFolderSparkTransformConfiguration(SparkTransformRuntimeConfiguration):
+    """
+    Implements the SparkTransformConfiguration for NOOP as required by the PythonTransformLauncher.
+    NOOP does not use a RayRuntime class so the superclass only needs the base
+    python-only configuration.
+    """
+
+    def __init__(self):
+        """
+        Initialization
+        """
+        super().__init__(transform_config=NOOPTransformConfiguration(clazz=NOOPFolderTransform),
+                         runtime_class=NOOPFolderSparkRuntime)
+
+
+if __name__ == "__main__":
+    # create launcher
+    launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
+    logger.info("Launching noop transform")
+    # Launch the ray actor(s) to process the input
+    launcher.launch()
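
For context, a hypothetical local invocation of the new module; the `data_local_config` argument follows the toolkit's usual data-access CLI convention, and the flag spelling and paths here are assumptions, not taken from the diff:

```python
# Hypothetical run of the new folder NOOP transform (paths invented).
import sys

from data_processing_spark.runtime.spark import SparkTransformLauncher
from data_processing_spark.test_support.transform import NOOPFolderSparkTransformConfiguration

sys.argv = [
    "noop_folder_transform",
    "--data_local_config",
    "{'input_folder': '/tmp/noop/input', 'output_folder': '/tmp/noop/output'}",
]
launcher = SparkTransformLauncher(runtime_config=NOOPFolderSparkTransformConfiguration())
launcher.launch()
```
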
{data_prep_toolkit-0.2.2.dev1.dist-info → data_prep_toolkit-0.2.2.dev2.dist-info}/top_level.txt
RENAMED
File without changes