AWSGlueDataplanePython 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsglue/README.md +37 -0
- awsglue/__init__.py +15 -0
- awsglue/context.py +690 -0
- awsglue/data_sink.py +49 -0
- awsglue/data_source.py +49 -0
- awsglue/dataframe_transforms/__init__.py +17 -0
- awsglue/dataframe_transforms/apply_mapping.py +76 -0
- awsglue/dataframereader.py +41 -0
- awsglue/dataframewriter.py +21 -0
- awsglue/devutils.py +236 -0
- awsglue/dynamicframe.py +669 -0
- awsglue/functions.py +31 -0
- awsglue/glue_shell.py +38 -0
- awsglue/gluetypes.py +461 -0
- awsglue/job.py +59 -0
- awsglue/scripts/__init__.py +12 -0
- awsglue/scripts/activate_etl_connector.py +362 -0
- awsglue/scripts/connector_activation_util.py +38 -0
- awsglue/scripts/crawler_redo_from_backup.py +75 -0
- awsglue/scripts/crawler_undo.py +121 -0
- awsglue/scripts/scripts_utils.py +106 -0
- awsglue/streaming_data_source.py +28 -0
- awsglue/transforms/__init__.py +47 -0
- awsglue/transforms/apply_mapping.py +72 -0
- awsglue/transforms/coalesce.py +66 -0
- awsglue/transforms/collection_transforms.py +155 -0
- awsglue/transforms/drop_nulls.py +85 -0
- awsglue/transforms/dynamicframe_filter.py +66 -0
- awsglue/transforms/dynamicframe_map.py +72 -0
- awsglue/transforms/errors_as_dynamicframe.py +45 -0
- awsglue/transforms/field_transforms.py +469 -0
- awsglue/transforms/relationalize.py +105 -0
- awsglue/transforms/repartition.py +61 -0
- awsglue/transforms/resolve_choice.py +85 -0
- awsglue/transforms/transform.py +92 -0
- awsglue/transforms/unbox.py +112 -0
- awsglue/transforms/union.py +66 -0
- awsglue/transforms/unnest_frame.py +75 -0
- awsglue/utils.py +159 -0
- awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
- awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
- awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
- awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
awsglue/context.py
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from pyspark.sql import SQLContext
|
|
14
|
+
from pyspark.sql import SparkSession
|
|
15
|
+
from py4j.java_gateway import java_import # type: ignore
|
|
16
|
+
|
|
17
|
+
from awsglue.data_source import DataSource
|
|
18
|
+
from awsglue.streaming_data_source import StreamingDataSource
|
|
19
|
+
from awsglue.data_sink import DataSink
|
|
20
|
+
from awsglue.dataframereader import DataFrameReader
|
|
21
|
+
from awsglue.dataframewriter import DataFrameWriter
|
|
22
|
+
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
|
|
23
|
+
from awsglue.gluetypes import DataType
|
|
24
|
+
from awsglue.utils import makeOptions, callsite
|
|
25
|
+
from pyspark.sql.dataframe import DataFrame
|
|
26
|
+
import pyspark
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import uuid
|
|
30
|
+
from py4j.java_gateway import JavaClass
|
|
31
|
+
import time
|
|
32
|
+
import logging
|
|
33
|
+
|
|
34
|
+
def register(sc):
    """Import the Glue JVM classes into the py4j view of ``sc``.

    Every entry is passed to ``java_import`` together with ``sc._jvm``, in
    the order listed, so the short class names can later be resolved
    through ``sc._jvm``.

    :param sc: object exposing a py4j JVM view as ``_jvm``
        (presumably a SparkContext -- confirm against callers)
    """
    glue_java_imports = (
        "com.amazonaws.services.glue.*",
        "com.amazonaws.services.glue.log.GlueLogger",
        "com.amazonaws.services.glue.schema.*",
        "com.amazonaws.services.glue.util.JsonOptions",
        "org.apache.spark.sql.glue.util.SparkUtility",
        "com.amazonaws.services.glue.util.Job",
        "com.amazonaws.services.glue.util.AWSConnectionUtils",
        "com.amazonaws.services.glue.util.GluePythonUtils",
        "com.amazonaws.services.glue.errors.CallSite",
        "com.amazonaws.services.glue.ml.EntityDetector",
        "com.amazonaws.services.glue.dq.EvaluateDataQuality",
        # Intentionally not imported (kept disabled, as in the original):
        # "com.amazonaws.services.glue.ml.FindMatches",
        # "com.amazonaws.services.glue.ml.FindIncrementalMatches",
        # "com.amazonaws.services.glue.ml.FillMissingValues",
    )
    for java_name in glue_java_imports:
        java_import(sc._jvm, java_name)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class GlueContext(SQLContext):
|
|
52
|
+
Spark_SQL_Formats = {"parquet", "orc"}
|
|
53
|
+
Unsupported_Compression_Types = {"lzo"}
|
|
54
|
+
|
|
55
|
+
def __init__(self, sparkContext=None, **options):
    """Build a GlueContext on top of a SparkContext or SparkSession.

    :param sparkContext: a SparkContext, a SparkSession, or a falsy value.
        When falsy, a session is obtained with
        ``SparkSession.builder.getOrCreate()`` and its context is used.
        When a SparkSession (or subclass instance) is given, its
        ``sparkContext`` attribute is used.
    :param options: forwarded to ``_get_glue_scala_context``; that method
        reads ``minPartitions`` and ``targetPartitions``.
    """
    if not sparkContext:
        spark_session = SparkSession.builder.getOrCreate()
        sparkContext = spark_session.sparkContext
    elif isinstance(sparkContext, SparkSession):
        # isinstance (rather than an exact type comparison) so SparkSession
        # subclasses are unwrapped too instead of being treated as a
        # SparkContext below.
        spark_session = sparkContext
        sparkContext = spark_session.sparkContext
    else:
        spark_session = SparkSession.builder.getOrCreate()
    super(GlueContext, self).__init__(sparkContext, spark_session)
    # The Glue classes must be imported into the JVM view before the Scala
    # GlueContext is instantiated.
    register(sparkContext)
    self._glue_scala_context = self._get_glue_scala_context(**options)
    # Reader / writer facades bound to this context.
    self.create_dynamic_frame = DynamicFrameReader(self)
    self.create_data_frame = DataFrameReader(self)
    self.write_dynamic_frame = DynamicFrameWriter(self)
    self.write_data_frame = DataFrameWriter(self)
    self.spark_session = self.sparkSession
    self._glue_logger = sparkContext._jvm.GlueLogger()
|
|
73
|
+
|
|
74
|
+
@property
def _ssql_ctx(self):
    """Return the Scala GlueContext, creating it on first access.

    The instance is cached in ``self._glue_scala_context``; when that
    attribute does not exist yet it is built with
    ``_get_glue_scala_context()`` using no options.
    """
    already_built = hasattr(self, '_glue_scala_context')
    if already_built:
        return self._glue_scala_context
    self._glue_scala_context = self._get_glue_scala_context()
    return self._glue_scala_context
|
|
79
|
+
|
|
80
|
+
def _get_glue_scala_context(self, **options):
    """Instantiate the JVM-side GlueContext.

    :param options: recognised keys are ``minPartitions`` and
        ``targetPartitions``. When only one of them is supplied, its value
        is used for both.
    :return: the object produced by ``self._jvm.GlueContext(...)``; it is
        called with the partition arguments only when a minimum partition
        count was resolved to something other than None.
    """
    if 'minPartitions' in options:
        lower_bound = options['minPartitions']
        target = options.get('targetPartitions', lower_bound)
    else:
        # Covers both "only targetPartitions given" and "neither given"
        # (in which case both values are None).
        lower_bound = target = options.get('targetPartitions')

    partition_args = () if lower_bound is None else (lower_bound, target)
    return self._jvm.GlueContext(self._jsc.sc(), *partition_args)
|
|
92
|
+
|
|
93
|
+
def getSource(self, connection_type, format = None, transformation_ctx = "", push_down_predicate= "", **options):
    """Creates a DataSource object.

    This can be used to read DynamicFrames from external sources.

    Example:
    >>> data_source = context.getSource("file", paths=["/in/path"])
    >>> data_source.setFormat("json")
    >>> myFrame = data_source.getFrame()

    :param connection_type: connection type handed to the Scala context;
        replaced by ``format`` when a Spark SQL format (parquet/orc) or an
        unsupported compression type (lzo) is requested
    :param format: optional data format
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param options: connection options; ``compressionType`` and ``paths``
        are inspected here, and ``callSite`` is added
    :return: DataSource named after the common prefix of ``paths`` or,
        when no usable prefix exists, after a generated UUID
    :raises Exception: if an unsupported compression type is given
        without a format
    """
    options["callSite"] = callsite()
    compression_type = options.get("compressionType", "")
    unsupported_compression = compression_type in self.Unsupported_Compression_Types
    if unsupported_compression and format is None:
        raise Exception("When using compressionType {}, the format parameter must be specified.".format(compression_type))
    # An unsupported compression type falls back to the Spark SQL datasource.
    if (format and format.lower() in self.Spark_SQL_Formats) or unsupported_compression:
        connection_type = format

    j_source = self._ssql_ctx.getSource(connection_type,
                                        makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

    prefix = None
    paths = options.get('paths')
    if paths is not None:
        prefix = os.path.commonprefix(paths)
        # Drop the URI scheme, then the remaining separators.
        prefix = prefix.split(':')[-1]
        prefix = re.sub('[:/.]', '', prefix)

    # os.path.commonprefix returns '' (never None) when the paths share
    # nothing, so test for emptiness to make the fallback actually apply
    # when paths is missing or there is no common prefix.
    if not prefix:
        prefix = str(uuid.uuid1())
        prefix = re.sub('[-]', '_', prefix)

    return DataSource(j_source, self, prefix)
|
|
128
|
+
|
|
129
|
+
def getStreamingSource(self, connection_type, format = None, transformation_ctx = "", push_down_predicate= "", **options):
    """Creates a Streaming Data Source object.

    This can be used to read Dataframes from external sources.

    :param connection_type: connection type handed to the Scala context;
        replaced by ``format`` when a Spark SQL format (parquet/orc) is
        requested
    :param format: optional data format
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param options: connection options; ``paths`` is inspected here and
        ``callSite`` is added
    :return: StreamingDataSource named after the common prefix of
        ``paths`` or, when no usable prefix exists, after a generated UUID
    """
    options["callSite"] = callsite()
    if format and format.lower() in self.Spark_SQL_Formats:
        connection_type = format

    j_source = self._ssql_ctx.getSource(connection_type,
                                        makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

    prefix = None
    paths = options.get('paths')
    if paths is not None:
        prefix = os.path.commonprefix(paths)
        # Drop the URI scheme, then the remaining separators.
        prefix = prefix.split(':')[-1]
        prefix = re.sub('[:/.]', '', prefix)

    # os.path.commonprefix returns '' (never None) when the paths share
    # nothing, so test for emptiness to make the fallback actually apply
    # when paths is missing or there is no common prefix.
    if not prefix:
        prefix = str(uuid.uuid1())
        prefix = re.sub('[-]', '_', prefix)

    return StreamingDataSource(j_source, self, prefix)
|
|
155
|
+
|
|
156
|
+
def get_catalog_schema_as_spark_schema(self, database = None, table_name = None, catalog_id = None):
    """Delegate to the Scala context's ``getCatalogSchemaAsSparkSchema``.

    :param database: database in catalog
    :param table_name: table name
    :param catalog_id: catalog id, passed through unchanged
    :return: whatever the Scala call returns
    """
    scala_ctx = self._ssql_ctx
    schema = scala_ctx.getCatalogSchemaAsSparkSchema(database, table_name, catalog_id)
    return schema
|
|
158
|
+
|
|
159
|
+
def create_dynamic_frame_from_rdd(self, data, name, schema=None, sample_ratio=None, transformation_ctx=""):
    """Creates a DynamicFrame from an RDD.

    The data is first turned into a DataFrame through the parent class'
    ``createDataFrame`` and then wrapped with ``DynamicFrame.fromDF``.

    :param data: input handed to ``createDataFrame``
    :param name: name given to the resulting DynamicFrame
    :param schema: optional schema handed to ``createDataFrame``
    :param sample_ratio: optional sampling ratio handed to ``createDataFrame``
    :param transformation_ctx: accepted but not used by this method
    :return: the DynamicFrame built from the intermediate DataFrame
    """
    parent = super(GlueContext, self)
    data_frame = parent.createDataFrame(data, schema, sample_ratio)
    return DynamicFrame.fromDF(data_frame, self, name)
|
|
164
|
+
|
|
165
|
+
def create_dynamic_frame_from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "",
                                      transformation_ctx = "", push_down_predicate="", additional_options = {},
                                      catalog_id = None, **kwargs):
    """
    Creates a DynamicFrame with catalog database, table name and an optional catalog id

    :param database: database in catalog (``name_space`` in kwargs is accepted as an alternative)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param additional_options: extra options, converted with makeOptions
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param kwargs: remaining keyword arguments are forwarded to ``getFrame``
    :return: dynamic frame with potential errors
    :raises Exception: when both or neither of database / name_space are
        given, or when table_name is missing
    """
    has_name_space = "name_space" in kwargs
    if has_name_space and database is not None:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    if not has_name_space and database is None:
        raise Exception("Parameter name_space or database is missing.")
    # name_space is popped so that it is not forwarded to getFrame().
    db = kwargs.pop("name_space") if has_name_space else database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    catalog_source = self._ssql_ctx.getCatalogSource
    j_source = catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                              push_down_predicate,
                              makeOptions(self._sc, additional_options), catalog_id)
    return DataSource(j_source, self, table_name).getFrame(**kwargs)
|
|
196
|
+
|
|
197
|
+
def create_data_frame_from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "",
                                   transformation_ctx = "", push_down_predicate="", additional_options = {},
                                   catalog_id = None, **kwargs):
    """
    Creates a DataFrame with catalog database, table name and an optional catalog id

    :param database: database in catalog (``name_space`` in kwargs is accepted as an alternative)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param additional_options: extra options, converted with makeOptions
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param kwargs: only ``name_space`` is read; nothing is forwarded to ``getFrame``
    :return: data frame with potential errors
    :raises Exception: when both or neither of database / name_space are
        given, or when table_name is missing
    """
    has_name_space = "name_space" in kwargs
    if has_name_space and database is not None:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    if not has_name_space and database is None:
        raise Exception("Parameter name_space or database is missing.")
    db = kwargs.pop("name_space") if has_name_space else database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    catalog_source = self._ssql_ctx.getCatalogSource
    j_source = catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                              push_down_predicate,
                              makeOptions(self._sc, additional_options), catalog_id)
    streaming_source = StreamingDataSource(j_source, self, table_name)
    return streaming_source.getFrame()
|
|
228
|
+
|
|
229
|
+
def create_dynamic_frame_from_options(self, connection_type, connection_options={},
                                      format=None, format_options={}, transformation_ctx = "", push_down_predicate= "", **kwargs):
    """Creates a DynamicFrame with the specified connection and format.

    Example:
    >>> myFrame = context.create_dynamic_frame_from_options(connection_type="file",
    >>>                                                     connection_options={"paths": ["/in/path"]},
    >>>                                                     format="json")

    :param connection_type: connection type passed to ``getSource``
    :param connection_options: options passed to ``getSource`` as keyword arguments
    :param format: optional data format
    :param format_options: options passed to ``setFormat`` as keyword arguments
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param kwargs: forwarded to ``getFrame``
    :return: the result of ``getFrame`` on the created source
    """
    source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # getSource() matches Spark SQL formats case-insensitively; use the same
    # rule here so that e.g. "Parquet" is not routed to the Spark SQL source
    # and then still handed to setFormat().
    is_spark_sql_format = bool(format) and format.lower() in self.Spark_SQL_Formats
    unsupported_compression = (
        connection_options.get("compressionType", "") in self.Unsupported_Compression_Types)
    if format and not is_spark_sql_format and not unsupported_compression:
        source.setFormat(format, **format_options)

    return source.getFrame(**kwargs)
|
|
245
|
+
|
|
246
|
+
def create_sample_dynamic_frame_from_catalog(self, database = None, table_name = None, num = None, sample_options = {}, redshift_tmp_dir = "",
                                             transformation_ctx = "", push_down_predicate="", additional_options = {},
                                             catalog_id = None, erieTxId = "", asOfTime = "", **kwargs):
    """
    Return sample dynamic records for a catalog database, table name and an optional catalog id

    :param database: database in catalog (``name_space`` in kwargs is accepted as an alternative)
    :param table_name: table name
    :param num: number of sample records
    :param sample_options: options for sampling behavior, passed to ``getSampleFrame`` as keyword arguments
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param additional_options: extra options, converted with makeOptions
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param erieTxId: accepted but not used by this method
    :param asOfTime: accepted but not used by this method
    :param kwargs: only ``name_space`` is read
    :return: the result of ``getSampleFrame`` on the catalog source
    :raises Exception: when both or neither of database / name_space are
        given, or when table_name is missing
    """
    has_name_space = "name_space" in kwargs
    if has_name_space and database is not None:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    if not has_name_space and database is None:
        raise Exception("Parameter name_space or database is missing.")
    db = kwargs.pop("name_space") if has_name_space else database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    catalog_source = self._ssql_ctx.getCatalogSource
    j_source = catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                              push_down_predicate,
                              makeOptions(self._sc, additional_options), catalog_id)
    return DataSource(j_source, self, table_name).getSampleFrame(num, **sample_options)
|
|
278
|
+
|
|
279
|
+
def create_sample_dynamic_frame_from_options(self, connection_type, connection_options={}, num = None, sample_options = {},
                                             format=None, format_options={}, transformation_ctx = "", push_down_predicate= "", **kwargs):
    """Creates a list of sample dynamic records with the specified connection and format.

    :param connection_type: connection type passed to ``getSource``
    :param connection_options: options passed to ``getSource`` as keyword arguments
    :param num: number of sample records
    :param sample_options: options passed to ``getSampleFrame`` as keyword arguments
    :param format: optional data format
    :param format_options: options passed to ``setFormat`` as keyword arguments
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param kwargs: accepted but not used by this method
    :return: the result of ``getSampleFrame`` on the created source
    """
    source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # getSource() matches Spark SQL formats case-insensitively; apply the
    # same rule so mixed-case names such as "ORC" skip setFormat() too.
    if format and format.lower() not in self.Spark_SQL_Formats:
        source.setFormat(format, **format_options)

    return source.getSampleFrame(num, **sample_options)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def create_data_frame_from_options(self, connection_type, connection_options={},
                                   format=None, format_options={}, transformation_ctx = "", push_down_predicate= "", **kwargs):
    """Creates a DataFrame with the specified connection and format. Used for streaming data sources

    :param connection_type: connection type passed to ``getStreamingSource``
    :param connection_options: options passed to ``getStreamingSource`` as keyword arguments
    :param format: optional data format
    :param format_options: options passed to ``setFormat`` as keyword arguments
    :param transformation_ctx: transformation context
    :param push_down_predicate: push down predicate
    :param kwargs: accepted but not used by this method
    :return: the result of ``getFrame`` on the created streaming source
    """
    source = self.getStreamingSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # getStreamingSource() matches Spark SQL formats case-insensitively;
    # apply the same rule so mixed-case names skip setFormat() too.
    if format and format.lower() not in self.Spark_SQL_Formats:
        source.setFormat(format, **format_options)

    return source.getFrame()
|
|
301
|
+
|
|
302
|
+
def getSink(self, connection_type, format = None, transformation_ctx = "", **options):
    """Gets a DataSink object.

    This can be used to write DynamicFrames to external targets.
    The SparkSQL formats are checked first so that the expected sink is
    returned: for those formats ``format`` replaces ``connection_type``.

    Example:
    >>> data_sink = context.getSink("s3")
    >>> data_sink.setFormat("json")
    >>> data_sink.writeFrame(myFrame)

    :param connection_type: connection type handed to the Scala context
    :param format: optional data format
    :param transformation_ctx: transformation context
    :param options: connection options, converted with makeOptions
    :return: DataSink wrapping the Scala sink
    """
    wants_spark_sql_sink = bool(format) and format.lower() in self.Spark_SQL_Formats
    if wants_spark_sql_sink:
        connection_type = format

    sink_factory = self._ssql_ctx.getSink
    j_options = makeOptions(self._sc, options)
    return DataSink(sink_factory(connection_type, j_options, transformation_ctx), self)
|
|
319
|
+
|
|
320
|
+
def write_dynamic_frame_from_options(self, frame, connection_type, connection_options={},
                                     format=None, format_options={}, transformation_ctx = ""):
    """
    Writes a DynamicFrame using the specified connection and format

    All arguments are forwarded positionally to ``write_from_options``.

    :param frame: frame to be written
    :param connection_type: s3, redshift, jdbc, dynamo and so on
    :param connection_options: like path, dbtable
    :param format: json, csv or other format, this is used for s3 or tape connection which supports multiple format
    :param format_options: delimiter and so on
    :param transformation_ctx: transformation context
    :return: dynamic_frame with potential errors

    >>> data_sink = context.write_dynamic_frame_from_options(frame,
    >>>                                                      connection_type="s3",
    >>>                                                      connection_options={"path": "/out/path"},
    >>>                                                      format="json")
    """
    forwarded = (frame, connection_type, connection_options,
                 format, format_options, transformation_ctx)
    return self.write_from_options(*forwarded)
|
|
339
|
+
|
|
340
|
+
def write_from_options(self, frame_or_dfc, connection_type,
                       connection_options={}, format={}, format_options={},
                       transformation_ctx = "", **kwargs):
    """Write a DynamicFrame or DynamicFrameCollection through a sink.

    :param frame_or_dfc: DynamicFrame or DynamicFrameCollection to write
    :param connection_type: connection type passed to ``getSink``
    :param connection_options: options passed to ``getSink``; for a
        collection a copy is made with ``useFrameName`` set to True
    :param format: optional data format (a falsy value means "none")
    :param format_options: options passed to ``setFormat`` as keyword arguments
    :param transformation_ctx: transformation context
    :param kwargs: ``accumulator_size`` is honoured when greater than 0
    :return: the result of ``sink.write``
    :raises TypeError: if ``frame_or_dfc`` is neither supported type
    """
    if isinstance(frame_or_dfc, DynamicFrameCollection):
        # Copy so the caller's (or the shared default) dict is not mutated.
        new_options = dict(list(connection_options.items())
                           + [("useFrameName", True)])
    elif isinstance(frame_or_dfc, DynamicFrame):
        new_options = connection_options
    else:
        # The original message ran the two words together
        # ("orDynamicFrameCollection"); a separating space is added.
        raise TypeError("frame_or_dfc must be DynamicFrame or "
                        "DynamicFrameCollection. Got " +
                        str(type(frame_or_dfc)))

    # Handle parquet and ORC case, make sure to get the right SparkSQL sink
    sink = self.getSink(connection_type, format, transformation_ctx, **new_options)
    # getSink() matches Spark SQL formats case-insensitively; apply the same
    # rule so mixed-case names such as "Parquet" skip setFormat() too.
    if format and format.lower() not in self.Spark_SQL_Formats:
        sink.setFormat(format, **format_options)

    if 'accumulator_size' in kwargs and kwargs['accumulator_size'] > 0:
        sink.setAccumulableSize(kwargs['accumulator_size'])

    return sink.write(frame_or_dfc)
|
|
362
|
+
|
|
363
|
+
# Note that since the table name is included in the catalog specification,
|
|
364
|
+
# it doesn't make sense to include a version of this method for DFCs.
|
|
365
|
+
def write_dynamic_frame_from_catalog(self, frame, database = None, table_name = None, redshift_tmp_dir = "",
                                     transformation_ctx = "", additional_options = {}, catalog_id = None, **kwargs):
    """
    Writes a DynamicFrame to a location defined in the catalog's database, table name and an optional catalog id

    :param frame: dynamic frame to be written
    :param database: database in catalog (``name_space`` in kwargs is accepted as an alternative)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param additional_options: extra options, converted with makeOptions
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param kwargs: only ``name_space`` is read
    :return: dynamic frame with potential errors
    :raises Exception: when both or neither of database / name_space are
        given, or when table_name is missing
    """
    has_name_space = "name_space" in kwargs
    if has_name_space and database is not None:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    if not has_name_space and database is None:
        raise Exception("Parameter name_space or database is missing.")
    db = kwargs.pop("name_space") if has_name_space else database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    catalog_sink = self._ssql_ctx.getCatalogSink
    j_sink = catalog_sink(db, table_name, redshift_tmp_dir, transformation_ctx,
                          makeOptions(self._sc, additional_options), catalog_id)
    data_sink = DataSink(j_sink, self)
    return data_sink.write(frame)
|
|
395
|
+
|
|
396
|
+
def write_data_frame_from_catalog(self, frame, database = None, table_name = None, redshift_tmp_dir = "",
                                  transformation_ctx = "", additional_options = {}, catalog_id = None, **kwargs):
    """
    Writes a DataFrame to a location defined in the catalog's database, table name and an optional catalog id

    :param frame: frame handed to the sink's ``writeDataFrame``
    :param database: database in catalog (``name_space`` in kwargs is accepted as an alternative)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param additional_options: extra options, converted with makeOptions
    :param catalog_id: catalog id, passed through to the Scala context
    :param kwargs: only ``name_space`` is read
    :return: the result of ``DataSink.writeDataFrame``
    :raises Exception: when both or neither of database / name_space are
        given, or when table_name is missing
    """
    has_name_space = "name_space" in kwargs
    if has_name_space and database is not None:
        raise Exception("Parameter name_space and database are both specified, choose one.")
    if not has_name_space and database is None:
        raise Exception("Parameter name_space or database is missing.")
    db = kwargs.pop("name_space") if has_name_space else database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    catalog_sink = self._ssql_ctx.getCatalogSink
    j_sink = catalog_sink(db, table_name, redshift_tmp_dir, transformation_ctx,
                          makeOptions(self._sc, additional_options), catalog_id)
    data_sink = DataSink(j_sink, self)
    return data_sink.writeDataFrame(frame, self)
|
|
413
|
+
|
|
414
|
+
def write_dynamic_frame_from_jdbc_conf(self, frame, catalog_connection, connection_options={},
                                       redshift_tmp_dir = "", transformation_ctx = "", catalog_id = None):
    """
    Writes a DynamicFrame using the JDBC configuration of a catalog connection

    :param frame: dynamic frame to be written
    :param catalog_connection: catalog connection name, used to access JDBC configuration
    :param connection_options: dbtable and so on
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dynamic frame with potential errors
    """
    # The result of write_from_jdbc_conf used to be discarded, so callers
    # always received None despite the documented return value.
    return self.write_from_jdbc_conf(frame, catalog_connection, connection_options, redshift_tmp_dir,
                                     transformation_ctx, catalog_id)
|
|
428
|
+
|
|
429
|
+
def write_from_jdbc_conf(self, frame_or_dfc, catalog_connection, connection_options={},
                         redshift_tmp_dir = "", transformation_ctx = "", catalog_id = None):
    """Write a DynamicFrame or DynamicFrameCollection through a JDBC sink.

    :param frame_or_dfc: DynamicFrame or DynamicFrameCollection to write
    :param catalog_connection: catalog connection name, used to access JDBC configuration
    :param connection_options: dbtable and so on; for a collection a copy
        is made with ``useFrameName`` set to True
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param catalog_id: catalog id, passed through to the Scala context
    :return: the result of ``DataSink.write``
    :raises TypeError: if ``frame_or_dfc`` is neither supported type
    """
    if isinstance(frame_or_dfc, DynamicFrameCollection):
        # Copy so the caller's (or the shared default) dict is not mutated.
        new_options = dict(list(connection_options.items())
                           + [("useFrameName", True)])
    elif isinstance(frame_or_dfc, DynamicFrame):
        new_options = connection_options
    else:
        # The original message ran the two words together
        # ("orDynamicFrameCollection"); a separating space is added.
        raise TypeError("frame_or_dfc must be DynamicFrame or "
                        "DynamicFrameCollection. Got " +
                        str(type(frame_or_dfc)))

    j_sink = self._ssql_ctx.getJDBCSink(catalog_connection, makeOptions(self._sc, new_options), redshift_tmp_dir,
                                        transformation_ctx, catalog_id)
    return DataSink(j_sink, self).write(frame_or_dfc)
|
|
444
|
+
|
|
445
|
+
def convert_resolve_option(self, path, action, target):
    """Translate a (path, action, target) triple into a JVM ResolveSpec.

    :param path: path handed to ``ResolveSpec.apply``
    :param action: ``KeepAsStruct`` or ``Project`` (compared case-insensitively)
    :param target: DataType instance; required for the Project action
    :return: the result of ``self._jvm.ResolveSpec.apply``
    :raises ValueError: for an unknown action, or a Project action whose
        target is not a DataType
    """
    normalized = action.upper()

    if normalized == "KEEPASSTRUCT":
        return self._jvm.ResolveSpec.apply(path, "make_struct")

    if normalized != "PROJECT":
        raise ValueError("Invalid resolve action {}. ".format(action) +
                         "Action must be one of KeepAsStruct and Project.")

    if target is None or not isinstance(target, DataType):
        raise ValueError("Target type must be specified with project action.")

    project_spec = "project:{}".format(target.typeName())
    return self._jvm.ResolveSpec.apply(path, project_spec)
|
|
457
|
+
|
|
458
|
+
def extract_jdbc_conf(self, connection_name, catalog_id=None):
    """
    Get the username, password, vendor and url from the connection object in the catalog

    :param connection_name: name of the connection in the catalog
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dict with keys "user", "password", "vendor", "url"
    """
    scala_ctx = self._ssql_ctx
    jdbc_conf = scala_ctx.extractJDBCConf(connection_name, catalog_id)
    return jdbc_conf
|
|
467
|
+
|
|
468
|
+
def purge_table(self, database, table_name, options={}, transformation_ctx="", catalog_id=None):
    """
    Delete files from s3 for the given catalog's database and table. If all files in a partition are deleted, that
    partition is deleted from the catalog too

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param options: Options to filter files to be deleted and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be deleted.
            Files within the retention period in these partitions will not be deleted.
            "" - empty by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    purge = self._ssql_ctx.purgeTable
    j_options = makeOptions(self._sc, options)
    purge(database, table_name, j_options, transformation_ctx, catalog_id)
|
|
490
|
+
|
|
491
|
+
def purge_s3_path(self, s3_path, options={}, transformation_ctx=""):
    """
    Deletes files from a given s3 path recursively

    :param s3_path: s3 path of the files to be deleted in the format s3://<bucket>/<prefix>/
    :param options: Options to filter files to be deleted and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    purge = self._ssql_ctx.purgeS3Path
    j_options = makeOptions(self._sc, options)
    purge(s3_path, j_options, transformation_ctx)
|
|
506
|
+
|
|
507
|
+
def transition_table(self, database, table_name, transition_to, options={}, transformation_ctx="", catalog_id=None):
    """
    Transitions the storage class of the files stored on s3 for the given catalog's database and table

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be transitioned.
            Files within the retention period in these partitions will not be transitioned.
            "" - empty by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
        Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    transition = self._ssql_ctx.transitionTable
    j_options = makeOptions(self._sc, options)
    transition(database, table_name, transition_to, j_options,
               transformation_ctx, catalog_id)
|
|
533
|
+
|
|
534
|
+
def transition_s3_path(self, s3_path, transition_to, options={}, transformation_ctx=""):
    """
    Transitions the storage class of the files in a given s3 path recursively.

    :param s3_path: s3 path of the files to be transitioned in the format s3://<bucket>/<prefix>/
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    # Resolve the target method first, then convert the Python dict with
    # makeOptions, so evaluation order matches a single inline call.
    run_transition = self._ssql_ctx.transitionS3Path
    converted_options = makeOptions(self._sc, options)
    run_transition(s3_path, transition_to, converted_options, transformation_ctx)
|
|
553
|
+
|
|
554
|
+
def get_logger(self):
    """
    Returns the logger object stored on this context as ``_glue_logger``.

    :return: the value of ``self._glue_logger``
    """
    glue_logger = self._glue_logger
    return glue_logger
|
|
556
|
+
|
|
557
|
+
def currentTimeMillis(self):
    """
    Returns the current time in milliseconds, rounded to the nearest integer.

    :return: ``time.time()`` scaled to milliseconds, as an int
    """
    now_in_millis = time.time() * 1000
    return int(round(now_in_millis))
|
|
559
|
+
|
|
560
|
+
def getSampleStreamingDynamicFrame(self, frame, options={}, batch_function=None):
    """
    Runs a streaming query over ``frame`` for a fixed polling period and returns a sample of its rows.

    The stream is written to a "memory" sink registered under a randomly generated query name,
    a ``select * ... limit`` over that name is prepared, the method sleeps for the polling period,
    stops the query and wraps the result in a DynamicFrame.

    :param frame: object exposing ``writeStream`` (presumably a streaming Spark DataFrame - confirm with callers)
    :param options: dict of sampling options
        windowSize: mandatory. String of the form "<integer> <unit>" where the unit contains
            "second", "minute" or "hour" (case-insensitive); also used as the trigger processing time.
        pollingTimeInMs: how long to let the query run, in milliseconds. 10000 by default.
            Must be strictly greater than the window size.
        recordPollingLimit: maximum number of rows selected. 100 by default.
    :param batch_function: optional callable attached to the writer through ``foreachBatch``
    :return: DynamicFrame built from the sampled DataFrame, named after the generated query name
    :raises ValueError: if windowSize is missing or malformed, or if the polling time is not
        larger than the window size
    """
    if "windowSize" not in options:
        raise ValueError("Missing windowSize argument")

    windowSize = options["windowSize"]
    pollingTimeInMs = int(options.get("pollingTimeInMs", 10000))
    recordPollingLimit = int(options.get("recordPollingLimit", 100))

    # Use a different implementation here due to Py4J limitation
    def convert_window_size_to_milis(window_size):
        """Converts a "<integer> <unit>" window size string to milliseconds."""
        if not isinstance(window_size, str):
            raise ValueError("Received invalid window size")
        # A string without a separating space yields a single chunk and is rejected below.
        chunks = window_size.strip().split(" ")
        if len(chunks) != 2:
            raise ValueError("Received invalid window size")
        unit = chunks[1].lower()
        if "second" in unit:
            multiplier = 1000
        elif "minute" in unit:
            multiplier = 1000 * 60
        elif "hour" in unit:
            multiplier = 1000 * 60 * 60
        else:
            raise ValueError("Received invalid window size")
        try:
            quantity = int(chunks[0])
        except ValueError:
            # Only a failed integer parse is translated; KeyboardInterrupt and
            # SystemExit are no longer swallowed by a bare except.
            raise ValueError("Received invalid window size")
        return quantity * multiplier

    windowSizeInMilis = convert_window_size_to_milis(windowSize)
    if windowSizeInMilis >= pollingTimeInMs:
        raise ValueError("Polling time needs to be larger than window size")

    # Dash-free UUID used both as the streaming query name and as the table name queried below.
    tableId = uuid.uuid4().hex
    writer = frame.writeStream\
        .trigger(processingTime=windowSize)\
        .queryName(tableId)\
        .format("memory")
    if batch_function is not None:
        writer = writer.foreachBatch(batch_function)

    query = writer.start()
    try:
        resultDF = self.spark_session.sql("select * from " + tableId + " limit " + str(recordPollingLimit))
        time.sleep(pollingTimeInMs / 1000)
    finally:
        # Always stop the query, even if the lookup or the sleep is interrupted,
        # so a failed sample does not leave a streaming query running.
        query.stop()
    return DynamicFrame.fromDF(resultDF, self, tableId)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def forEachBatch(self, frame, batch_function, options = {}):
    """
    Runs ``batch_function`` on every non-empty micro batch of a streaming frame, restarting the
    query with a short back-off when it fails.

    :param frame: object exposing ``writeStream`` (presumably a streaming Spark DataFrame - confirm with callers)
    :param batch_function: callable invoked as ``batch_function(data_frame, batchId)`` for each
        micro batch that contains at least one record
    :param options: dict of streaming options
        windowSize: mandatory. Passed to the trigger as the processing time.
        checkpointLocation: mandatory. Passed to the writer as the "checkpointLocation" option.
        batchMaxRetries: number of times a failed query is restarted. 3 by default, must be in [0, 100].
        persistDataFrame: when it reads "false" (case-insensitive) the batch is not persisted and
            emptiness is checked with take(1); otherwise the batch is persisted and counted.
        storageLevel: name of the pyspark.StorageLevel attribute used for persisting.
            "MEMORY_AND_DISK" by default.
    :return: no value is returned; the method keeps (re)starting the query until an exception propagates
    :raises ValueError: if windowSize or checkpointLocation is missing, or batchMaxRetries is out of range
    :raises Exception: the last failure once the retry budget is exhausted, or immediately when its
        message starts with "CheckpointMetadataNotFound"
    """
    if "windowSize" not in options:
        raise ValueError("Missing windowSize argument")
    if "checkpointLocation" not in options:
        raise ValueError("Missing checkpointLocation argument")

    windowSize = options["windowSize"]
    checkpointLocation = options["checkpointLocation"]

    # Validate every option before touching the JVM or the stream so bad input fails fast.
    batch_max_retries = int(options.get('batchMaxRetries', 3))
    if batch_max_retries < 0 or batch_max_retries > 100:
        raise ValueError('Please specify the number of retries as an integer in the range of [0, 100].')

    java_import(self._jvm, "org.apache.spark.metrics.source.StreamingSource")

    # Single-entry dicts so the nested function can update the counters in place.
    run = {'value': 0}
    retry_attempt = {'value': 0}

    def batch_function_with_persist(data_frame, batchId):

        # This condition is true when the previous batch succeeded
        if run['value'] > retry_attempt['value']:
            run['value'] = 0
            if retry_attempt['value'] > 0:
                retry_attempt['value'] = 0
                logging.info("The previous batch was succeeded. Reset the retry attempt counter to 0.")
        run['value'] += 1

        # process the batch
        startTime = self.currentTimeMillis()
        # str() lets callers pass a real bool as well as the "true"/"false" strings.
        if "persistDataFrame" in options and str(options["persistDataFrame"]).lower() == "false":
            if len(data_frame.take(1)):
                batch_function(data_frame, batchId)
        else:
            storage_level = options.get("storageLevel", "MEMORY_AND_DISK").upper()
            data_frame.persist(getattr(pyspark.StorageLevel, storage_level))
            try:
                num_records = data_frame.count()
                if num_records > 0:
                    batch_function(data_frame, batchId)
            finally:
                # Release the persisted batch even when the user function raises, so
                # failed attempts do not leave persisted data behind across retries.
                data_frame.unpersist()
            # The record count is only known (and only reported) on the persisting path.
            self._jvm.StreamingSource.updateNumRecords(num_records)
        self._jvm.StreamingSource.updateBatchProcessingTimeInMs(self.currentTimeMillis() - startTime)

    query = frame.writeStream.foreachBatch(batch_function_with_persist).trigger(processingTime=windowSize).option("checkpointLocation", checkpointLocation)

    # NOTE(review): there is no break, so if awaitTermination() returns without raising
    # the query is started again - confirm this is the intended behavior.
    while (True):
        try:
            if retry_attempt['value'] > 0:
                logging.warning("Retrying micro batch processing, attempt {} out of {}. ".format(retry_attempt['value'], batch_max_retries))
            query.start().awaitTermination()
        except Exception as e:

            # This failure is never retried.
            if str(e).startswith("CheckpointMetadataNotFound"):
                raise

            retry_attempt['value'] += 1

            if retry_attempt['value'] > batch_max_retries:
                self._glue_logger.error("Exceeded the maximum number of batch retries. Throwing the exception. ")
                raise

            # Back off 1s, then 2s, then a flat 5s for every later attempt.
            backOffTime = retry_attempt['value'] if (retry_attempt['value'] < 3) else 5
            time.sleep(backOffTime)
|
|
672
|
+
|
|
673
|
def add_ingestion_time_columns(self, frame, time_granularity):
    """
    Appends ingestion time columns like ingest_year, ingest_month, ingest_day, ingest_hour, ingest_minute to the
    input DataFrame.

    :param frame: Input DataFrame in which to append the ingestion time columns.
    :param time_granularity: Time Granularity until which to add the time granularity columns.
    :return: DataFrame after appending the time granularity columns.
    """
    # The description lives inside the function so it is the method's real
    # docstring rather than a discarded string statement before the def.
    return DataFrame(self._ssql_ctx.addIngestionTimeColumns(frame._jdf, time_granularity), frame.sql_ctx)
|
|
682
|
+
|
|
683
|
+
def start_transaction(self, read_only):
    """
    Starts a transaction by delegating to ``startTransaction`` on ``self._ssql_ctx``.

    :param read_only: forwarded unchanged to the underlying call
        (presumably a flag selecting a read-only transaction - confirm against the service API)
    :return: whatever the underlying ``startTransaction`` call returns
    """
    outcome = self._ssql_ctx.startTransaction(read_only)
    return outcome
|
|
685
|
+
|
|
686
|
+
def commit_transaction(self, transaction_id, wait_for_commit=True):
    """
    Commits a transaction by delegating to ``commitTransaction`` on ``self._ssql_ctx``.

    :param transaction_id: identifier of the transaction, forwarded unchanged
    :param wait_for_commit: forwarded unchanged to the underlying call. True by default
    :return: whatever the underlying ``commitTransaction`` call returns
    """
    outcome = self._ssql_ctx.commitTransaction(transaction_id, wait_for_commit)
    return outcome
|
|
688
|
+
|
|
689
|
+
def cancel_transaction(self, transaction_id):
    """
    Cancels a transaction by delegating to ``cancelTransaction`` on ``self._ssql_ctx``.

    :param transaction_id: identifier of the transaction, forwarded unchanged
    :return: whatever the underlying ``cancelTransaction`` call returns
    """
    outcome = self._ssql_ctx.cancelTransaction(transaction_id)
    return outcome
|