AWSGlueDataplanePython 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. awsglue/README.md +37 -0
  2. awsglue/__init__.py +15 -0
  3. awsglue/context.py +690 -0
  4. awsglue/data_sink.py +49 -0
  5. awsglue/data_source.py +49 -0
  6. awsglue/dataframe_transforms/__init__.py +17 -0
  7. awsglue/dataframe_transforms/apply_mapping.py +76 -0
  8. awsglue/dataframereader.py +41 -0
  9. awsglue/dataframewriter.py +21 -0
  10. awsglue/devutils.py +236 -0
  11. awsglue/dynamicframe.py +669 -0
  12. awsglue/functions.py +31 -0
  13. awsglue/glue_shell.py +38 -0
  14. awsglue/gluetypes.py +461 -0
  15. awsglue/job.py +59 -0
  16. awsglue/scripts/__init__.py +12 -0
  17. awsglue/scripts/activate_etl_connector.py +362 -0
  18. awsglue/scripts/connector_activation_util.py +38 -0
  19. awsglue/scripts/crawler_redo_from_backup.py +75 -0
  20. awsglue/scripts/crawler_undo.py +121 -0
  21. awsglue/scripts/scripts_utils.py +106 -0
  22. awsglue/streaming_data_source.py +28 -0
  23. awsglue/transforms/__init__.py +47 -0
  24. awsglue/transforms/apply_mapping.py +72 -0
  25. awsglue/transforms/coalesce.py +66 -0
  26. awsglue/transforms/collection_transforms.py +155 -0
  27. awsglue/transforms/drop_nulls.py +85 -0
  28. awsglue/transforms/dynamicframe_filter.py +66 -0
  29. awsglue/transforms/dynamicframe_map.py +72 -0
  30. awsglue/transforms/errors_as_dynamicframe.py +45 -0
  31. awsglue/transforms/field_transforms.py +469 -0
  32. awsglue/transforms/relationalize.py +105 -0
  33. awsglue/transforms/repartition.py +61 -0
  34. awsglue/transforms/resolve_choice.py +85 -0
  35. awsglue/transforms/transform.py +92 -0
  36. awsglue/transforms/unbox.py +112 -0
  37. awsglue/transforms/union.py +66 -0
  38. awsglue/transforms/unnest_frame.py +75 -0
  39. awsglue/utils.py +159 -0
  40. awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
  41. awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
  42. awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
  43. awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
  44. awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
  45. awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
awsglue/context.py ADDED
@@ -0,0 +1,690 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from pyspark.sql import SQLContext
14
+ from pyspark.sql import SparkSession
15
+ from py4j.java_gateway import java_import # type: ignore
16
+
17
+ from awsglue.data_source import DataSource
18
+ from awsglue.streaming_data_source import StreamingDataSource
19
+ from awsglue.data_sink import DataSink
20
+ from awsglue.dataframereader import DataFrameReader
21
+ from awsglue.dataframewriter import DataFrameWriter
22
+ from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
23
+ from awsglue.gluetypes import DataType
24
+ from awsglue.utils import makeOptions, callsite
25
+ from pyspark.sql.dataframe import DataFrame
26
+ import pyspark
27
+ import os
28
+ import re
29
+ import uuid
30
+ from py4j.java_gateway import JavaClass
31
+ import time
32
+ import logging
33
+
34
def register(sc):
    """Import the Glue JVM classes into the Py4J view of a SparkContext.

    :param sc: SparkContext whose ``_jvm`` view receives the imports
    """
    glue_jvm_imports = (
        "com.amazonaws.services.glue.*",
        "com.amazonaws.services.glue.log.GlueLogger",
        "com.amazonaws.services.glue.schema.*",
        "com.amazonaws.services.glue.util.JsonOptions",
        "org.apache.spark.sql.glue.util.SparkUtility",
        "com.amazonaws.services.glue.util.Job",
        "com.amazonaws.services.glue.util.AWSConnectionUtils",
        "com.amazonaws.services.glue.util.GluePythonUtils",
        "com.amazonaws.services.glue.errors.CallSite",
        "com.amazonaws.services.glue.ml.EntityDetector",
        "com.amazonaws.services.glue.dq.EvaluateDataQuality",
    )
    # Imports are issued in the order listed above.
    for qualified_name in glue_jvm_imports:
        java_import(sc._jvm, qualified_name)
    # Currently disabled imports, kept for reference:
    # java_import(sc._jvm, "com.amazonaws.services.glue.ml.FindMatches")
    # java_import(sc._jvm, "com.amazonaws.services.glue.ml.FindIncrementalMatches")
    # java_import(sc._jvm, "com.amazonaws.services.glue.ml.FillMissingValues")
49
+
50
+
51
+ class GlueContext(SQLContext):
52
+ Spark_SQL_Formats = {"parquet", "orc"}
53
+ Unsupported_Compression_Types = {"lzo"}
54
+
55
def __init__(self, sparkContext=None, **options):
    """Create a GlueContext from a SparkContext or a SparkSession.

    :param sparkContext: a SparkContext, a SparkSession (or subclass), or a
        falsy value. When falsy, the session returned by
        ``SparkSession.builder.getOrCreate()`` and its SparkContext are used.
    :param options: forwarded to ``_get_glue_scala_context``
        (``minPartitions`` / ``targetPartitions`` are read there)
    """
    if not sparkContext:
        spark_session = SparkSession.builder.getOrCreate()
        sparkContext = spark_session.sparkContext
    elif isinstance(sparkContext, SparkSession):
        # isinstance (rather than an exact type comparison) so that a
        # SparkSession subclass is unwrapped instead of being mistaken
        # for a SparkContext.
        spark_session = sparkContext
        sparkContext = spark_session.sparkContext
    else:
        spark_session = SparkSession.builder.getOrCreate()
    super(GlueContext, self).__init__(sparkContext, spark_session)
    register(sparkContext)
    self._glue_scala_context = self._get_glue_scala_context(**options)
    self.create_dynamic_frame = DynamicFrameReader(self)
    self.create_data_frame = DataFrameReader(self)
    self.write_dynamic_frame = DynamicFrameWriter(self)
    self.write_data_frame = DataFrameWriter(self)
    self.spark_session = self.sparkSession
    self._glue_logger = sparkContext._jvm.GlueLogger()
73
+
74
@property
def _ssql_ctx(self):
    """Return the JVM-side Glue context, building it on first access if it is not set yet."""
    try:
        return self._glue_scala_context
    except AttributeError:
        pass
    # No context has been stored yet: create one with default options.
    self._glue_scala_context = self._get_glue_scala_context()
    return self._glue_scala_context
79
+
80
def _get_glue_scala_context(self, **options):
    """Instantiate the JVM GlueContext for this Spark context.

    :param options: may carry ``minPartitions`` and/or ``targetPartitions``.
        When only one of them is given, its value is used for both.
    :return: the object returned by the JVM ``GlueContext`` constructor
    """
    if 'minPartitions' in options:
        lower_bound = options['minPartitions']
        target = options.get('targetPartitions', lower_bound)
    else:
        # Covers both "only targetPartitions given" and "neither given"
        # (in which case both values end up as None).
        lower_bound = target = options.get('targetPartitions')

    if lower_bound is not None:
        return self._jvm.GlueContext(self._jsc.sc(), lower_bound, target)
    return self._jvm.GlueContext(self._jsc.sc())
92
+
93
def getSource(self, connection_type, format=None, transformation_ctx="", push_down_predicate="", **options):
    """Creates a DataSource object.

    This can be used to read DynamicFrames from external sources.

    Example:
    >>> data_source = context.getSource("file", paths=["/in/path"])
    >>> data_source.setFormat("json")
    >>> myFrame = data_source.getFrame()

    :param connection_type: connection type passed to the JVM context; it is
        replaced by ``format`` for Spark SQL formats or unsupported compression
    :param format: optional data format name
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate passed through to the JVM source
    :param options: connection options; ``paths`` and ``compressionType`` are
        inspected here, and a ``callSite`` entry is added
    :return: DataSource wrapping the JVM source
    :raises Exception: if ``compressionType`` is unsupported and no format is given
    """
    options["callSite"] = callsite()
    compressionType = options.get("compressionType", "")
    unsupported_compression = compressionType in self.Unsupported_Compression_Types
    if unsupported_compression and format is None:
        raise Exception("When using compressionType {}, the format parameter must be specified.".format(compressionType))
    # For an unsupported compression type, fall back to the Spark SQL data source.
    if (format and format.lower() in self.Spark_SQL_Formats) or unsupported_compression:
        connection_type = format

    j_source = self._ssql_ctx.getSource(connection_type,
                                        makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

    prefix = None
    paths = options.get('paths')
    if paths is not None:
        # commonprefix returns '' (never None) when nothing is shared.
        prefix = os.path.commonprefix(paths)
        prefix = prefix.split(':')[-1]
        prefix = re.sub('[:/.]', '', prefix)

    # No paths, no common prefix, or a prefix that sanitised down to nothing:
    # fall back to a generated name.
    if not prefix:
        prefix = str(uuid.uuid1())
        prefix = re.sub('[-]', '_', prefix)

    return DataSource(j_source, self, prefix)
128
+
129
def getStreamingSource(self, connection_type, format=None, transformation_ctx="", push_down_predicate="", **options):
    """Creates a Streaming Data Source object.

    This can be used to read Dataframes from external sources.

    :param connection_type: connection type passed to the JVM context; it is
        replaced by ``format`` for Spark SQL formats
    :param format: optional data format name
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate passed through to the JVM source
    :param options: connection options; ``paths`` is inspected here, and a
        ``callSite`` entry is added
    :return: StreamingDataSource wrapping the JVM source
    """
    options["callSite"] = callsite()
    if format and format.lower() in self.Spark_SQL_Formats:
        connection_type = format

    j_source = self._ssql_ctx.getSource(connection_type,
                                        makeOptions(self._sc, options), transformation_ctx, push_down_predicate)

    prefix = None
    paths = options.get('paths')
    if paths is not None:
        # commonprefix returns '' (never None) when nothing is shared.
        prefix = os.path.commonprefix(paths)
        prefix = prefix.split(':')[-1]
        prefix = re.sub('[:/.]', '', prefix)

    # No paths, no common prefix, or a prefix that sanitised down to nothing:
    # fall back to a generated name.
    if not prefix:
        prefix = str(uuid.uuid1())
        prefix = re.sub('[-]', '_', prefix)

    return StreamingDataSource(j_source, self, prefix)
155
+
156
def get_catalog_schema_as_spark_schema(self, database=None, table_name=None, catalog_id=None):
    """Delegate to the JVM context's ``getCatalogSchemaAsSparkSchema`` and return its result.

    :param database: database in catalog
    :param table_name: table name
    :param catalog_id: catalog id, passed through unchanged
    """
    fetch_schema = self._ssql_ctx.getCatalogSchemaAsSparkSchema
    return fetch_schema(database, table_name, catalog_id)
158
+
159
def create_dynamic_frame_from_rdd(self, data, name, schema=None, sample_ratio=None, transformation_ctx=""):
    """Creates a DynamicFrame from an RDD.

    The data is first turned into a DataFrame via the parent class's
    ``createDataFrame`` and then wrapped with ``DynamicFrame.fromDF``.

    :param data: input passed to ``createDataFrame``
    :param name: name given to the resulting DynamicFrame
    :param schema: optional schema passed to ``createDataFrame``
    :param sample_ratio: optional sampling ratio passed to ``createDataFrame``
    :param transformation_ctx: accepted but not used by this method
    """
    parent = super(GlueContext, self)
    spark_df = parent.createDataFrame(data, schema, sample_ratio)
    return DynamicFrame.fromDF(spark_df, self, name)
164
+
165
def create_dynamic_frame_from_catalog(self, database=None, table_name=None, redshift_tmp_dir="",
                                      transformation_ctx="", push_down_predicate="", additional_options={},
                                      catalog_id=None, **kwargs):
    """
    Creates a DynamicFrame with catalog database, table name and an optional catalog id
    :param database: database in catalog (alternatively pass ``name_space`` as a keyword)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate passed through to the catalog source
    :param additional_options: extra options passed through to the catalog source
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param kwargs: remaining keyword arguments are forwarded to ``getFrame``
    :return: dynamic frame with potential errors
    """
    if "name_space" in kwargs:
        if database is not None:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        db = kwargs.pop("name_space")
    elif database is None:
        raise Exception("Parameter name_space or database is missing.")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    get_catalog_source = self._ssql_ctx.getCatalogSource
    j_source = get_catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                                  push_down_predicate,
                                  makeOptions(self._sc, additional_options), catalog_id)
    return DataSource(j_source, self, table_name).getFrame(**kwargs)
196
+
197
def create_data_frame_from_catalog(self, database=None, table_name=None, redshift_tmp_dir="",
                                   transformation_ctx="", push_down_predicate="", additional_options={},
                                   catalog_id=None, **kwargs):
    """
    Creates a DataFrame with catalog database, table name and an optional catalog id
    :param database: database in catalog (alternatively pass ``name_space`` as a keyword)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate passed through to the catalog source
    :param additional_options: extra options passed through to the catalog source
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param kwargs: only ``name_space`` is read; other entries are not used
    :return: data frame with potential errors
    """
    if "name_space" in kwargs:
        if database is not None:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        db = kwargs.pop("name_space")
    elif database is None:
        raise Exception("Parameter name_space or database is missing.")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    get_catalog_source = self._ssql_ctx.getCatalogSource
    j_source = get_catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                                  push_down_predicate,
                                  makeOptions(self._sc, additional_options), catalog_id)
    streaming_source = StreamingDataSource(j_source, self, table_name)
    return streaming_source.getFrame()
228
+
229
def create_dynamic_frame_from_options(self, connection_type, connection_options={},
                                      format=None, format_options={}, transformation_ctx="", push_down_predicate="", **kwargs):
    """Creates a DynamicFrame with the specified connection and format.

    Example:
    >>> myFrame = context.create_dynamic_frame_from_options(connection_type="file",
    >>>                                                     connection_options={"paths": ["/in/path"]},
    >>>                                                     format="json")

    :param connection_type: connection type handed to ``getSource``
    :param connection_options: options expanded into ``getSource`` keyword arguments
    :param format: optional data format name
    :param format_options: options passed to ``setFormat`` when it is called
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate handed to ``getSource``
    :param kwargs: forwarded to ``getFrame``
    :return: the result of ``getFrame`` on the source
    """
    source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # Match getSource, which compares the lower-cased format: a Spark SQL
    # format such as "Parquet" must not additionally get setFormat().
    is_spark_sql_format = bool(format) and format.lower() in self.Spark_SQL_Formats
    unsupported_compression = connection_options.get("compressionType", "") in self.Unsupported_Compression_Types
    if format and not is_spark_sql_format and not unsupported_compression:
        source.setFormat(format, **format_options)

    return source.getFrame(**kwargs)
245
+
246
def create_sample_dynamic_frame_from_catalog(self, database=None, table_name=None, num=None, sample_options={}, redshift_tmp_dir="",
                                             transformation_ctx="", push_down_predicate="", additional_options={},
                                             catalog_id=None, erieTxId="", asOfTime="", **kwargs):
    """
    return a list of sample dynamic records with catalog database, table name and an optional catalog id
    :param database: database in catalog (alternatively pass ``name_space`` as a keyword)
    :param table_name: table name
    :param num: number of sample records
    :param sample_options: options for sampling behavior, expanded into ``getSampleFrame``
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate passed through to the catalog source
    :param additional_options: extra options passed through to the catalog source
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :param erieTxId: accepted but not used by this method
    :param asOfTime: accepted but not used by this method
    :return: dynamic frame with potential errors
    """
    if "name_space" in kwargs:
        if database is not None:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        db = kwargs.pop("name_space")
    elif database is None:
        raise Exception("Parameter name_space or database is missing.")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    get_catalog_source = self._ssql_ctx.getCatalogSource
    j_source = get_catalog_source(db, table_name, redshift_tmp_dir, transformation_ctx,
                                  push_down_predicate,
                                  makeOptions(self._sc, additional_options), catalog_id)
    sample_source = DataSource(j_source, self, table_name)
    return sample_source.getSampleFrame(num, **sample_options)
278
+
279
def create_sample_dynamic_frame_from_options(self, connection_type, connection_options={}, num=None, sample_options={},
                                             format=None, format_options={}, transformation_ctx="", push_down_predicate="", **kwargs):
    """Creates a list of sample dynamic records with the specified connection and format.

    :param connection_type: connection type handed to ``getSource``
    :param connection_options: options expanded into ``getSource`` keyword arguments
    :param num: number of sample records, passed to ``getSampleFrame``
    :param sample_options: options expanded into ``getSampleFrame``
    :param format: optional data format name
    :param format_options: options passed to ``setFormat`` when it is called
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate handed to ``getSource``
    :param kwargs: accepted but not used by this method
    :return: the result of ``getSampleFrame`` on the source
    """
    source = self.getSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # Compare case-insensitively, as getSource does, so a Spark SQL format
    # written as e.g. "Parquet" does not also get setFormat().
    if format and format.lower() not in self.Spark_SQL_Formats:
        source.setFormat(format, **format_options)

    return source.getSampleFrame(num, **sample_options)
289
+
290
+
291
def create_data_frame_from_options(self, connection_type, connection_options={},
                                   format=None, format_options={}, transformation_ctx="", push_down_predicate="", **kwargs):
    """Creates a DataFrame with the specified connection and format. Used for streaming data sources

    :param connection_type: connection type handed to ``getStreamingSource``
    :param connection_options: options expanded into ``getStreamingSource`` keyword arguments
    :param format: optional data format name
    :param format_options: options passed to ``setFormat`` when it is called
    :param transformation_ctx: transformation context
    :param push_down_predicate: predicate handed to ``getStreamingSource``
    :param kwargs: accepted but not used by this method
    :return: the result of ``getFrame`` on the streaming source
    """
    source = self.getStreamingSource(connection_type, format, transformation_ctx, push_down_predicate, **connection_options)

    # Compare case-insensitively, as getStreamingSource does, so a Spark SQL
    # format written as e.g. "Parquet" does not also get setFormat().
    if format and format.lower() not in self.Spark_SQL_Formats:
        source.setFormat(format, **format_options)

    return source.getFrame()
301
+
302
def getSink(self, connection_type, format=None, transformation_ctx="", **options):
    """Gets a DataSink object.

    This can be used to write DynamicFrames to external targets.
    The format is checked against the SparkSQL formats first; when it is one
    of them, the format name is used in place of the connection type.

    Example:
    >>> data_sink = context.getSink("s3")
    >>> data_sink.setFormat("json")
    >>> data_sink.writeFrame(myFrame)

    :param connection_type: connection type passed to the JVM context
    :param format: optional data format name
    :param transformation_ctx: transformation context
    :param options: sink options, converted with ``makeOptions``
    :return: DataSink wrapping the JVM sink
    """
    sink_type = connection_type
    if format and format.lower() in self.Spark_SQL_Formats:
        sink_type = format

    build_sink = self._ssql_ctx.getSink
    j_sink = build_sink(sink_type, makeOptions(self._sc, options), transformation_ctx)
    return DataSink(j_sink, self)
319
+
320
def write_dynamic_frame_from_options(self, frame, connection_type, connection_options={},
                                     format=None, format_options={}, transformation_ctx=""):
    """
    Writes a DynamicFrame using the specified connection and format.

    All arguments are handed, in order, to ``write_from_options``.

    :param frame: the frame to write
    :param connection_type: s3, redshift, jdbc, dynamo and so on
    :param connection_options: like path, dbtable
    :param format: json, csv or other format, this is used for s3 or tape connection which supports multiple format
    :param format_options: delimiter and so on
    :param transformation_ctx: transformation context
    :return: dynamic_frame with potential errors

    >>> data_sink = context.write_dynamic_frame_from_options(frame,
    >>>                                                      connection_type="s3",
    >>>                                                      connection_options={"path": "/out/path"},
    >>>                                                      format="json")
    """
    forwarded = (frame, connection_type, connection_options,
                 format, format_options, transformation_ctx)
    return self.write_from_options(*forwarded)
339
+
340
def write_from_options(self, frame_or_dfc, connection_type,
                       connection_options={}, format={}, format_options={},
                       transformation_ctx="", **kwargs):
    """Write a DynamicFrame or DynamicFrameCollection with the given connection and format.

    :param frame_or_dfc: DynamicFrame or DynamicFrameCollection to write
    :param connection_type: connection type handed to ``getSink``
    :param connection_options: sink options; for a collection a copy with
        ``useFrameName`` set to True is used
    :param format: optional data format name (any falsy value means "none")
    :param format_options: options passed to ``setFormat`` when it is called
    :param transformation_ctx: transformation context
    :param kwargs: ``accumulator_size`` (> 0) is applied via ``setAccumulableSize``
    :return: the result of the sink's ``write``
    :raises TypeError: if ``frame_or_dfc`` is neither supported type
    """
    if isinstance(frame_or_dfc, DynamicFrameCollection):
        # Copy so the caller's dict (or the shared default) is not modified.
        new_options = dict(connection_options, useFrameName=True)
    elif isinstance(frame_or_dfc, DynamicFrame):
        new_options = connection_options
    else:
        raise TypeError("frame_or_dfc must be DynamicFrame or "
                        "DynamicFrameCollection. Got " +
                        str(type(frame_or_dfc)))

    # Handle parquet and ORC case, make sure to get the right SparkSQL sink
    sink = self.getSink(connection_type, format, transformation_ctx, **new_options)
    # Case-insensitive, matching the check getSink performs.
    if format and format.lower() not in self.Spark_SQL_Formats:
        sink.setFormat(format, **format_options)

    if 'accumulator_size' in kwargs and kwargs['accumulator_size'] > 0:
        sink.setAccumulableSize(kwargs['accumulator_size'])

    return sink.write(frame_or_dfc)
362
+
363
+ # Note that since the table name is included in the catalog specification,
364
+ # it doesn't make sense to include a version of this method for DFCs.
365
def write_dynamic_frame_from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="",
                                     transformation_ctx="", additional_options={}, catalog_id=None, **kwargs):
    """
    Writes a DynamicFrame to a location defined in the catalog's database, table name and an optional catalog id
    :param frame: dynamic frame to be written
    :param database: database in catalog (alternatively pass ``name_space`` as a keyword)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param additional_options: extra options passed through to the catalog sink
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dynamic frame with potential errors
    """
    if "name_space" in kwargs:
        if database is not None:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        db = kwargs.pop("name_space")
    elif database is None:
        raise Exception("Parameter name_space or database is missing.")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    get_catalog_sink = self._ssql_ctx.getCatalogSink
    j_sink = get_catalog_sink(db, table_name, redshift_tmp_dir, transformation_ctx,
                              makeOptions(self._sc, additional_options), catalog_id)
    catalog_sink = DataSink(j_sink, self)
    return catalog_sink.write(frame)
395
+
396
def write_data_frame_from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="",
                                  transformation_ctx="", additional_options={}, catalog_id=None, **kwargs):
    """
    Writes a frame through the catalog sink using the sink's ``writeDataFrame``
    :param frame: frame to be written
    :param database: database in catalog (alternatively pass ``name_space`` as a keyword)
    :param table_name: table name
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param additional_options: extra options passed through to the catalog sink
    :param catalog_id: catalog id, passed through to the catalog sink
    :return: the result of ``writeDataFrame``
    """
    if "name_space" in kwargs:
        if database is not None:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        db = kwargs.pop("name_space")
    elif database is None:
        raise Exception("Parameter name_space or database is missing.")
    else:
        db = database

    if table_name is None:
        raise Exception("Parameter table_name is missing.")

    get_catalog_sink = self._ssql_ctx.getCatalogSink
    j_sink = get_catalog_sink(db, table_name, redshift_tmp_dir, transformation_ctx,
                              makeOptions(self._sc, additional_options), catalog_id)
    catalog_sink = DataSink(j_sink, self)
    return catalog_sink.writeDataFrame(frame, self)
413
+
414
def write_dynamic_frame_from_jdbc_conf(self, frame, catalog_connection, connection_options={},
                                       redshift_tmp_dir="", transformation_ctx="", catalog_id=None):
    """
    Writes a DynamicFrame through a JDBC connection defined in the catalog.

    :param frame: dynamic frame to be written
    :param catalog_connection: catalog connection name, used to access JDBC configuration
    :param connection_options: dbtable and so on
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dynamic frame with potential errors
    """
    # Return the result of the write; previously it was dropped and the
    # method always returned None despite the documented return value.
    return self.write_from_jdbc_conf(frame, catalog_connection, connection_options, redshift_tmp_dir,
                                     transformation_ctx, catalog_id)
428
+
429
def write_from_jdbc_conf(self, frame_or_dfc, catalog_connection, connection_options={},
                         redshift_tmp_dir="", transformation_ctx="", catalog_id=None):
    """Write a DynamicFrame or DynamicFrameCollection through a catalog JDBC connection.

    :param frame_or_dfc: DynamicFrame or DynamicFrameCollection to write
    :param catalog_connection: catalog connection name, used to access JDBC configuration
    :param connection_options: sink options; for a collection a copy with
        ``useFrameName`` set to True is used
    :param redshift_tmp_dir: tmp dir
    :param transformation_ctx: transformation context
    :param catalog_id: catalog id, passed through to the JDBC sink
    :return: the result of the sink's ``write``
    :raises TypeError: if ``frame_or_dfc`` is neither supported type
    """
    if isinstance(frame_or_dfc, DynamicFrameCollection):
        # Copy so the caller's dict (or the shared default) is not modified.
        new_options = dict(connection_options, useFrameName=True)
    elif isinstance(frame_or_dfc, DynamicFrame):
        new_options = connection_options
    else:
        raise TypeError("frame_or_dfc must be DynamicFrame or "
                        "DynamicFrameCollection. Got " +
                        str(type(frame_or_dfc)))

    j_sink = self._ssql_ctx.getJDBCSink(catalog_connection, makeOptions(self._sc, new_options), redshift_tmp_dir,
                                        transformation_ctx, catalog_id)
    return DataSink(j_sink, self).write(frame_or_dfc)
444
+
445
def convert_resolve_option(self, path, action, target):
    """Translate a resolve action into a JVM ``ResolveSpec``.

    :param path: path the spec applies to
    :param action: "KeepAsStruct" or "Project" (compared case-insensitively)
    :param target: DataType to project to; required for the project action
    :return: the object returned by ``ResolveSpec.apply``
    :raises ValueError: for an unknown action, or a project action without
        a DataType target
    """
    normalized = action.upper()

    if normalized == "KEEPASSTRUCT":
        return self._jvm.ResolveSpec.apply(path, "make_struct")

    if normalized != "PROJECT":
        raise ValueError("Invalid resolve action {}. ".format(action) +
                         "Action must be one of KeepAsStruct and Project.")

    if target is None or not isinstance(target, DataType):
        raise ValueError("Target type must be specified with project action.")

    project_spec = "project:{}".format(target.typeName())
    return self._jvm.ResolveSpec.apply(path, project_spec)
457
+
458
def extract_jdbc_conf(self, connection_name, catalog_id=None):
    """
    Get the username, password, vendor and url from the connection object in the catalog.

    Delegates to the JVM context's ``extractJDBCConf``.

    :param connection_name: name of the connection in the catalog
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: dict with keys "user", "password", "vendor", "url"
    """
    extract = self._ssql_ctx.extractJDBCConf
    return extract(connection_name, catalog_id)
467
+
468
def purge_table(self, database, table_name, options={}, transformation_ctx="", catalog_id=None):
    """
    Delete files from s3 for the given catalog's database and table. If all files in a partition are deleted, that
    partition is deleted from the catalog too.

    Delegates to the JVM context's ``purgeTable``.

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param options: Options to filter files to be deleted and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be deleted.
            Files within the retention period in these partitions will not be deleted.
            "" - empty by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    purge = self._ssql_ctx.purgeTable
    purge(database, table_name, makeOptions(self._sc, options), transformation_ctx, catalog_id)
490
+
491
def purge_s3_path(self, s3_path, options={}, transformation_ctx=""):
    """
    Deletes files from a given s3 path recursively.

    Delegates to the JVM context's ``purgeS3Path``.

    :param s3_path: s3 path of the files to be deleted in the format s3://<bucket>/<prefix>/
    :param options: Options to filter files to be deleted and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not deleted.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    purge = self._ssql_ctx.purgeS3Path
    purge(s3_path, makeOptions(self._sc, options), transformation_ctx)
506
+
507
def transition_table(self, database, table_name, transition_to, options={}, transformation_ctx="", catalog_id=None):
    """
    Transitions the storage class of the files stored on s3 for the given catalog's database and table.

    Delegates to the JVM context's ``transitionTable``.

    :param database: database name in catalog
    :param table_name: table name in catalog
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        partitionPredicate: Partitions satisfying this predicate will be transitioned.
            Files within the retention period in these partitions will not be transitioned.
            "" - empty by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :param catalog_id: catalog id of the DataCatalog being accessed (account id of the data catalog).
                Set to None by default (None defaults to the catalog id of the calling account in the service)
    :return: void return type
    """
    transition = self._ssql_ctx.transitionTable
    transition(database, table_name, transition_to, makeOptions(self._sc, options),
               transformation_ctx, catalog_id)
533
+
534
def transition_s3_path(self, s3_path, transition_to, options={}, transformation_ctx=""):
    """
    Transition files in a given s3 path recursively.

    Delegates to the JVM context's ``transitionS3Path``.

    :param s3_path: s3 path of the files to be transitioned in the format s3://<bucket>/<prefix>/
    :param transition_to: S3 storage class to transition to
        https://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/s3/model/StorageClass.html
    :param options: Options to filter files to be transitioned and manifest file generation
        retentionPeriod: Number of hours. Files newer than the retention period will be retained.
            168 hours - (7 days) by default
        excludeStorageClasses: Files with storage class in the excludeStorageClasses set are not transitioned.
            Set() - empty set by default
        manifestFilePath: optional path for manifest file generation. All files that were successfully purged
            or transitioned will be recorded in Success.csv and those that failed in Failed.csv
        accountId: AWS accountId to run the Transition batch job. Mandatory for Transition transform
        roleArn: AWS role to run the Transition batch job. Mandatory for Transition transform
    :param transformation_ctx: transformation context (used in manifest file path)
    :return: void return type
    """
    transition = self._ssql_ctx.transitionS3Path
    transition(s3_path, transition_to, makeOptions(self._sc, options), transformation_ctx)
553
+
554
def get_logger(self):
    """Return the logger object stored on this context as ``_glue_logger``."""
    glue_logger = self._glue_logger
    return glue_logger
556
+
557
def currentTimeMillis(self):
    """Return the current wall-clock time as an integer number of milliseconds since the epoch."""
    millis = time.time() * 1000
    return int(round(millis))
559
+
560
def getSampleStreamingDynamicFrame(self, frame, options={}, batch_function=None):
    """Run a streaming query for a limited time and return a sample as a DynamicFrame.

    A streaming write of ``frame`` to a "memory" sink is started under a
    randomly generated query name, the method sleeps for the polling time,
    stops the query and wraps a ``select * ... limit N`` over that name.

    :param frame: streaming DataFrame (its ``writeStream`` is used)
    :param options: dict with
        windowSize: required, "<integer> <unit>" where the unit contains
            "second", "minute" or "hour"
        pollingTimeInMs: optional, defaults to 10000
        recordPollingLimit: optional, defaults to 100
    :param batch_function: optional function registered with ``foreachBatch``
    :return: DynamicFrame built from the sampled records
    :raises ValueError: if windowSize is missing or malformed, or if the
        window size is not smaller than the polling time
    """
    if "windowSize" not in options:
        raise ValueError("Missing windowSize argument")

    windowSize = options["windowSize"]
    pollingTimeInMs = int(options.get("pollingTimeInMs", 10000))
    recordPollingLimit = int(options.get("recordPollingLimit", 100))

    # Use a different implementation here due to Py4J limitation
    def convert_window_size_to_milis(window_size):
        """Convert "<integer> <unit>" into milliseconds."""
        if not isinstance(window_size, str):
            raise ValueError("Received invalid window size")
        chunks = window_size.strip().split(" ")
        # Exactly one space-separated quantity and unit are expected.
        if len(chunks) != 2:
            raise ValueError("Received invalid window size")
        unit = chunks[1].lower()
        if "second" in unit:
            multiplier = 1000
        elif "minute" in unit:
            multiplier = 1000 * 60
        elif "hour" in unit:
            multiplier = 1000 * 60 * 60
        else:
            raise ValueError("Received invalid window size")
        try:
            quantity = int(chunks[0])
        except ValueError:
            # Only the failed int() conversion is translated; anything else
            # (e.g. KeyboardInterrupt) is no longer swallowed.
            raise ValueError("Received invalid window size")
        return quantity * multiplier

    windowSizeInMilis = convert_window_size_to_milis(windowSize)
    if windowSizeInMilis >= pollingTimeInMs:
        raise ValueError("Polling time needs to be larger than window size")

    tableId = str(uuid.uuid4()).replace("-", "")
    writer = frame.writeStream\
        .trigger(processingTime=windowSize)\
        .queryName(tableId)\
        .format("memory")
    if batch_function is not None:
        writer = writer.foreachBatch(batch_function)

    query = writer.start()
    resultDF = self.spark_session.sql("select * from " + tableId + " limit " + str(recordPollingLimit))
    time.sleep(pollingTimeInMs / 1000)
    query.stop()
    return DynamicFrame.fromDF(resultDF, self, tableId)
607
+
608
+
609
def forEachBatch(self, frame, batch_function, options = {}):
    """Run ``batch_function`` on every micro batch of a streaming frame, with retries.

    The stream is started with ``foreachBatch`` and awaited inside an endless
    loop. When the query fails, it is restarted after a short back-off until
    more than ``batchMaxRetries`` consecutive attempts have failed; the retry
    counter is reset once a batch following a failure succeeds. The loop has no
    normal exit: if ``awaitTermination`` returns without raising, the query is
    simply started again.

    :param frame: streaming frame exposing ``writeStream``.
    :param batch_function: callable invoked as ``batch_function(data_frame, batchId)``
        for every non-empty micro batch.
    :param options: dict of settings (never mutated here):
        windowSize          required; passed as the trigger's ``processingTime``.
        checkpointLocation  required; passed as the ``checkpointLocation`` option.
        persistDataFrame    "false" (string or bool False) skips persisting the
                            batch; any other value, or absence, persists it.
        storageLevel        name of a ``pyspark.StorageLevel`` attribute used when
                            persisting; "MEMORY_AND_DISK" by default.
        batchMaxRetries     integer in [0, 100]; 3 by default.
    :raises ValueError: if a required option is missing or batchMaxRetries is out
        of range.
    :raises Exception: the last failure once the retries are exhausted, or
        immediately when its message starts with "CheckpointMetadataNotFound".
    """
    if "windowSize" not in options:
        raise ValueError("Missing windowSize argument")
    if "checkpointLocation" not in options:
        raise ValueError("Missing checkpointLocation argument")

    windowSize = options["windowSize"]
    checkpointLocation = options["checkpointLocation"]

    # Validate before importing JVM classes or building the stream, so a bad
    # configuration fails fast.
    batch_max_retries = int(options.get('batchMaxRetries', 3))
    if batch_max_retries < 0 or batch_max_retries > 100:
        raise ValueError('Please specify the number of retries as an integer in the range of [0, 100].')

    # str() lets a bool False behave like the string "false" instead of
    # failing on a missing .lower() method.
    persist_data_frame = str(options.get("persistDataFrame", "true")).lower() != "false"

    java_import(self._jvm, "org.apache.spark.metrics.source.StreamingSource")

    # Mutable cells shared between the batch callback and the retry loop.
    run = {'value': 0}
    retry_attempt = {'value': 0}

    def batch_function_with_persist(data_frame, batchId):

        # This condition is true when the previous batch succeeded
        if run['value'] > retry_attempt['value']:
            run['value'] = 0
            if retry_attempt['value'] > 0:
                retry_attempt['value'] = 0
                logging.info("The previous batch was succeeded. Reset the retry attempt counter to 0.")
        run['value'] += 1

        # process the batch
        startTime = self.currentTimeMillis()
        if not persist_data_frame:
            if len(data_frame.take(1)):
                batch_function(data_frame, batchId)
        else:
            storage_level = options.get("storageLevel", "MEMORY_AND_DISK").upper()
            data_frame.persist(getattr(pyspark.StorageLevel, storage_level))
            try:
                num_records = data_frame.count()
                if num_records > 0:
                    batch_function(data_frame, batchId)
            finally:
                # Release the persisted data even when the batch fails;
                # otherwise every retried batch would leave a copy behind.
                data_frame.unpersist()
            self._jvm.StreamingSource.updateNumRecords(num_records)
        self._jvm.StreamingSource.updateBatchProcessingTimeInMs(self.currentTimeMillis() - startTime)

    query = frame.writeStream.foreachBatch(batch_function_with_persist).trigger(processingTime=windowSize).option("checkpointLocation", checkpointLocation)

    while True:
        try:
            if retry_attempt['value'] > 0:
                logging.warning("Retrying micro batch processing, attempt {} out of {}. ".format(retry_attempt['value'], batch_max_retries))
            query.start().awaitTermination()
        except Exception as e:

            # Errors whose message starts with this marker are never retried.
            if str(e).startswith("CheckpointMetadataNotFound"):
                raise

            retry_attempt['value'] += 1

            if retry_attempt['value'] > batch_max_retries:
                self._glue_logger.error("Exceeded the maximum number of batch retries. Throwing the exception. ")
                raise

            # Back off 1s, then 2s, then 5s for every later attempt.
            backOffTime = retry_attempt['value'] if (retry_attempt['value'] < 3) else 5
            time.sleep(backOffTime)
672
+
673
def add_ingestion_time_columns(self, frame, time_granularity):
    """
    Appends ingestion time columns like ingest_year, ingest_month, ingest_day, ingest_hour, ingest_minute to the
    input DataFrame.
    :param frame: Input DataFrame in which to append the ingestion time columns.
    :param time_granularity: Time granularity until which to add the ingestion time columns.
    :return: DataFrame after appending the ingestion time columns.
    """
    return DataFrame(self._ssql_ctx.addIngestionTimeColumns(frame._jdf, time_granularity), frame.sql_ctx)
682
+
683
def start_transaction(self, read_only):
    """Start a transaction by forwarding to ``self._ssql_ctx.startTransaction``.

    :param read_only: passed through unchanged to ``startTransaction``.
    :return: whatever ``startTransaction`` returns (presumably a transaction
        id - confirm against the backing implementation).
    """
    backend = self._ssql_ctx
    return backend.startTransaction(read_only)
685
+
686
def commit_transaction(self, transaction_id, wait_for_commit=True):
    """Commit a transaction by forwarding to ``self._ssql_ctx.commitTransaction``.

    :param transaction_id: identifier passed through unchanged.
    :param wait_for_commit: flag passed through unchanged; True by default.
    :return: whatever ``commitTransaction`` returns.
    """
    backend = self._ssql_ctx
    outcome = backend.commitTransaction(transaction_id, wait_for_commit)
    return outcome
688
+
689
def cancel_transaction(self, transaction_id):
    """Cancel a transaction by forwarding to ``self._ssql_ctx.cancelTransaction``.

    :param transaction_id: identifier passed through unchanged.
    :return: whatever ``cancelTransaction`` returns.
    """
    backend = self._ssql_ctx
    return backend.cancelTransaction(transaction_id)