AWSGlueDataplanePython 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. awsglue/README.md +37 -0
  2. awsglue/__init__.py +15 -0
  3. awsglue/context.py +690 -0
  4. awsglue/data_sink.py +49 -0
  5. awsglue/data_source.py +49 -0
  6. awsglue/dataframe_transforms/__init__.py +17 -0
  7. awsglue/dataframe_transforms/apply_mapping.py +76 -0
  8. awsglue/dataframereader.py +41 -0
  9. awsglue/dataframewriter.py +21 -0
  10. awsglue/devutils.py +236 -0
  11. awsglue/dynamicframe.py +669 -0
  12. awsglue/functions.py +31 -0
  13. awsglue/glue_shell.py +38 -0
  14. awsglue/gluetypes.py +461 -0
  15. awsglue/job.py +59 -0
  16. awsglue/scripts/__init__.py +12 -0
  17. awsglue/scripts/activate_etl_connector.py +362 -0
  18. awsglue/scripts/connector_activation_util.py +38 -0
  19. awsglue/scripts/crawler_redo_from_backup.py +75 -0
  20. awsglue/scripts/crawler_undo.py +121 -0
  21. awsglue/scripts/scripts_utils.py +106 -0
  22. awsglue/streaming_data_source.py +28 -0
  23. awsglue/transforms/__init__.py +47 -0
  24. awsglue/transforms/apply_mapping.py +72 -0
  25. awsglue/transforms/coalesce.py +66 -0
  26. awsglue/transforms/collection_transforms.py +155 -0
  27. awsglue/transforms/drop_nulls.py +85 -0
  28. awsglue/transforms/dynamicframe_filter.py +66 -0
  29. awsglue/transforms/dynamicframe_map.py +72 -0
  30. awsglue/transforms/errors_as_dynamicframe.py +45 -0
  31. awsglue/transforms/field_transforms.py +469 -0
  32. awsglue/transforms/relationalize.py +105 -0
  33. awsglue/transforms/repartition.py +61 -0
  34. awsglue/transforms/resolve_choice.py +85 -0
  35. awsglue/transforms/transform.py +92 -0
  36. awsglue/transforms/unbox.py +112 -0
  37. awsglue/transforms/union.py +66 -0
  38. awsglue/transforms/unnest_frame.py +75 -0
  39. awsglue/utils.py +159 -0
  40. awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
  41. awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
  42. awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
  43. awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
  44. awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
  45. awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,669 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from __future__ import print_function
14
+ import json
15
+ import sys
16
+ from awsglue.utils import makeOptions, callsite
17
+ from awsglue.gluetypes import _deserialize_json_string, _create_dynamic_record, _revert_to_dict, _serialize_schema
18
+ from awsglue.utils import _call_site, _as_java_list, _as_scala_option, _as_resolve_choiceOption, iteritems, itervalues
19
+ from pyspark.rdd import RDD, PipelinedRDD
20
+ from pyspark.sql.dataframe import DataFrame
21
+ from pyspark.serializers import PickleSerializer, BatchedSerializer
22
+
23
# Python 2 built-in names kept as aliases so the rest of this module can keep
# using them unchanged on Python 3.
long = int
unicode = str
basestring = unicode
imap = map
ifilter = filter
27
+
28
class ResolveOption(object):
    """
    Describes how one ChoiceType path is resolved while converting
    DynamicRecords to a DataFrame.

    The action is one of "Project", "KeepAsStruct" and "Cast".
    """

    def __init__(self, path, action, target=None):
        """
        :param path: string, path name to the ChoiceType
        :param action: string, the resolve action to apply
        :param target: spark sql Datatype, optional target type for the action
        """
        self.path, self.action, self.target = path, action, target
42
+
43
+
44
+ class DynamicFrame(object):
45
+
46
def __init__(self, jdf, glue_ctx, name=""):
    """
    Wrap a JVM DynamicFrame handle.

    :param jdf: the underlying JVM DynamicFrame object
    :param glue_ctx: the GlueContext object; may be None
    :param name: name of this DynamicFrame
    """
    self._jdf = jdf
    self.glue_ctx = glue_ctx
    # The _sc lookup below already tolerates a missing context; guard the SQL
    # context the same way so construction does not fail with AttributeError.
    self._ssql_ctx = glue_ctx._ssql_ctx if glue_ctx is not None else None
    self._sc = glue_ctx and glue_ctx._sc
    self._schema = None    # cache for schema()
    self._lazy_rdd = None  # cache for the _rdd property
    self.name = name
54
+
55
@property
def _rdd(self):
    """Python RDD over the JVM frame, built on first access and cached afterwards."""
    if self._lazy_rdd is not None:
        return self._lazy_rdd
    java_rdd = self._jdf.javaToPython()
    serializer = BatchedSerializer(PickleSerializer())
    self._lazy_rdd = RDD(java_rdd, self._sc, serializer)
    return self._lazy_rdd
61
+
62
def with_frame_schema(self, schema):
    """
    Specify the schema up front so it does not have to be computed.

    :param schema: the schema to attach; serialized with _serialize_schema
    :return: a new DynamicFrame carrying the same name
    """
    serialized = _serialize_schema(schema)
    jdf_with_schema = self._jdf.pyWithFrameSchema(serialized)
    return DynamicFrame(jdf_with_schema, self.glue_ctx, self.name)
65
+
66
def schema(self):
    """
    Return the schema of this frame, reading it from the JVM frame on first use.

    :return: the deserialized schema (cached on the instance)
    :raises Exception: when an AttributeError occurs while reading or parsing the schema
    """
    if self._schema is not None:
        return self._schema
    try:
        schema_json = self._jdf.schema().toString()
        self._schema = _deserialize_json_string(schema_json)
    except AttributeError as err:
        raise Exception("Unable to parse datatype from schema. %s" % err)
    return self._schema
73
+
74
def show(self, num_rows=20):
    """
    Print the string the JVM frame renders for its rows.

    :param num_rows: number of rows passed to the JVM frame's showString
    """
    rendered = self._jdf.showString(num_rows)
    print(rendered)
76
+
77
def filter(self, f, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Keep the records for which f returns a true value.

    A KeyError, ValueError or TypeError raised by f drops the record. Any other
    exception flags the record as an error (isError / errorMessage) and keeps it.

    :param f: function taking a DynamicRecord and returning a truth value
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    dropping_errors = (KeyError, ValueError, TypeError)

    def keep_entry(entry):
        record = _create_dynamic_record(entry["record"])
        try:
            return f(record)
        except dropping_errors:
            return False
        except Exception as err:
            entry['isError'] = True
            entry['errorMessage'] = str(err)
            return True

    def per_partition(entries):
        return ifilter(keep_entry, entries)

    return self.mapPartitions(per_partition, True, transformation_ctx, info, stageThreshold, totalThreshold)
92
+
93
def mapPartitions(self, f, preservesPartitioning=True, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Apply f to the iterator of every partition.

    :param f: function taking an iterator over a partition and returning an iterator
    :param preservesPartitioning: forwarded unchanged to mapPartitionsWithIndex
    :return: whatever mapPartitionsWithIndex returns for the wrapped function
    """
    def without_index(_partition_index, entries):
        # mapPartitionsWithIndex supplies the partition index first; f does not take it.
        return f(entries)

    return self.mapPartitionsWithIndex(without_index, preservesPartitioning, transformation_ctx,
                                       info, stageThreshold, totalThreshold)
97
+
98
def map(self, f, preservesPartitioning=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Apply f to every record of the frame.

    A falsy result from f, or any exception raised by it, flags the record as
    an error (isError / errorMessage) and leaves the original record in place.

    :param f: function taking a DynamicRecord and returning a DynamicRecord
    :param preservesPartitioning: forwarded unchanged to mapPartitionsWithIndex
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    def apply_to_entry(entry):
        record = _create_dynamic_record(entry["record"])
        try:
            converted = _revert_to_dict(f(record))
            if not converted:
                entry['isError'] = True
                entry['errorMessage'] = "User-specified function returned None instead of DynamicRecord"
            else:
                entry["record"] = converted
        except Exception as err:
            entry['isError'] = True
            entry['errorMessage'] = str(err)
        return entry

    def per_partition(_partition_index, entries):
        return imap(apply_to_entry, entries)

    return self.mapPartitionsWithIndex(per_partition, preservesPartitioning, transformation_ctx,
                                       info, stageThreshold, totalThreshold)
116
+
117
def mapPartitionsWithIndex(self, f, preservesPartitioning=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Run f over every partition as a pipelined Python RDD and wrap the result
    in a new DynamicFrame.

    :param f: function taking (partition index, iterator) and returning an iterator
    :param preservesPartitioning: passed to PipelinedRDD
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame with the same name as this one
    """
    ctx = self.glue_ctx
    from_python_rdd = ctx._jvm.DynamicFrame.fromPythonRDD
    pipelined_jrdd = PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd
    new_jdf = from_python_rdd(self._jdf, pipelined_jrdd, ctx._ssql_ctx, transformation_ctx, self.name,
                              _call_site(self._sc, callsite(), info),
                              long(stageThreshold), long(totalThreshold))
    return DynamicFrame(new_jdf, ctx, self.name)
122
+
123
def printSchema(self):
    """Print the tree rendering of the JVM frame's schema."""
    tree = self._jdf.schema().treeString()
    print(tree)
125
+
126
def toDF(self, options=None):
    """
    Convert this frame to a DataFrame.

    Every action other than "KeepAsStruct" also needs a target type.

    :param options: list of ResolveOption, or None for no options
    :return: DataFrame
    :raises Exception: if an option other than "KeepAsStruct" has no target

    >>>toDF([ResolveOption("a.b.c", "KeepAsStruct")])
    >>>toDF([ResolveOption("a.b.c", "Project", DoubleType())])
    """
    resolve_options = [] if options is None else options
    converted = []

    for opt in resolve_options:
        target_missing = opt.target is None
        if target_missing and opt.action != "KeepAsStruct":
            raise Exception("Missing target type for resolve action %s." % opt.action)
        converted.append(self.glue_ctx.convert_resolve_option(opt.path, opt.action, opt.target))

    scala_seq = self.glue_ctx._jvm.PythonUtils.toSeq(converted)
    return DataFrame(self._jdf.toDF(scala_seq), self.glue_ctx)
145
+
146
@classmethod
def fromDF(cls, dataframe, glue_ctx, name=""):
    """
    Build a DynamicFrame from a spark sql DataFrame.

    :param dataframe: A spark sql DataFrame
    :param glue_ctx: the GlueContext object
    :param name: name of the result DynamicFrame
    :return: DynamicFrame
    """
    jvm_frame_cls = glue_ctx._jvm.DynamicFrame
    new_jdf = jvm_frame_cls.apply(dataframe._jdf, glue_ctx._ssql_ctx)
    return DynamicFrame(new_jdf, glue_ctx, name)
157
+
158
def unbox(self, path, format, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, **options):
    """
    Unbox a string field.

    :param path: full path to the StringNode you want to unbox
    :param format: format name of the boxed string, e.g. "avro" or "json"
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :param options: format options, sent to the JVM as a JSON string:
        separator: String,
        escaper: String,
        skipFirst: Boolean,
        withSchema: String, schema string should always be produced with StructType.json()
        withHeader: Boolean
    :return: a new DynamicFrame with unboxed DynamicRecords

    >>>unbox("a.b.c", "csv", separator="|")
    """
    options_json = json.dumps(options)
    site = _call_site(self._sc, callsite(), info)
    unboxed = self._jdf.unbox(path, format, options_json, transformation_ctx, site,
                              long(stageThreshold), long(totalThreshold))
    return DynamicFrame(unboxed, self.glue_ctx, self.name)
182
+
183
def drop_fields(self, paths, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Drop the given nodes from the frame.

    :param paths: List of strings, each the full path to a node you want to drop;
        a single string is treated as a one-element list
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    path_list = [paths] if isinstance(paths, basestring) else paths
    scala_paths = self.glue_ctx._jvm.PythonUtils.toSeq(path_list)
    site = _call_site(self._sc, callsite(), info)
    trimmed = self._jdf.dropFields(scala_paths, transformation_ctx, site,
                                   long(stageThreshold), long(totalThreshold))
    return DynamicFrame(trimmed, self.glue_ctx, self.name)
198
+
199
def select_fields(self, paths, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Select the given nodes from the frame.

    :param paths: List of strings, each the full path to a node you want to get;
        a single string is treated as a one-element list
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    path_list = [paths] if isinstance(paths, basestring) else paths
    scala_paths = self.glue_ctx._jvm.PythonUtils.toSeq(path_list)
    site = _call_site(self._sc, callsite(), info)
    selected = self._jdf.selectFields(scala_paths, transformation_ctx, site,
                                      long(stageThreshold), long(totalThreshold))
    return DynamicFrame(selected, self.glue_ctx, self.name)
214
+
215
def split_fields(self, paths, name1, name2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Split the given nodes off into a separate DynamicFrame.

    :param paths: List of strings, each the full path to a node you want to split into a new DynamicFrame;
        a single string is treated as a one-element list
    :param name1: name for the dynamic frame that is split off
    :param name2: name for the dynamic frame that remains of the original
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrameCollection with two DynamicFrames, the first containing the nodes that were split off,
        the second containing the nodes remaining on the original.
    """
    path_list = [paths] if isinstance(paths, basestring) else paths
    scala_paths = self.glue_ctx._jvm.PythonUtils.toSeq(path_list)
    site = _call_site(self._sc, callsite(), info)
    split_result = self._jdf.splitFields(scala_paths, transformation_ctx, site,
                                         long(stageThreshold), long(totalThreshold))
    jdfs = _as_java_list(self._sc, split_result)
    frames = {
        name1: DynamicFrame(jdfs[0], self.glue_ctx, name1),
        name2: DynamicFrame(jdfs[1], self.glue_ctx, name2),
    }
    return DynamicFrameCollection(frames, self.glue_ctx)
233
+
234
def split_rows(self, comparison_dict, name1, name2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Split the rows that satisfy the given comparisons off into a separate DynamicFrame.

    :param comparison_dict: a dictionary where the key is the path to a column and the value is another
        dictionary mapping comparators to the value to which the column will be compared.
        e.g. {"age": {">": 10, "<": 20}} splits rows where age is between 10 and 20 exclusive from those
        that do not meet this criteria.
    :param name1: name for the dynamic frame that is split off
    :param name2: name for the dynamic frame that remains of the original
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrameCollection with two DynamicFrames, the first containing the rows that were split off,
        the second containing the rows remaining on the original.
    """
    # Three parallel lists: entry i of each describes one (column, operator, operand) comparison.
    paths, operators, values = [], [], []
    for column, comparisons in comparison_dict.items():
        for operator, operand in comparisons.items():
            paths.append(column)
            operators.append(operator)
            values.append(long(operand) if isinstance(operand, int) else operand)

    to_seq = self.glue_ctx._jvm.PythonUtils.toSeq
    split_result = self._jdf.splitRows(to_seq(paths), to_seq(values), to_seq(operators),
                                       transformation_ctx, _call_site(self._sc, callsite(), info),
                                       long(stageThreshold), long(totalThreshold))
    jdfs = _as_java_list(self._sc, split_result)
    frames = {
        name1: DynamicFrame(jdfs[0], self.glue_ctx, name1),
        name2: DynamicFrame(jdfs[1], self.glue_ctx, name2),
    }
    return DynamicFrameCollection(frames, self.glue_ctx)
266
+
267
def rename_field(self, oldName, newName, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Rename a node of the frame.

    :param oldName: String, full path to the node you want to rename
    :param newName: String, new name including full path
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    site = _call_site(self._sc, callsite(), info)
    renamed = self._jdf.renameField(oldName, newName, transformation_ctx, site,
                                    long(stageThreshold), long(totalThreshold))
    return DynamicFrame(renamed, self.glue_ctx, self.name)
279
+
280
def write(self, connection_type, connection_options={},
          format=None, format_options={}, accumulator_size=0):
    """
    Write this frame through the GlueContext's write_from_options.

    :param connection_type: forwarded as connection_type
    :param connection_options: forwarded as connection_options
    :param format: forwarded as format
    :param format_options: forwarded as format_options
    :param accumulator_size: forwarded as accumulator_size
    :return: whatever write_from_options returns
    """
    sink_arguments = dict(
        frame_or_dfc=self,
        connection_type=connection_type,
        connection_options=connection_options,
        format=format,
        format_options=format_options,
        accumulator_size=accumulator_size,
    )
    return self.glue_ctx.write_from_options(**sink_arguments)
288
+
289
def count(self):
    """Return the count reported by the JVM frame."""
    jdf = self._jdf
    return jdf.count()
291
+
292
def spigot(self, path, options={}, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Call the JVM frame's spigot with the given path and options.

    :param path: path handed to the JVM spigot call
    :param options: dict of options, converted with makeOptions
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame with the same name as this one
    """
    java_options = makeOptions(self._sc, options)
    site = _call_site(self._sc, callsite(), info)
    new_jdf = self._jdf.spigot(path, java_options, transformation_ctx, site,
                               long(stageThreshold), long(totalThreshold))
    return DynamicFrame(new_jdf, self.glue_ctx, self.name)
297
+
298
def join(self, paths1, paths2, frame2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Join this frame with frame2 on the given key paths.

    :param paths1: key path(s) in this frame; a single string is treated as a one-element list
    :param paths2: key path(s) in frame2; a single string is treated as a one-element list
    :param frame2: the DynamicFrame to join with
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame named with both frame names concatenated
    """
    left_keys = [paths1] if isinstance(paths1, basestring) else paths1
    right_keys = [paths2] if isinstance(paths2, basestring) else paths2

    to_seq = self.glue_ctx._jvm.PythonUtils.toSeq
    joined = self._jdf.join(to_seq(left_keys), to_seq(right_keys), frame2._jdf, transformation_ctx,
                            _call_site(self._sc, callsite(), info),
                            long(stageThreshold), long(totalThreshold))
    return DynamicFrame(joined, self.glue_ctx, self.name + frame2.name)
305
+
306
def unnest(self, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Unnest a dynamic frame, i.e. flatten nested objects to top level elements.
    It also generates joinkeys for array objects.

    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: a new unnested dynamic frame

    >>>unnest()
    """
    site = _call_site(self._sc, callsite(), info)
    flattened = self._jdf.unnest(transformation_ctx, site, long(stageThreshold), long(totalThreshold))
    return DynamicFrame(flattened, self.glue_ctx, self.name)
319
+
320
def relationalize(self, root_table_name, staging_path, options={}, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Relationalize a dynamic frame, i.e. produce a list of frames generated by
    unnesting nested columns and pivoting array columns. A pivoted array
    column can be joined to the root table using the joinkey generated in the
    unnest phase.

    :param root_table_name: name for the root table
    :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted tables are read back
        from this path
    :param options: dict of optional parameters for relationalize
    :param transformation_ctx: context key to retrieve metadata about the current transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrameCollection keyed by the name each JVM frame reports
    """
    relationalized = self._jdf.relationalize(root_table_name, staging_path,
                                             makeOptions(self._sc, options),
                                             transformation_ctx, _call_site(self._sc, callsite(), info),
                                             long(stageThreshold), long(totalThreshold))
    frames = {}
    for jdf in _as_java_list(self._sc, relationalized):
        frames[jdf.getName()] = DynamicFrame(jdf, self.glue_ctx, jdf.getName())
    return DynamicFrameCollection(frames, self.glue_ctx)
342
+
343
def applyMapping(self, *args, **kwargs):
    """
    Camel-case entry point for apply_mapping.

    A leading DynamicFrame argument is dropped before forwarding: an earlier
    version passed args[1:] and the tests passed the frame as the first
    argument, so that call shape is still accepted to avoid regressions.
    """
    frame_passed_first = len(args) > 0 and isinstance(args[0], DynamicFrame)
    forwarded = args[1:] if frame_passed_first else args
    return self.apply_mapping(*forwarded, **kwargs)
351
+
352
def apply_mapping(self, mappings, case_sensitive=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Apply a list of field mappings to the frame.

    :param mappings: list of mapping tuples of length 2, 3 or 4; a single tuple is
        treated as a one-element list
    :param case_sensitive: forwarded to the JVM applyMapping call
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    :raises TypeError: if a mapping is not a tuple
    :raises ValueError: if a mapping tuple does not have 2, 3 or 4 elements
    """
    def _to_java_mapping(mapping_tup):
        if not isinstance(mapping_tup, tuple):
            # repr() keeps this message intact for any value; concatenating the
            # raw value raised an unrelated "can only concatenate str" TypeError.
            raise TypeError("Mapping must be specified as a tuple. Got " +
                            repr(mapping_tup))

        arity = len(mapping_tup)
        if arity not in (2, 3, 4):
            raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
                             "Got tuple of length " + str(arity))

        jvm = self.glue_ctx._jvm
        # scala.Tuple2 / Tuple3 / Tuple4, chosen by the mapping's length.
        scala_tuple_cls = getattr(jvm.scala, "Tuple%d" % arity)
        return jvm.MappingSpec.apply(scala_tuple_cls.apply(*mapping_tup))

    if isinstance(mappings, tuple):
        mappings = [mappings]

    mappings_list = [_to_java_mapping(m) for m in mappings]

    new_jdf = self._jdf.applyMapping(
        self.glue_ctx._jvm.PythonUtils.toSeq(mappings_list),
        case_sensitive,
        transformation_ctx,
        _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))

    return DynamicFrame(new_jdf, self.glue_ctx, self.name)
385
+
386
def unnest_ddb_json(self, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Call the JVM frame's unnestDDBJson and wrap the result.

    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame with the same name as this one
    """
    site = _call_site(self._sc, callsite(), info)
    unnested = self._jdf.unnestDDBJson(transformation_ctx, site,
                                       long(stageThreshold), long(totalThreshold))
    return DynamicFrame(unnested, self.glue_ctx, self.name)
389
+
390
def simplify_ddb_json(self):
    """
    Call the JVM frame's simplifyDDBJson and wrap the result.

    :return: DynamicFrame with the same name as this one
    """
    return DynamicFrame(self._jdf.simplifyDDBJson(), self.glue_ctx, self.name)
393
+
394
def resolveChoice(self, specs=None, choice="", database=None, table_name=None,
                  transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, catalog_id=None):
    """
    Resolve choice types in the frame. Exactly one of specs and choice must be given.

    :param specs: specification for choice type and corresponding resolve action, as a
        list of (path, action) tuples; a single tuple is treated as a one-element list
    :param choice: default resolve action applied when no specs are given
    :param database: Glue catalog database name, required for MATCH_CATALOG choice
    :param table_name: Glue catalog table name, required for MATCH_CATALOG choice
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :param catalog_id: optional catalog id, forwarded to the JVM call
    :return: a new DynamicFrame
    :raises Exception: if specs and choice are both missing or both specified
    """
    def _to_java_specs(specs_tup):
        path, action = specs_tup
        return self.glue_ctx._jvm.ResolveSpec.apply(path, action)

    # The messages name the real parameter ("choice"); they used to say "option",
    # which is not a parameter of this method.
    if specs is None and not choice:
        raise Exception("Parameter specs and choice are both missing, add one.")

    if specs is not None and choice:
        raise Exception("Parameter specs and choice are both specified, choose one.")

    if specs is None:
        specs = []

    if isinstance(specs, tuple):
        specs = [specs]

    specs_list = [_to_java_specs(m) for m in specs]

    choice_option = _as_scala_option(self._sc, _as_resolve_choiceOption(self._sc, choice))
    database_option = _as_scala_option(self._sc, database)
    table_name_option = _as_scala_option(self._sc, table_name)

    new_jdf = self._jdf.resolveChoice(
        self.glue_ctx._jvm.PythonUtils.toSeq(specs_list),
        choice_option, database_option, table_name_option,
        transformation_ctx,
        _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold),
        _as_scala_option(self._sc, catalog_id))

    return DynamicFrame(new_jdf, self.glue_ctx, self.name)
435
+
436
def mergeDynamicFrame(self, stage_dynamic_frame, primary_keys, transformation_ctx="", options={}, info="", stageThreshold=0, totalThreshold=0):
    """
    Merge this DynamicFrame with a staging DynamicFrame, matching records by the given primary keys.

    Duplicate records (records with the same primary keys) are not de-duplicated. All records
    (including duplicates) are retained from the source if there is no matching record in the
    staging frame. Where the staging frame has matching records, they overwrite the records in
    the source.

    :param stage_dynamic_frame: Staging DynamicFrame
    :param primary_keys: List of primary key fields to match records from source and staging dynamic frame;
        a single string is treated as a one-element list
    :param transformation_ctx: context key to retrieve metadata about the current transformation
    :param options: optional options for the transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame
    """
    key_list = [primary_keys] if isinstance(primary_keys, basestring) else primary_keys
    merged = self._jdf.mergeDynamicFrames(
        stage_dynamic_frame._jdf,
        self.glue_ctx._jvm.PythonUtils.toSeq(key_list),
        transformation_ctx,
        makeOptions(self._sc, options),
        _call_site(self._sc, callsite(), info),
        long(stageThreshold),
        long(totalThreshold))
    return DynamicFrame(merged, self.glue_ctx, self.name)
462
+
463
def union(self, other_frame, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """Returns a DynamicFrame containing all records in this frame and all records in other_frame.

    :param other_frame: DynamicFrame to union with this one.
    :param transformation_ctx: context key to retrieve metadata about the current transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame named after the JVM frame produced by the union
    """
    union_jdf = self._jdf.union(other_frame._jdf, transformation_ctx, _call_site(self._sc, callsite(), info),
                                long(stageThreshold), long(totalThreshold))
    # union_jdf is a JVM proxy object: plain attribute access (".name") yields a
    # method handle, not the string, so the name is read by calling getName(),
    # the same accessor relationalize() uses on JVM frames.
    return DynamicFrame(union_jdf, self.glue_ctx, union_jdf.getName())
476
+
477
def getNumPartitions(self):
    """Returns the number of partitions in the current DynamicFrame."""
    jdf = self._jdf
    return jdf.getNumPartitions()
480
+
481
def repartition(self, num_partitions, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Repartition the frame through the JVM frame's repartition call.

    :param num_partitions: partition count passed to the JVM call
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame with the same name as this one
    """
    site = _call_site(self._sc, callsite(), info)
    repartitioned = self._jdf.repartition(num_partitions, transformation_ctx, site,
                                          long(stageThreshold), long(totalThreshold))
    return DynamicFrame(repartitioned, self.glue_ctx, self.name)
486
+
487
def coalesce(self, num_partitions, shuffle=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
    """
    Coalesce the frame through the JVM frame's coalesce call.

    :param num_partitions: partition count passed to the JVM call
    :param shuffle: shuffle flag passed to the JVM call
    :param transformation_ctx: context key for this transformation
    :param info: String, any string to be associated with errors in this transformation.
    :param stageThreshold: Long, error count in this transformation at which processing errors out.
    :param totalThreshold: Long, total error count up to and including this transformation
        at which processing errors out.
    :return: DynamicFrame with the same name as this one
    """
    site = _call_site(self._sc, callsite(), info)
    coalesced = self._jdf.coalesce(num_partitions, shuffle, transformation_ctx, site,
                                   long(stageThreshold), long(totalThreshold))
    return DynamicFrame(coalesced, self.glue_ctx, self.name)
492
+
493
def errorsAsDynamicFrame(self):
    """
    Returns a DynamicFrame which has the error records nested.

    :return: DynamicFrame with the same name as this one
    """
    error_jdf = self._jdf.errorsAsDynamicFrame()
    return DynamicFrame(error_jdf, self.glue_ctx, self.name)
499
+
500
def errorsCount(self):
    """
    Returns the total number of error records in this DynamicFrame.

    :return: Long
    """
    jdf = self._jdf
    return jdf.errorsCount()
506
+
507
def stageErrorsCount(self):
    """
    Returns the number of errors generated in the transformation that produced this DynamicFrame.

    :return: Long
    """
    jdf = self._jdf
    return jdf.stageErrorsCount()
513
+
514
def assertErrorThreshold(self):
    """
    Asserts on the errors in the transformations which yielded this DynamicFrame.

    :return: the result of the JVM frame's assertErrorThreshold call
    """
    jdf = self._jdf
    return jdf.assertErrorThreshold()
520
+
521
+
522
class DynamicFrameCollection(object):
    """A collection of DynamicFrames addressed by name."""

    def __init__(self, dynamic_frames, glue_ctx):
        """
        :param dynamic_frames: a list of dynamic frames (keyed by each frame's name)
            or a dictionary mapping names to dynamic frames
        :param glue_ctx: the GlueContext object
        :raises TypeError: if dynamic_frames is neither a list nor a dict
        """
        self._glue_ctx = glue_ctx
        if isinstance(dynamic_frames, dict):
            self._df_dict = dynamic_frames
        elif isinstance(dynamic_frames, list):
            self._df_dict = dict((frame.name, frame) for frame in dynamic_frames)
        else:
            raise TypeError("dynamic_frames must be list or dict.")

    def __getitem__(self, key):
        return self._df_dict[key]

    def __len__(self):
        return len(self._df_dict)

    def keys(self):
        """Names of the frames in the collection."""
        return self._df_dict.keys()

    def values(self):
        """Frames in the collection."""
        return self._df_dict.values()

    def select(self, key, transformation_ctx=""):
        """
        :param key: name of the dynamic frame to get
        :param transformation_ctx: accepted but not used by this method
        :return: the dynamic frame stored under key, or an empty dynamic frame
            named key when there is none
        """
        if key not in self._df_dict:
            ctx = self._glue_ctx
            empty_jdf = ctx._jvm.DynamicFrame.emptyDynamicFrame(ctx._glue_scala_context)
            return DynamicFrame(empty_jdf, ctx, key)
        return self.__getitem__(key)

    def map(self, callable, transformation_ctx=""):
        """
        :param callable: called with every DynamicFrame and a per-frame
            transformation context ("<transformation_ctx>:<key>")
        :return: a DynamicFrameCollection holding the results under the same keys
        :raises TypeError: if callable returns anything but a DynamicFrame
        """
        mapped = {}
        for key, frame in iteritems(self._df_dict):
            result = callable(frame, transformation_ctx + ':' + key)
            if isinstance(result, DynamicFrame):
                mapped[key] = result
                continue
            raise TypeError("callable must return a DynamicFrame. "
                            "Got {}".format(str(type(result))))

        return DynamicFrameCollection(mapped, self._glue_ctx)

    def flatmap(self, f, transformation_ctx=""):
        """
        :param f: A function that takes a DynamicFrame (and a per-frame
            transformation context) and returns a DynamicFrame or a
            DynamicFrameCollection.
        :return: A DynamicFrameCollection with all returned frames, keyed by name
        :raises TypeError: if f returns any other type
        """
        flattened = {}

        for frame in itervalues(self._df_dict):
            result = f(frame, transformation_ctx + ':' + frame.name)

            if isinstance(result, DynamicFrameCollection):
                flattened.update(result)
            elif isinstance(result, DynamicFrame):
                flattened[result.name] = result
            else:
                raise TypeError("Function argument to flatmap must return "
                                "DynamicFrame or DynamicFrameCollection."
                                " Got {}".format(str(type(result))))

        return DynamicFrameCollection(flattened, self._glue_ctx)
594
+
595
+
596
class DynamicFrameReader(object):
    """Creates DynamicFrames by delegating to the GlueContext it was built with."""

    def __init__(self, glue_context):
        self._glue_context = glue_context

    def from_rdd(self, data, name, schema=None, sampleRatio=None):
        """Creates a DynamicFrame from an RDD.
        """
        ctx = self._glue_context
        return ctx.create_dynamic_frame_from_rdd(data, name, schema, sampleRatio)

    def from_options(self, connection_type, connection_options={},
                     format=None, format_options={}, transformation_ctx="", push_down_predicate="", **kwargs):
        """Creates a DynamicFrame with the specified connection and format.
        """
        ctx = self._glue_context
        return ctx.create_dynamic_frame_from_options(connection_type, connection_options,
                                                     format, format_options,
                                                     transformation_ctx, push_down_predicate, **kwargs)

    def from_catalog(self, database=None, table_name=None, redshift_tmp_dir="", transformation_ctx="", push_down_predicate="", additional_options={}, catalog_id=None, **kwargs):
        """Creates a DynamicFrame with the specified catalog name space and table name.

        The database may be given either as database or as the keyword
        name_space, but not both. Remaining keyword arguments are forwarded.
        """
        has_name_space = "name_space" in kwargs
        if database is not None and has_name_space:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        if database is None and not has_name_space:
            raise Exception("Parameter name_space or database is missing.")
        db = kwargs.pop("name_space") if has_name_space else database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        ctx = self._glue_context
        return ctx.create_dynamic_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx,
                                                     push_down_predicate, additional_options, catalog_id,
                                                     **kwargs)
630
+
631
+
632
class DynamicFrameWriter(object):
    """Writes DynamicFrames by delegating to the GlueContext it was built with."""

    def __init__(self, glue_context):
        self._glue_context = glue_context

    def from_options(self, frame, connection_type, connection_options={},
                     format=None, format_options={}, transformation_ctx=""):
        """Writes a DynamicFrame using the specified connection and format.
        """
        ctx = self._glue_context
        return ctx.write_dynamic_frame_from_options(frame, connection_type, connection_options,
                                                    format, format_options, transformation_ctx)

    def from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="", transformation_ctx="", additional_options={}, catalog_id=None, **kwargs):
        """Writes a DynamicFrame using the specified catalog name space and table name.

        The database may be given either as database or as the keyword
        name_space, but not both. Other keyword arguments are accepted but
        not forwarded.
        """
        has_name_space = "name_space" in kwargs
        if database is not None and has_name_space:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        if database is None and not has_name_space:
            raise Exception("Parameter name_space or database is missing.")
        db = kwargs.pop("name_space") if has_name_space else database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        ctx = self._glue_context
        return ctx.write_dynamic_frame_from_catalog(frame, db, table_name, redshift_tmp_dir,
                                                    transformation_ctx, additional_options, catalog_id)

    def from_jdbc_conf(self, frame, catalog_connection, connection_options={}, redshift_tmp_dir="", transformation_ctx=""):
        """Writes a DynamicFrame using the specified JDBC connection information.
        """
        ctx = self._glue_context
        return ctx.write_dynamic_frame_from_jdbc_conf(frame, catalog_connection, connection_options,
                                                      redshift_tmp_dir, transformation_ctx)