AWSGlueDataplanePython 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsglue/README.md +37 -0
- awsglue/__init__.py +15 -0
- awsglue/context.py +690 -0
- awsglue/data_sink.py +49 -0
- awsglue/data_source.py +49 -0
- awsglue/dataframe_transforms/__init__.py +17 -0
- awsglue/dataframe_transforms/apply_mapping.py +76 -0
- awsglue/dataframereader.py +41 -0
- awsglue/dataframewriter.py +21 -0
- awsglue/devutils.py +236 -0
- awsglue/dynamicframe.py +669 -0
- awsglue/functions.py +31 -0
- awsglue/glue_shell.py +38 -0
- awsglue/gluetypes.py +461 -0
- awsglue/job.py +59 -0
- awsglue/scripts/__init__.py +12 -0
- awsglue/scripts/activate_etl_connector.py +362 -0
- awsglue/scripts/connector_activation_util.py +38 -0
- awsglue/scripts/crawler_redo_from_backup.py +75 -0
- awsglue/scripts/crawler_undo.py +121 -0
- awsglue/scripts/scripts_utils.py +106 -0
- awsglue/streaming_data_source.py +28 -0
- awsglue/transforms/__init__.py +47 -0
- awsglue/transforms/apply_mapping.py +72 -0
- awsglue/transforms/coalesce.py +66 -0
- awsglue/transforms/collection_transforms.py +155 -0
- awsglue/transforms/drop_nulls.py +85 -0
- awsglue/transforms/dynamicframe_filter.py +66 -0
- awsglue/transforms/dynamicframe_map.py +72 -0
- awsglue/transforms/errors_as_dynamicframe.py +45 -0
- awsglue/transforms/field_transforms.py +469 -0
- awsglue/transforms/relationalize.py +105 -0
- awsglue/transforms/repartition.py +61 -0
- awsglue/transforms/resolve_choice.py +85 -0
- awsglue/transforms/transform.py +92 -0
- awsglue/transforms/unbox.py +112 -0
- awsglue/transforms/union.py +66 -0
- awsglue/transforms/unnest_frame.py +75 -0
- awsglue/utils.py +159 -0
- awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
- awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
- awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
- awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
awsglue/dynamicframe.py
ADDED
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from awsglue.utils import makeOptions, callsite
|
|
17
|
+
from awsglue.gluetypes import _deserialize_json_string, _create_dynamic_record, _revert_to_dict, _serialize_schema
|
|
18
|
+
from awsglue.utils import _call_site, _as_java_list, _as_scala_option, _as_resolve_choiceOption, iteritems, itervalues
|
|
19
|
+
from pyspark.rdd import RDD, PipelinedRDD
|
|
20
|
+
from pyspark.sql.dataframe import DataFrame
|
|
21
|
+
from pyspark.serializers import PickleSerializer, BatchedSerializer
|
|
22
|
+
|
|
23
|
+
# Python 2 spellings kept as aliases of their Python 3 equivalents so the
# rest of this module can keep using the old names.
long = int
unicode = str
basestring = str
imap = map
ifilter = filter
|
|
27
|
+
|
|
28
|
+
class ResolveOption(object):
    """
    Describes how one ChoiceType is resolved while a DynamicRecord is converted to a DataFrame.
    The action is one of "Project", "KeepAsStruct" and "Cast".
    """
    def __init__(self, path, action, target=None):
        """
        :param path: string, path name to ChoiceType
        :param action: string, name of the resolve action
        :param target: spark sql Datatype, defaults to None
        """
        self.path, self.action, self.target = path, action, target
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class DynamicFrame(object):
    """
    Python handle around a Java DynamicFrame object.

    Most methods forward to the wrapped Java object (``self._jdf``) and wrap
    the Java frame(s) it returns in new DynamicFrame / DynamicFrameCollection
    instances.
    """

    def __init__(self, jdf, glue_ctx, name=""):
        """
        :param jdf: the Java DynamicFrame object to wrap
        :param glue_ctx: the GlueContext object
        :param name: name of this DynamicFrame
        """
        self._jdf = jdf
        self.glue_ctx = glue_ctx
        # NOTE(review): glue_ctx is dereferenced unconditionally here but
        # guarded against being falsy on the next line -- confirm whether a
        # None glue_ctx is meant to be supported.
        self._ssql_ctx = glue_ctx._ssql_ctx
        self._sc = glue_ctx and glue_ctx._sc
        self._schema = None
        self._lazy_rdd = None
        self.name = name

    @property
    def _rdd(self):
        """Python RDD over this frame, created on first access and then cached."""
        if self._lazy_rdd is None:
            jrdd = self._jdf.javaToPython()
            self._lazy_rdd = RDD(jrdd, self._sc, BatchedSerializer(PickleSerializer()))
        return self._lazy_rdd

    def with_frame_schema(self, schema):
        """ Specify schema so we don't have to compute it """
        return DynamicFrame(self._jdf.pyWithFrameSchema(_serialize_schema(schema)), self.glue_ctx, self.name)

    def schema(self):
        """
        Return the schema of this frame, deserialized from the Java schema string.
        The result is cached after the first successful call.

        :raises Exception: if deserializing the schema raises AttributeError
        """
        if self._schema is None:
            try:
                self._schema = _deserialize_json_string(self._jdf.schema().toString())
            except AttributeError as e:
                raise Exception("Unable to parse datatype from schema. %s" % e)
        return self._schema

    def show(self, num_rows=20):
        """Print the string the Java frame renders for ``num_rows`` rows."""
        print(self._jdf.showString(num_rows))

    def filter(self, f, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Keep the records for which ``f`` returns a truthy value.

        :param f: function called with a DynamicRecord built from each record
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        def wrap_dict_with_dynamic_records(x):
            rec = _create_dynamic_record(x["record"])
            try:
                return f(rec)
            except (KeyError, ValueError, TypeError):
                # These three error types simply drop the record.
                return False
            except Exception as E:
                # Anything else keeps the record, flagged as an error record.
                x['isError'] = True
                x['errorMessage'] = str(E)
                return True

        def func(iterator):
            return ifilter(wrap_dict_with_dynamic_records, iterator)
        return self.mapPartitions(func, True, transformation_ctx, info, stageThreshold, totalThreshold)

    def mapPartitions(self, f, preservesPartitioning=True, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Apply ``f`` to the iterator of each partition.

        Delegates to mapPartitionsWithIndex and discards the partition index.
        """
        def func(s, iterator):
            return f(iterator)
        return self.mapPartitionsWithIndex(func, preservesPartitioning, transformation_ctx, info, stageThreshold, totalThreshold)

    def map(self, f, preservesPartitioning=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Apply ``f`` to every record and replace the record with the result.

        A record whose result is falsy, or for which ``f`` raises, is kept
        and flagged with ``isError`` / ``errorMessage`` instead.

        :param f: function called with a DynamicRecord built from each record
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        def wrap_dict_with_dynamic_records(x):
            rec = _create_dynamic_record(x["record"])
            try:
                result_record = _revert_to_dict(f(rec))
                if result_record:
                    x["record"] = result_record
                else:
                    x['isError'] = True
                    x['errorMessage'] = "User-specified function returned None instead of DynamicRecord"
                return x
            except Exception as E:
                x['isError'] = True
                x['errorMessage'] = str(E)
                return x

        def func(_, iterator):
            return imap(wrap_dict_with_dynamic_records, iterator)
        return self.mapPartitionsWithIndex(func, preservesPartitioning, transformation_ctx, info, stageThreshold, totalThreshold)

    def mapPartitionsWithIndex(self, f, preservesPartitioning=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Apply ``f(partition_index, iterator)`` to each partition.

        The function is pipelined onto this frame's Python RDD and the
        resulting Java RDD is turned back into a DynamicFrame on the Java side.
        """
        return DynamicFrame(self.glue_ctx._jvm.DynamicFrame.fromPythonRDD(self._jdf,
            PipelinedRDD(self._rdd, f, preservesPartitioning)._jrdd, self.glue_ctx._ssql_ctx, transformation_ctx, self.name,
            _call_site(self._sc, callsite(), info), long(stageThreshold),
            long(totalThreshold)), self.glue_ctx, self.name)

    def printSchema(self):
        """Print the tree rendering of the Java schema."""
        print(self._jdf.schema().treeString())

    def toDF(self, options=None):
        """
        Please specify also target type if you choose Project and Cast action type.

        :param options: Must be list of options
        :raises Exception: if an option other than "KeepAsStruct" has no target type

        >>>toDF([ResolveOption("a.b.c", "KeepAsStruct")])
        >>>toDF([ResolveOption("a.b.c", "Project", DoubleType())])
        """
        if options is None:
            options = []
        scala_options = []

        for option in options:
            if option.action != "KeepAsStruct" and option.target is None:
                raise Exception("Missing target type for resolve action %s." % option.action)

            scala_options.append(self.glue_ctx.convert_resolve_option(option.path, option.action, option.target))

        return DataFrame(self._jdf.toDF(self.glue_ctx._jvm.PythonUtils.toSeq(scala_options)), self.glue_ctx)

    @classmethod
    def fromDF(cls, dataframe, glue_ctx, name=""):
        """
        Convert a DataFrame to a DynamicFrame
        :param dataframe: A spark sql DataFrame
        :param glue_ctx: the GlueContext object
        :param name: name of the result DynamicFrame
        :return: DynamicFrame
        """
        return DynamicFrame(glue_ctx._jvm.DynamicFrame.apply(dataframe._jdf, glue_ctx._ssql_ctx),
                            glue_ctx, name)

    def unbox(self, path, format, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, **options):
        """
        unbox a string field

        :param path: full path to the StringNode you want to unbox
        :param format: "avro" or "json"
            NOTE(review): the example below passes "csv" -- confirm the full list of supported formats.
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :param options: passed to the Java side as a JSON string
            separator: String,
            escaper: String,
            skipFirst: Boolean,
            withSchema: String, schema string should always be called by using StructType.json()
            withHeader: Boolean
        :return: a new DynamicFrame with unboxed DynamicRecords

        >>>unbox("a.b.c", "csv", separator="|")
        """
        return DynamicFrame(self._jdf.unbox(path, format, json.dumps(options), transformation_ctx,
                                            _call_site(self._sc, callsite(), info), long(stageThreshold),
                                            long(totalThreshold)),
                            self.glue_ctx, self.name)

    def drop_fields(self, paths, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        :param paths: List of strings, each the full path to a node you want to drop
            (a single string is treated as a one-element list)
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        if isinstance(paths, basestring):
            paths = [paths]

        return DynamicFrame(self._jdf.dropFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
                                                 _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)),
                            self.glue_ctx, self.name)

    def select_fields(self, paths, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        :param paths: List of strings, each the full path to a node you want to get
            (a single string is treated as a one-element list)
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        if isinstance(paths, basestring):
            paths = [paths]

        return DynamicFrame(self._jdf.selectFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
                                                   _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)),
                            self.glue_ctx, self.name)

    def split_fields(self, paths, name1, name2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        :param paths: List of strings, each the full path to a node you want to split into a new DynamicFrame
            (a single string is treated as a one-element list)
        :param name1: name for the dynamic frame to be split off
        :param name2: name for the dynamic frame remains on original
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrameCollection with two DynamicFrames, the first containing all the nodes that you have split off,
          the second containing the nodes remaining on the original.
        """
        if isinstance(paths, basestring):
            paths = [paths]

        jdfs = _as_java_list(self._sc, self._jdf.splitFields(self.glue_ctx._jvm.PythonUtils.toSeq(paths), transformation_ctx,
                                                             _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold)))
        return DynamicFrameCollection({name1: DynamicFrame(jdfs[0], self.glue_ctx, name1),
                                       name2: DynamicFrame(jdfs[1], self.glue_ctx, name2)}, self.glue_ctx)

    def split_rows(self, comparison_dict, name1, name2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        :param comparison_dict: a dictionary where the key is the path to a column, the value is another
          dictionary mapping comparators to the value to which the column will be compared.
          e.g. {"age": {">": 10, "<": 20}} will give back rows where age between 10 and 20 exclusive split from those
          that do not meet this criteria.
        :param name1: name for the dynamic frame to be split off
        :param name2: name for the dynamic frame remains on original
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrameCollection with two DynamicFrames, the first containing all the nodes that you have split off,
          the second containing the nodes remaining on the original.
        """
        # Three parallel lists: entry i is (path, operator, value) of one comparison.
        paths, values, operators = [], [], []

        for key, value in comparison_dict.items():
            paths.extend([key] * len(value))
            for k, v in value.items():
                operators.append(k)
                if isinstance(v, int):
                    values.append(long(v))
                else:
                    values.append(v)

        jdfs = _as_java_list(self._sc, self._jdf.splitRows(self.glue_ctx._jvm.PythonUtils.toSeq(paths),
                                                           self.glue_ctx._jvm.PythonUtils.toSeq(values),
                                                           self.glue_ctx._jvm.PythonUtils.toSeq(operators),
                                                           transformation_ctx, _call_site(self._sc, callsite(), info),
                                                           long(stageThreshold), long(totalThreshold)))
        return DynamicFrameCollection({name1: DynamicFrame(jdfs[0], self.glue_ctx, name1),
                                       name2: DynamicFrame(jdfs[1], self.glue_ctx, name2)}, self.glue_ctx)

    def rename_field(self, oldName, newName, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        :param oldName: String, full path to the node you want to rename
        :param newName: String, new name including full path
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        return DynamicFrame(self._jdf.renameField(oldName, newName, transformation_ctx, _call_site(self._sc, callsite(), info),
                                                  long(stageThreshold), long(totalThreshold)), self.glue_ctx, self.name)

    def write(self, connection_type, connection_options=None,
              format=None, format_options=None, accumulator_size=0):
        """
        Write this frame through ``glue_ctx.write_from_options``.

        :param connection_type: forwarded as ``connection_type``
        :param connection_options: dict forwarded as ``connection_options``; None means an empty dict
        :param format: forwarded as ``format``
        :param format_options: dict forwarded as ``format_options``; None means an empty dict
        :param accumulator_size: forwarded as ``accumulator_size``
        :return: whatever ``write_from_options`` returns
        """
        # Fresh dicts per call instead of shared mutable default arguments.
        if connection_options is None:
            connection_options = {}
        if format_options is None:
            format_options = {}
        return self.glue_ctx.write_from_options(frame_or_dfc=self,
                                                connection_type=connection_type,
                                                connection_options=connection_options,
                                                format=format,
                                                format_options=format_options,
                                                accumulator_size=accumulator_size)

    def count(self):
        """Return the count reported by the Java frame."""
        return self._jdf.count()

    def spigot(self, path, options=None, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Forward to the Java frame's ``spigot`` and wrap the returned frame.

        :param path: forwarded to the Java ``spigot`` call
        :param options: dict of options converted with makeOptions; None means an empty dict
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        if options is None:
            options = {}
        return DynamicFrame(self._jdf.spigot(path, makeOptions(self._sc, options), transformation_ctx,
                                             _call_site(self._sc, callsite(), info), long(stageThreshold),
                                             long(totalThreshold)),
                            self.glue_ctx, self.name)

    def join(self, paths1, paths2, frame2, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Join this frame with ``frame2``.

        :param paths1: key path(s) in this frame (a single string is treated as a one-element list)
        :param paths2: key path(s) in ``frame2`` (a single string is treated as a one-element list)
        :param frame2: the DynamicFrame to join with
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame named with the concatenation of both frame names
        """
        if isinstance(paths1, basestring):
            paths1 = [paths1]
        if isinstance(paths2, basestring):
            paths2 = [paths2]

        return DynamicFrame(self._jdf.join(self.glue_ctx._jvm.PythonUtils.toSeq(paths1),
                                           self.glue_ctx._jvm.PythonUtils.toSeq(paths2),
                                           frame2._jdf, transformation_ctx,
                                           _call_site(self._sc, callsite(), info),
                                           long(stageThreshold), long(totalThreshold)),
                            self.glue_ctx, self.name + frame2.name)

    def unnest(self, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        unnest a dynamic frame. i.e. flattens nested objects to top level elements.
        It also generates joinkeys for array objects
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: a new unnested dynamic frame

        >>>unnest()
        """
        return DynamicFrame(self._jdf.unnest(transformation_ctx, _call_site(self._sc, callsite(), info),
                                             long(stageThreshold), long(totalThreshold)),
                            self.glue_ctx, self.name)

    def relationalize(self, root_table_name, staging_path, options=None, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Relationalizes a dynamic frame. i.e. produces a list of frames that are
        generated by unnesting nested columns and pivoting array columns. The
        pivoted array column can be joined to the root table using the joinkey
        generated in unnest phase
        :param root_table_name: name for the root table
        :param staging_path: path to store partitions of pivoted tables in csv format. Pivoted tables are read back from
            this path
        :param options: dict of optional parameters for relationalize; None means an empty dict
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrameCollection
        """
        if options is None:
            options = {}
        _rFrames = _as_java_list(self._sc, self._jdf.relationalize(root_table_name, staging_path,
                                                                   makeOptions(self._sc, options),
                                                                   transformation_ctx, _call_site(self._sc, callsite(), info),
                                                                   long(stageThreshold), long(totalThreshold)))
        return DynamicFrameCollection(dict((df.getName(), DynamicFrame(df, self.glue_ctx, df.getName())) for df in _rFrames), self.glue_ctx)

    def applyMapping(self, *args, **kwargs):
        """camelCase alias of apply_mapping."""
        # In a previous version we passed args[1:] and in our tests we passed
        # the DynamicFrame as the first argument. This checks for that case
        # to avoid regressions.
        if len(args) > 0 and isinstance(args[0], DynamicFrame):
            return self.apply_mapping(*(args[1:]), **kwargs)
        else:
            return self.apply_mapping(*args, **kwargs)

    def apply_mapping(self, mappings, case_sensitive=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """
        Apply a list of field mappings to this frame.

        :param mappings: a mapping tuple of length 2, 3 or 4, or a list of such tuples
        :param case_sensitive: forwarded to the Java ``applyMapping`` call
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        :raises TypeError: if a mapping is not a tuple
        :raises ValueError: if a mapping tuple is not of length 2, 3 or 4
        """
        def _to_java_mapping(mapping_tup):
            if not isinstance(mapping_tup, tuple):
                # str() is required: concatenating the raw object would itself
                # fail for anything that is not already a string.
                raise TypeError("Mapping must be specified as a tuple. Got " +
                                str(mapping_tup))

            if len(mapping_tup) not in (2, 3, 4):
                raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
                                 "Got tuple of length " + str(len(mapping_tup)))

            tup2 = self.glue_ctx._jvm.scala.Tuple2
            tup3 = self.glue_ctx._jvm.scala.Tuple3
            tup4 = self.glue_ctx._jvm.scala.Tuple4
            java_cls = self.glue_ctx._jvm.MappingSpec

            if len(mapping_tup) == 2:
                return java_cls.apply(tup2.apply(mapping_tup[0], mapping_tup[1]))
            elif len(mapping_tup) == 3:
                return java_cls.apply(tup3.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2]))
            else:
                return java_cls.apply(tup4.apply(mapping_tup[0], mapping_tup[1], mapping_tup[2], mapping_tup[3]))

        if isinstance(mappings, tuple):
            mappings = [mappings]

        mappings_list = [_to_java_mapping(m) for m in mappings]

        new_jdf = self._jdf.applyMapping(
            self.glue_ctx._jvm.PythonUtils.toSeq(mappings_list),
            case_sensitive,
            transformation_ctx,
            _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))

        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def unnest_ddb_json(self, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """Forward to the Java frame's ``unnestDDBJson`` and wrap the returned frame."""
        new_jdf = self._jdf.unnestDDBJson(transformation_ctx, _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold))
        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def simplify_ddb_json(self):
        """Forward to the Java frame's ``simplifyDDBJson`` and wrap the returned frame."""
        new_jdf = self._jdf.simplifyDDBJson()
        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def resolveChoice(self, specs=None, choice="", database=None, table_name=None,
                      transformation_ctx="", info="", stageThreshold=0, totalThreshold=0, catalog_id=None):
        """
        :param specs: specification for choice type and corresponding resolve action,
                      if the specs is empty, then tape backend would go one round of the data
                      to get schema, and then based on the schema to resolve choice.
                      Given as a (path, action) tuple or a list of such tuples.
        :param choice: default option when choice type path found missing from specs
        :param database: Glue catalog database name, required for MATCH_CATALOG choice
        :param table_name: Glue catalog table name, required for MATCH_CATALOG choice
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :param catalog_id: forwarded to the Java call as an optional value
        :return: a new DynamicFrame
        :raises Exception: if specs and choice are both missing or both given
        """
        def _to_java_specs(specs_tup):
            path, action = specs_tup
            return self.glue_ctx._jvm.ResolveSpec.apply(path, action)

        if specs is None and not choice:
            raise Exception("Parameter specs and choice are both missing, add one.")

        if specs is not None and choice:
            raise Exception("Parameter specs and choice are both specified, choose one.")

        if specs is None:
            specs = []

        if isinstance(specs, tuple):
            specs = [specs]

        specs_list = [_to_java_specs(m) for m in specs]

        choice_option = _as_scala_option(self._sc, _as_resolve_choiceOption(self._sc, choice))
        database_option = _as_scala_option(self._sc, database)
        table_name_option = _as_scala_option(self._sc, table_name)

        new_jdf = self._jdf.resolveChoice(
            self.glue_ctx._jvm.PythonUtils.toSeq(specs_list),
            choice_option, database_option, table_name_option,
            transformation_ctx,
            _call_site(self._sc, callsite(), info), long(stageThreshold), long(totalThreshold),
            _as_scala_option(self._sc, catalog_id))

        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def mergeDynamicFrame(self, stage_dynamic_frame, primary_keys, transformation_ctx="", options=None, info="", stageThreshold=0, totalThreshold=0):
        """
        Merge this DynamicFrame with a staging DynamicFrame based on the provided primary keys to identify records.
        Duplicate records (records with same primary keys) are not de-duplicated. All records (including duplicates) are
        retained from the source, if there is no matching record in staging frame. If staging frame has matching records
        then the records from the staging frame overwrites the records in the source.
        :param stage_dynamic_frame: Staging DynamicFrame
        :param primary_keys: List of primary key fields to match records from source and staging dynamic frame
            (a single string is treated as a one-element list)
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param options: optional options for the transformation; None means an empty dict
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        if options is None:
            options = {}
        if isinstance(primary_keys, basestring):
            primary_keys = [primary_keys]
        return DynamicFrame(self._jdf.mergeDynamicFrames(stage_dynamic_frame._jdf,
                                                         self.glue_ctx._jvm.PythonUtils.toSeq(primary_keys),
                                                         transformation_ctx,
                                                         makeOptions(self._sc, options),
                                                         _call_site(self._sc, callsite(), info),
                                                         long(stageThreshold),
                                                         long(totalThreshold)),
                            self.glue_ctx, self.name)

    def union(self, other_frame, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """Returns a DynamicFrame containing all records in this frame and all records in other_frame.
        :param other_frame: DynamicFrame to union with this one.
        :param transformation_ctx: context key to retrieve metadata about the current transformation
        :param info: String, any string to be associated with errors in this transformation.
        :param stageThreshold: Long, number of errors in the given transformation for which the processing needs to error out.
        :param totalThreshold: Long, total number of errors upto and including in this transformation
          for which the processing needs to error out.
        :return: DynamicFrame
        """
        union_jdf = self._jdf.union(other_frame._jdf, transformation_ctx, _call_site(self._sc, callsite(), info),
                                    long(stageThreshold), long(totalThreshold))
        # union_jdf is a Java object, so read its name through the same
        # getName() accessor relationalize uses rather than attribute access.
        return DynamicFrame(union_jdf, self.glue_ctx, union_jdf.getName())

    def getNumPartitions(self):
        """Returns the number of partitions in the current DynamicFrame."""
        return self._jdf.getNumPartitions()

    def repartition(self, num_partitions, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """Forward to the Java frame's ``repartition`` and wrap the returned frame."""
        new_jdf = self._jdf.repartition(num_partitions, transformation_ctx,
                                        _call_site(self._sc, callsite(), info),
                                        long(stageThreshold), long(totalThreshold))
        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def coalesce(self, num_partitions, shuffle=False, transformation_ctx="", info="", stageThreshold=0, totalThreshold=0):
        """Forward to the Java frame's ``coalesce`` and wrap the returned frame."""
        new_jdf = self._jdf.coalesce(num_partitions, shuffle, transformation_ctx,
                                     _call_site(self._sc, callsite(), info),
                                     long(stageThreshold), long(totalThreshold))
        return DynamicFrame(new_jdf, self.glue_ctx, self.name)

    def errorsAsDynamicFrame(self):
        """
        Returns a DynamicFrame which has error records nested.
        :return: DynamicFrame
        """
        return DynamicFrame(self._jdf.errorsAsDynamicFrame(), self.glue_ctx, self.name)

    def errorsCount(self):
        """
        Returns the total error records in a DynamicFrames
        :return: Long
        """
        return self._jdf.errorsCount()

    def stageErrorsCount(self):
        """
        Returns the error generated in the transformation to this DynamicFrame
        :return: Long
        """
        return self._jdf.stageErrorsCount()

    def assertErrorThreshold(self):
        """
        Asserts for the errors in the transformations which yielded this DynamicFrame
        :return: Exception
        """
        return self._jdf.assertErrorThreshold()
|
|
520
|
+
|
|
521
|
+
|
|
522
|
+
class DynamicFrameCollection(object):
    """
    A keyed collection of DynamicFrames backed by a dictionary.

    Supports ``len()``, subscripting, membership tests and iteration over
    keys, plus ``keys()`` / ``values()`` views of the backing dictionary.
    """

    def __init__(self, dynamic_frames, glue_ctx):
        """
        :param dynamic_frames: either a list of DynamicFrames, which are keyed
            by their ``name`` attribute, or a dict of key -> DynamicFrame,
            which is stored as-is (not copied)
        :param glue_ctx: the glue context object handed to every DynamicFrame
            or DynamicFrameCollection created from this collection
        :raises TypeError: if dynamic_frames is neither a list nor a dict
        """
        self._glue_ctx = glue_ctx
        if isinstance(dynamic_frames, list):
            # If two frames share a name, the later one in the list wins.
            self._df_dict = { df.name: df for df in dynamic_frames }
        elif isinstance(dynamic_frames, dict):
            self._df_dict = dynamic_frames
        else:
            raise TypeError("dynamic_frames must be list or dict.")

    def __getitem__(self, key):
        """Return the frame stored under key; raises KeyError if absent."""
        return self._df_dict[key]

    def __len__(self):
        """Return the number of frames in the collection."""
        return len(self._df_dict)

    def __iter__(self):
        """
        Iterate over the keys of the collection.

        Defined explicitly because, with only __getitem__ available, Python
        falls back to indexing with 0, 1, 2, ... and the dict lookup raises
        KeyError instead of iterating.
        """
        return iter(self._df_dict)

    def __contains__(self, key):
        """
        Return True if a frame is stored under key.

        Defined explicitly for the same reason as __iter__: without it,
        ``key in collection`` would raise KeyError.
        """
        return key in self._df_dict

    def keys(self):
        """Return the keys of the backing dictionary."""
        return self._df_dict.keys()

    def values(self):
        """Return the DynamicFrames held in the backing dictionary."""
        return self._df_dict.values()

    def select(self, key, transformation_ctx = ""):
        """
        :param key: get dynamic frame of key
        :param transformation_ctx: accepted but not used by this method
        :return: the DynamicFrame stored under key, or a new empty
            DynamicFrame named key when the key is not present
        """
        if key in self._df_dict:
            return self.__getitem__(key)
        else:
            return DynamicFrame(self._glue_ctx._jvm.DynamicFrame.emptyDynamicFrame(self._glue_ctx._glue_scala_context), self._glue_ctx, key)

    def map(self, callable, transformation_ctx = ""):
        """
        :param callable: called as ``callable(frame, transformation_ctx + ':' + key)``
            for every DynamicFrame in the collection; must return a DynamicFrame
        :param transformation_ctx: prefix for the per-frame context string
        :return: a DynamicFrameCollection holding the results under the same keys
        :raises TypeError: if callable returns anything but a DynamicFrame
        """
        new_dict = {}
        for k, v in self._df_dict.items():
            res = callable(v, transformation_ctx+':'+k)
            if not isinstance(res, DynamicFrame):
                raise TypeError("callable must return a DynamicFrame. "\
                                "Got {}".format(str(type(res))))
            new_dict[k] = res

        return DynamicFrameCollection(new_dict, self._glue_ctx)

    def flatmap(self, f, transformation_ctx = ""):
        """
        :param f: A function that takes a DynamicFrame and returns a
            DynamicFrame or a DynamicFrameCollection. It is called as
            ``f(frame, transformation_ctx + ':' + frame.name)``.
        :param transformation_ctx: prefix for the per-frame context string
        :return: A DynamicFrameCollection. A returned DynamicFrame is stored
            under its own ``name``; a returned collection is merged in, so
            later results overwrite earlier ones with the same key.
        :raises TypeError: if f returns any other type
        """
        new_dict = {}

        for frame in self._df_dict.values():
            res = f(frame, transformation_ctx+':'+frame.name)

            if isinstance(res, DynamicFrame):
                new_dict[res.name] = res
            elif isinstance(res, DynamicFrameCollection):
                # dict.update accepts any object exposing keys() and
                # __getitem__, which DynamicFrameCollection does.
                new_dict.update(res)
            else:
                raise TypeError("Function argument to flatmap must return "\
                                "DynamicFrame or DynamicFrameCollection."\
                                " Got {}".format(str(type(res))))

        return DynamicFrameCollection(new_dict, self._glue_ctx)
|
|
594
|
+
|
|
595
|
+
|
|
596
|
+
class DynamicFrameReader(object):
    """
    Reader facade: every method delegates to the matching
    ``create_dynamic_frame_from_*`` method of the glue context supplied at
    construction time.
    """

    def __init__(self, glue_context):
        """
        :param glue_context: object providing the create_dynamic_frame_from_*
            methods that this reader delegates to
        """
        self._glue_context = glue_context

    def from_rdd(self, data, name, schema=None, sampleRatio=None):
        """Creates a DynamicFrame from an RDD.

        All arguments are forwarded unchanged to
        ``create_dynamic_frame_from_rdd``.
        """
        return self._glue_context.create_dynamic_frame_from_rdd(data, name, schema, sampleRatio)

    def from_options(self, connection_type, connection_options=None,
                     format=None, format_options=None, transformation_ctx="", push_down_predicate = "", **kwargs):
        """Creates a DynamicFrame with the specified connection and format.

        :param connection_options: dict of connection options; None (the
            default) is forwarded as a new empty dict
        :param format_options: dict of format options; None (the default)
            is forwarded as a new empty dict
        :param kwargs: forwarded unchanged to the glue context

        The remaining arguments are forwarded unchanged to
        ``create_dynamic_frame_from_options``.
        """
        # A fresh dict per call: a `{}` default in the signature would be one
        # shared object handed to downstream code on every call.
        if connection_options is None:
            connection_options = {}
        if format_options is None:
            format_options = {}
        return self._glue_context.create_dynamic_frame_from_options(connection_type,
                                                                    connection_options,
                                                                    format,
                                                                    format_options, transformation_ctx, push_down_predicate, **kwargs)

    def from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", push_down_predicate = "", additional_options = None, catalog_id = None, **kwargs):
        """Creates a DynamicFrame with the specified catalog name space and table name.

        :param database: catalog database name; mutually exclusive with the
            ``name_space`` keyword argument
        :param table_name: catalog table name (required)
        :param additional_options: dict of extra options; None (the default)
            is forwarded as a new empty dict
        :param kwargs: may carry ``name_space`` as an alternative spelling of
            database; it is removed before the remaining keyword arguments
            are forwarded to the glue context
        :raises Exception: if both or neither of database / name_space are
            given, or if table_name is missing
        """
        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        # Same reasoning as from_options: never share one default dict.
        if additional_options is None:
            additional_options = {}

        return self._glue_context.create_dynamic_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
class DynamicFrameWriter(object):
    """
    Writer facade: every method delegates to the matching
    ``write_dynamic_frame_from_*`` method of the glue context supplied at
    construction time and returns that method's result.
    """

    def __init__(self, glue_context):
        """
        :param glue_context: object providing the write_dynamic_frame_from_*
            methods that this writer delegates to
        """
        self._glue_context = glue_context

    def from_options(self, frame, connection_type, connection_options=None,
                     format=None, format_options=None, transformation_ctx=""):
        """Writes a DynamicFrame using the specified connection and format.

        :param frame: the DynamicFrame to write
        :param connection_options: dict of connection options; None (the
            default) is forwarded as a new empty dict
        :param format_options: dict of format options; None (the default)
            is forwarded as a new empty dict

        The remaining arguments are forwarded unchanged to
        ``write_dynamic_frame_from_options``.
        """
        # A fresh dict per call: a `{}` default in the signature would be one
        # shared object handed to downstream code on every call.
        if connection_options is None:
            connection_options = {}
        if format_options is None:
            format_options = {}
        return self._glue_context.write_dynamic_frame_from_options(frame,
                                                                   connection_type,
                                                                   connection_options,
                                                                   format,
                                                                   format_options, transformation_ctx)

    def from_catalog(self, frame, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", additional_options = None, catalog_id = None, **kwargs):
        """Writes a DynamicFrame using the specified catalog name space and table name.

        :param frame: the DynamicFrame to write
        :param database: catalog database name; mutually exclusive with the
            ``name_space`` keyword argument
        :param table_name: catalog table name (required)
        :param additional_options: dict of extra options; None (the default)
            is forwarded as a new empty dict
        :param kwargs: only ``name_space`` is read (as an alternative spelling
            of database); any other keyword arguments are accepted but not
            forwarded to the glue context
        :raises Exception: if both or neither of database / name_space are
            given, or if table_name is missing
        """
        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        # Same reasoning as from_options: never share one default dict.
        if additional_options is None:
            additional_options = {}

        return self._glue_context.write_dynamic_frame_from_catalog(frame, db, table_name, redshift_tmp_dir, transformation_ctx, additional_options, catalog_id)

    def from_jdbc_conf(self, frame, catalog_connection, connection_options=None, redshift_tmp_dir = "", transformation_ctx=""):
        """Writes a DynamicFrame using the specified JDBC connection information.

        :param frame: the DynamicFrame to write
        :param connection_options: dict of connection options; None (the
            default) is forwarded as a new empty dict

        The remaining arguments are forwarded unchanged to
        ``write_dynamic_frame_from_jdbc_conf``.
        """
        if connection_options is None:
            connection_options = {}
        return self._glue_context.write_dynamic_frame_from_jdbc_conf(frame,
                                                                     catalog_connection,
                                                                     connection_options,
                                                                     redshift_tmp_dir, transformation_ctx)
|