AWSGlueDataplanePython 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsglue/README.md +37 -0
- awsglue/__init__.py +15 -0
- awsglue/context.py +690 -0
- awsglue/data_sink.py +49 -0
- awsglue/data_source.py +49 -0
- awsglue/dataframe_transforms/__init__.py +17 -0
- awsglue/dataframe_transforms/apply_mapping.py +76 -0
- awsglue/dataframereader.py +41 -0
- awsglue/dataframewriter.py +21 -0
- awsglue/devutils.py +236 -0
- awsglue/dynamicframe.py +669 -0
- awsglue/functions.py +31 -0
- awsglue/glue_shell.py +38 -0
- awsglue/gluetypes.py +461 -0
- awsglue/job.py +59 -0
- awsglue/scripts/__init__.py +12 -0
- awsglue/scripts/activate_etl_connector.py +362 -0
- awsglue/scripts/connector_activation_util.py +38 -0
- awsglue/scripts/crawler_redo_from_backup.py +75 -0
- awsglue/scripts/crawler_undo.py +121 -0
- awsglue/scripts/scripts_utils.py +106 -0
- awsglue/streaming_data_source.py +28 -0
- awsglue/transforms/__init__.py +47 -0
- awsglue/transforms/apply_mapping.py +72 -0
- awsglue/transforms/coalesce.py +66 -0
- awsglue/transforms/collection_transforms.py +155 -0
- awsglue/transforms/drop_nulls.py +85 -0
- awsglue/transforms/dynamicframe_filter.py +66 -0
- awsglue/transforms/dynamicframe_map.py +72 -0
- awsglue/transforms/errors_as_dynamicframe.py +45 -0
- awsglue/transforms/field_transforms.py +469 -0
- awsglue/transforms/relationalize.py +105 -0
- awsglue/transforms/repartition.py +61 -0
- awsglue/transforms/resolve_choice.py +85 -0
- awsglue/transforms/transform.py +92 -0
- awsglue/transforms/unbox.py +112 -0
- awsglue/transforms/union.py +66 -0
- awsglue/transforms/unnest_frame.py +75 -0
- awsglue/utils.py +159 -0
- awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
- awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
- awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
- awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
awsglue/functions.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from pyspark import SparkContext
|
|
14
|
+
from pyspark.sql.column import Column, _to_java_column, _to_seq
|
|
15
|
+
|
|
16
|
+
def replaceArrayElement(srcCol, replaceCol, idx):
    """Wrap the JVM ``gluefunctions.replaceArrayElement`` call in a Column.

    ``srcCol`` and ``replaceCol`` are converted to Java columns first;
    ``idx`` is forwarded to the JVM function unchanged.
    """
    java_source = _to_java_column(srcCol)
    java_replacement = _to_java_column(replaceCol)
    glue_fns = SparkContext._active_spark_context._jvm.gluefunctions
    return Column(glue_fns.replaceArrayElement(java_source, java_replacement, idx))
|
|
20
|
+
|
|
21
|
+
def namedStruct(*cols):
    """Wrap the JVM ``gluefunctions.namedStruct`` call in a Column.

    Columns may be given as separate positional arguments or as a single
    list/set argument, which is unpacked before conversion.
    """
    active_context = SparkContext._active_spark_context
    single_collection = len(cols) == 1 and isinstance(cols[0], (list, set))
    columns = cols[0] if single_collection else cols
    glue_fns = active_context._jvm.gluefunctions
    return Column(
        glue_fns.namedStruct(_to_seq(active_context, columns, _to_java_column)))
|
|
27
|
+
|
|
28
|
+
def explodeWithIndex(col):
    """Wrap the JVM ``gluefunctions.explodeWithIndex`` call in a Column.

    The resulting Column is aliased to the two names 'index' and 'val'.
    """
    glue_fns = SparkContext._active_spark_context._jvm.gluefunctions
    exploded = Column(glue_fns.explodeWithIndex(_to_java_column(col)))
    return exploded.alias('index', 'val')
|
awsglue/glue_shell.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function
|
|
14
|
+
import platform
|
|
15
|
+
import pyspark
|
|
16
|
+
from pyspark.context import SparkContext
|
|
17
|
+
from pyspark.sql import SQLContext
|
|
18
|
+
from awsglue.context import GlueContext
|
|
19
|
+
|
|
20
|
+
# Module-level side effect: running this script creates a SparkContext and
# wraps it in a GlueContext, then prints a start-up banner.
sc = SparkContext()
# Change to GlueContext
# TODO: Figure out if/how to use HiveContext
glueContext = GlueContext(sc)

# Banner text printed once at start-up.
# NOTE(review): the last art line uses single backslashes ("\_"), unlike the
# doubled ones two lines above. Python keeps an unrecognized escape as-is, so
# the output is unaffected, but recent interpreters warn about it -- confirm
# and consider doubling them or using a raw string.
welcome_msg = """Welcome to
___ _ _______ ________
/ | | / / ___/ / ____/ /_ _____
/ /| | | /| / /\\__ \\ / / __/ / / / / _ \\
/ ___ | |/ |/ /___/ / / /_/ / / /_/ / __/
/_/ |_|__/|__//____/ \____/_/\____/\___/
"""

print(welcome_msg)
# Report the interpreter version plus the build number and build date.
print("Using Python version %s (%s, %s)" % (
    platform.python_version(),
    platform.python_build()[0],
    platform.python_build()[1]))
print("GlueContext available as glueContext.")
|
awsglue/gluetypes.py
ADDED
|
@@ -0,0 +1,461 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from awsglue.utils import iteritems
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# On Python 3 there is no separate ``basestring``/``unicode``; alias both to
# ``str`` so the isinstance checks in this module work on either major version.
# The numeric major version is compared because comparing the ``sys.version``
# string lexicographically would misclassify a major version such as "10".
if sys.version_info[0] >= 3:
    basestring = unicode = str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DataType(object):
    """Base class for schema types; every instance carries a ``properties`` dict.

    Instances compare equal when ``other`` is an instance of this instance's
    class and both have identical attribute dictionaries. The hash depends only
    on the class, so equal instances always hash alike.
    """

    def __init__(self, properties=None):
        """Store ``properties``; a fresh empty dict is used when it is omitted."""
        # A literal ``{}`` default would be a single dict shared by every
        # instance built without arguments, so mutating one instance's
        # properties would silently change all the others.
        self.properties = {} if properties is None else properties

    def __eq__(self, other):
        return (isinstance(other, self.__class__) and
                self.__dict__ == other.__dict__)

    def __hash__(self):
        return hash(str(self.__class__))

    @classmethod
    def typeName(cls):
        """Return the class name without its last four characters, lower-cased."""
        return cls.__name__[:-4].lower()

    def jsonValue(self):
        """Return a dict with the type name and the properties."""
        return {"dataType": self.typeName(), "properties": self.properties}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# Atomic types
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
# Note we can't use singletons like Spark does because DataType instances can
|
|
47
|
+
# have properties.
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class AtomicType(DataType):
    """Base class for the leaf (non-collection) types."""

    def __repr__(self):
        class_name = self.__class__.__name__
        return "{}({})".format(class_name, self.properties)

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build an instance, passing every key except "dataType" as a keyword."""
        constructor_kwargs = {}
        for key, value in iteritems(json_value):
            if key != "dataType":
                constructor_kwargs[key] = value
        return cls(**constructor_kwargs)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class BinaryType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class BooleanType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class ByteType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class DateType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class DecimalType(AtomicType):
    """Atomic decimal type carrying a ``precision`` and a ``scale``."""

    def __init__(self, precision=10, scale=2, properties=None):
        """Store precision, scale and properties (a fresh dict when omitted)."""
        # Avoid a shared ``{}`` default: it would be one dict reused by every
        # instance created without explicit properties.
        super(DecimalType, self).__init__(
            {} if properties is None else properties)
        self.precision = precision
        self.scale = scale

    def __repr__(self):
        return "DecimalType({}, {}, {})".format(self.precision,
                                                self.scale,
                                                self.properties)

    def jsonValue(self):
        """Return the base JSON dict extended with 'precision' and 'scale'."""
        return dict(list(super(DecimalType, self).jsonValue().items()) +
                    [('precision', self.precision), ('scale', self.scale)])
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class DoubleType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class EnumType(AtomicType):
    """Atomic type restricted to a collection of ``options``."""

    def __init__(self, options, properties=None):
        """Store the options and properties (a fresh dict when omitted)."""
        # One base-class initialisation is enough; the original called it twice.
        # ``None`` replaces the shared ``{}`` default so that instances do not
        # end up sharing a single properties dict.
        super(EnumType, self).__init__(
            {} if properties is None else properties)
        self.options = options

    def __repr__(self):
        # Show at most the first three options, then an ellipsis marker.
        options_str = ",".join(self.options[0:3])
        if len(self.options) > 3:
            options_str = options_str + ",..."
        return "EnumType([{}], {})".format(options_str, self.properties)

    def jsonValue(self):
        """Return the base JSON dict extended with the list of options."""
        # The original built this dict but never returned it, so callers
        # received None.
        return dict(list(super(EnumType, self).jsonValue().items()) +
                    [('options', list(self.options))])
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class FloatType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class IntegerType(AtomicType):
    """Atomic type whose type name is fixed to "int"."""

    @classmethod
    def typeName(cls):
        """Return the literal name "int" instead of one derived from the class name."""
        name = "int"
        return name
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class LongType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class NullType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class ShortType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class StringType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class TimestampType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class TimestampNTZType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""


class UnknownType(AtomicType):
    """Atomic type; all behaviour is inherited from AtomicType."""
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
# ---------------------------------------------------------------------------
|
|
151
|
+
# Collection types
|
|
152
|
+
# ---------------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
class ArrayType(DataType):
    """Collection type whose elements all have ``elementType``."""

    def __init__(self, elementType=None, properties=None):
        """Store the element type (UnknownType when omitted) and properties."""
        # Build the defaults per call: a default ``UnknownType()`` instance or
        # ``{}`` in the signature would be a single object shared by every
        # ArrayType created without those arguments.
        if elementType is None:
            elementType = UnknownType()
        assert isinstance(elementType, DataType),\
            "elementType should be DataType. Got " + str(elementType.__class__)
        super(ArrayType, self).__init__(
            {} if properties is None else properties)
        self.elementType = elementType

    def __repr__(self):
        return "ArrayType({}, {})".format(self.elementType, self.properties)

    def jsonValue(self):
        """Return the base JSON dict extended with the element type's JSON."""
        return dict(list(super(ArrayType, self).jsonValue().items()) +
                    [("elementType", self.elementType.jsonValue())])

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build an ArrayType from its JSON dict form."""
        element_type = _deserialize_json_value(json_value["elementType"])
        return cls(elementType=element_type,
                   properties=json_value.get('properties', {}))
|
|
174
|
+
|
|
175
|
+
class SetType(DataType):
    """Collection type whose elements all have ``elementType``."""

    def __init__(self, elementType=None, properties=None):
        """Store the element type (UnknownType when omitted) and properties."""
        # Build the defaults per call: a default ``UnknownType()`` instance or
        # ``{}`` in the signature would be a single object shared by every
        # SetType created without those arguments.
        if elementType is None:
            elementType = UnknownType()
        assert isinstance(elementType, DataType), \
            "elementType should be DataType. Got " + str(elementType.__class__)
        super(SetType, self).__init__(
            {} if properties is None else properties)
        self.elementType = elementType

    def __repr__(self):
        return "SetType({}, {})".format(self.elementType, self.properties)

    def jsonValue(self):
        """Return the base JSON dict extended with the element type's JSON."""
        return dict(list(super(SetType, self).jsonValue().items()) +
                    [("elementType", self.elementType.jsonValue())])

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build a SetType from its JSON dict form."""
        element_type = _deserialize_json_value(json_value["elementType"])
        return cls(elementType=element_type,
                   properties=json_value.get('properties', {}))
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class ChoiceType(DataType):
    """Type holding several alternative types, keyed by their ``typeName()``."""

    def __init__(self, choices=None, properties=None):
        """Register each item of ``choices`` via ``add``; store the properties."""
        # ``None`` defaults replace the mutable ``[]``/``{}`` defaults so that
        # instances never share a properties dict created in the signature.
        super(ChoiceType, self).__init__(
            {} if properties is None else properties)
        self.choices = {}
        if choices is not None:
            for choice in choices:
                self.add(choice)

    def __repr__(self):
        # Sort by type name so the representation has a stable order.
        sorted_values = sorted(self.choices.values(),
                               key=lambda x: x.typeName())
        choice_str = "[{}]".format(",".join([str(c) for c in sorted_values]))

        return "ChoiceType({}, {})".format(choice_str, self.properties)

    def add(self, new_choice):
        """Add a choice; raise ValueError if its type name is already present."""
        if new_choice.typeName() in self.choices:
            raise ValueError("Attempting to insert duplicate choice",
                             new_choice)
        self.choices[new_choice.typeName()] = new_choice

    def merge(self, new_choices):
        """Merge one choice or a list of choices into the existing ones."""
        if not isinstance(new_choices, list):
            new_choices = [new_choices]
        for choice in new_choices:
            # A missing entry is treated as UnknownType for the merge.
            existing = self.choices.get(choice.typeName(), UnknownType())
            self.choices[choice.typeName()] = mergeDataTypes(existing, choice)

    def jsonValue(self):
        """Return the base JSON dict extended with each choice's JSON."""
        return dict(list(super(ChoiceType, self).jsonValue().items()) +
                    [("choices", [v.jsonValue()
                                  for v in self.choices.values()])])

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build a ChoiceType from its JSON dict form."""
        choices = [_deserialize_json_value(c) for c in json_value["choices"]]
        return cls(choices=choices, properties=json_value.get('properties', {}))
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class MapType(DataType):
    """Map type whose values all have ``valueType``."""

    def __init__(self, valueType=None, properties=None):
        """Store the value type (UnknownType when omitted) and properties."""
        # Build the defaults per call: a default ``UnknownType()`` instance or
        # ``{}`` in the signature would be a single object shared by every
        # MapType created without those arguments.
        if valueType is None:
            valueType = UnknownType()
        assert isinstance(valueType, DataType), "valueType should be DataType"
        super(MapType, self).__init__(
            {} if properties is None else properties)
        self.valueType = valueType

    def __repr__(self):
        return "MapType({}, {})".format(self.valueType, self.properties)

    def jsonValue(self):
        """Return the base JSON dict extended with the value type's JSON."""
        return dict(list(super(MapType, self).jsonValue().items()) +
                    [("valueType", self.valueType.jsonValue())])

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build a MapType from its JSON dict form."""
        return cls(valueType=_deserialize_json_value(json_value["valueType"]),
                   properties=json_value.get('properties', {}))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class Field(object):
    """A named member of a struct: a name, a DataType and a properties dict."""

    def __init__(self, name, dataType, properties=None):
        """Validate and store the name, data type and properties."""
        assert isinstance(dataType, DataType),\
            "dataType should be DataType. Got " + str(dataType.__class__)
        assert isinstance(name, basestring),\
            "Field name must be a string. Got " + str(name.__class__)

        # Note this only applies in Python 2.7 if the name is type unicode. In that case
        # we return a str (bytestring) encoded as utf-8. This is the same behavior as
        # pyspark.sql.types.StructField. Since we are serializing as utf-8 encoded JSON,
        # the correct values should be preserved when this gets mapped to Scala.
        if not isinstance(name, str):
            name = name.encode('utf-8')
        self.name = name
        self.dataType = dataType
        # A fresh dict per instance; a ``{}`` default in the signature would be
        # shared by every Field created without explicit properties.
        self.properties = {} if properties is None else properties

    def __eq__(self, other):
        """Compare by name and data type; properties are not compared."""
        # Defer to Python's default handling for foreign types instead of
        # failing with AttributeError on ``other.name``.
        if not isinstance(other, Field):
            return NotImplemented
        return (self.name == other.name and
                self.dataType == other.dataType)

    def __repr__(self):
        return "Field({}, {}, {})".format(self.name, self.dataType,
                                          self.properties)

    def jsonValue(self):
        """Return a dict with keys "name", "container" and "properties"."""
        return {"name": self.name,
                "container": self.dataType.jsonValue(),
                "properties": self.properties}

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build a Field from its JSON dict form."""
        return cls(json_value["name"],
                   _deserialize_json_value(json_value["container"]),
                   json_value.get("properties", {}))
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
class StructType(DataType):
    """Ordered collection of Field objects with lookup by field name."""

    def __init__(self, fields=None, properties=None):
        """Store the field list (a new empty list when omitted) and index it by name."""
        super(StructType, self).__init__(
            {} if properties is None else properties)
        # ``add`` appends to ``self.fields``. With a ``[]`` default in the
        # signature that append would modify the shared default list, so later
        # ``StructType()`` instances would start with stale fields and an empty
        # ``field_map``. A caller-supplied list is still stored as-is.
        if fields is None:
            fields = []
        assert all(isinstance(f, Field) for f in fields),\
            "fields should be a list of Field"
        self.fields = fields
        self.field_map = {field.name: field for field in fields}

    def __iter__(self):
        return iter(self.fields)

    def __repr__(self):
        return "StructType([{}], {})".format(
            ",".join([str(f) for f in self.fields]), self.properties)

    def add(self, field):
        """Append a Field and register it in the name index."""
        assert isinstance(field, Field), "field must be of type Field"
        self.fields.append(field)
        self.field_map[field.name] = field

    def hasField(self, field):
        """Return whether a field with this name (or this Field's name) exists."""
        if isinstance(field, Field):
            field = field.name
        return field in self.field_map

    def getField(self, field):
        """Return the Field with this name; raises KeyError when it is absent."""
        if isinstance(field, Field):
            field = field.name
        return self.field_map[field]

    def jsonValue(self):
        """Return the base JSON dict extended with each field's JSON."""
        return dict(list(super(StructType, self).jsonValue().items()) +
                    [("fields", [f.jsonValue() for f in self.fields])])

    @classmethod
    def fromJsonValue(cls, json_value):
        """Build a StructType from its JSON dict form."""
        return cls([Field.fromJsonValue(f) for f in json_value["fields"]],
                   json_value.get("properties", {}))
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
class EntityType(DataType):
    """Placeholder type: constructing one always raises NotImplementedError."""

    def __init__(self, entity, base_type, properties):
        message = "EntityTypes not yet supported in Tape."
        raise NotImplementedError(message)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
# ---------------------------------------------------------------------------
|
|
341
|
+
# Utility methods
|
|
342
|
+
# ---------------------------------------------------------------------------
|
|
343
|
+
|
|
344
|
+
# Leaf type classes.
_atomic_types = [
    BinaryType, BooleanType, ByteType, DateType, DecimalType,
    DoubleType, EnumType, FloatType, IntegerType, LongType, NullType,
    ShortType, StringType, TimestampType, TimestampNTZType, UnknownType,
]

# Container type classes.
_complex_types = [ArrayType, ChoiceType, MapType, StructType, SetType]

# Lookup tables from ``typeName()`` to the class, used when deserializing.
_atomic_type_map = {t.typeName(): t for t in _atomic_types}  # type: ignore

_complex_type_map = {t.typeName(): t for t in _complex_types}

_all_type_map = {t.typeName(): t for t in _atomic_types + _complex_types}  # type: ignore
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def _deserialize_json_string(json_str):
    """Parse ``json_str`` as JSON and deserialize the parsed value."""
    parsed = json.loads(json_str)
    return _deserialize_json_value(parsed)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _deserialize_json_value(json_val):
|
|
366
|
+
assert isinstance(json_val, dict), "Json value must be dictionary"
|
|
367
|
+
data_type = json_val["dataType"]
|
|
368
|
+
return _all_type_map[data_type].fromJsonValue(json_val)
|
|
369
|
+
|
|
370
|
+
def _serialize_schema(schema):
|
|
371
|
+
return json.dumps(schema.jsonValue())
|
|
372
|
+
|
|
373
|
+
def _make_choice(s1, s2):
    """Combine two types into a new ChoiceType.

    Either argument may itself be a ChoiceType, in which case its existing
    choices are used. Alternatives with the same type name are merged with
    ``mergeDataTypes``. The result carries ``s1.properties``.
    """
    if isinstance(s1, ChoiceType):
        left_types = s1.choices
    else:
        left_types = {s1.typeName(): s1}

    # Work on a copy of s2's choices: merged entries are written into this
    # dict, and writing into ``s2.choices`` directly would modify the caller's
    # ChoiceType as a side effect of the merge.
    if isinstance(s2, ChoiceType):
        merged_types = dict(s2.choices)
    else:
        merged_types = {s2.typeName(): s2}

    for typecode, datatype in iteritems(left_types):
        if typecode in merged_types:
            merged_types[typecode] = mergeDataTypes(datatype,
                                                    merged_types[typecode])
        else:
            merged_types[typecode] = datatype

    return ChoiceType(merged_types.values(), s1.properties)
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
# Simple Python merge implementation. This is less efficient than the Scala
|
|
395
|
+
# version and should be used primarily for interactive manipulation.
|
|
396
|
+
# Has similar limitations to the Scala version -- does not merge properties,
|
|
397
|
+
# for instance.
|
|
398
|
+
def mergeDataTypes(s1, s2):
    """Merge two types into one that covers both.

    Unknown/Null on either side yields the other side. A ChoiceType on either
    side, or two different classes, yields a choice. Same-class structs, arrays,
    maps and enums are merged member-wise; any other same-class pair yields s1.
    """
    if isinstance(s1, (UnknownType, NullType)):
        return s2
    if isinstance(s2, (UnknownType, NullType)):
        return s1
    if isinstance(s1, ChoiceType) or isinstance(s2, ChoiceType):
        return _make_choice(s1, s2)
    if type(s1) != type(s2):
        return _make_choice(s1, s2)

    if isinstance(s1, StructType):
        merged_fields = []
        # Fields of s1 come first: merged when s2 has them, copied otherwise.
        for left in s1:
            if s2.hasField(left):
                merged_fields.append(
                    Field(left.name,
                          mergeDataTypes(left.dataType,
                                         s2.getField(left).dataType),
                          left.properties))
            else:
                merged_fields.append(
                    Field(left.name, left.dataType, left.properties))

        # Then the fields that only s2 has.
        for right in s2:
            if not s1.hasField(right):
                merged_fields.append(
                    Field(right.name, right.dataType, right.properties))
        return StructType(merged_fields, s1.properties)

    if isinstance(s1, ArrayType):
        return ArrayType(mergeDataTypes(s1.elementType, s2.elementType))
    if isinstance(s1, MapType):
        return MapType(mergeDataTypes(s1.valueType, s2.valueType))
    if isinstance(s1, EnumType):
        return EnumType(s1.options + s2.options)
    return s1
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def _create_dynamic_record(dynamicRecord):
    """Return a DynamicRecord copy of a mapping.

    Values that are exactly of type ``dict`` are wrapped in a DynamicRecord;
    all other values are kept unchanged.
    """
    converted = {
        key: DynamicRecord(value) if type(value) == dict else value
        for key, value in dynamicRecord.items()
    }
    return DynamicRecord(converted)
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _revert_to_dict(dynamicRecord):
|
|
449
|
+
if isinstance(dynamicRecord, dict):
|
|
450
|
+
return {k: _revert_to_dict(v) for k,v in iteritems(dynamicRecord)}
|
|
451
|
+
elif isinstance(dynamicRecord, list):
|
|
452
|
+
return [_revert_to_dict(v) for v in dynamicRecord]
|
|
453
|
+
else:
|
|
454
|
+
return dynamicRecord
|
|
455
|
+
|
|
456
|
+
class _MissingFieldError(AttributeError, KeyError):
    """Raised when a DynamicRecord has no entry for the requested attribute.

    It is an AttributeError so that ``hasattr``, ``getattr`` with a default and
    the copy machinery treat the attribute as absent, and also a KeyError so
    that callers already catching KeyError keep working.
    """


class DynamicRecord(dict):
    """A dict whose entries can also be read and written as attributes."""

    def __getattr__(self, attr):
        """Return ``self[attr]``; raise an AttributeError/KeyError when missing."""
        # Called only after normal attribute lookup fails. A bare KeyError here
        # would escape from hasattr() and from probes for optional special
        # methods, which expect AttributeError.
        try:
            return self[attr]
        except KeyError:
            raise _MissingFieldError(attr)

    def __setattr__(self, attr, value):
        """Store the value as a dict entry under the attribute name."""
        self[attr] = value
|
awsglue/job.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
from py4j.java_gateway import java_import # type: ignore
|
|
13
|
+
class Job:
    """Python handle for the JVM ``Job`` object.

    It is built from either a GlueContext or a SparkSession.
    """

    @classmethod
    def continuation_options(cls):
        """Return the continuation option flag followed by its values."""
        flag = '--continuation-option'
        values = ['continuation-enabled', 'continuation-readonly', 'continuation-ignore']
        return [flag] + values

    @classmethod
    def job_bookmark_options(cls):
        """Return the job bookmark option flag followed by its values."""
        flag = '--job-bookmark-option'
        values = ['job-bookmark-enable', 'job-bookmark-pause', 'job-bookmark-disable']
        return [flag] + values

    @classmethod
    def job_bookmark_range_options(cls):
        """Return the two job bookmark range flags."""
        return ['--job-bookmark-from', '--job-bookmark-to']

    @classmethod
    def id_params(cls):
        """Return the job identification parameter names."""
        return ['--JOB_NAME', '--JOB_ID', '--JOB_RUN_ID', '--SECURITY_CONFIGURATION']

    @classmethod
    def encryption_type_options(cls):
        """Return the encryption type flag followed by its value."""
        return ['--encryption-type', 'sse-s3']

    @classmethod
    def data_lineage_options(cls):
        """Return the data lineage flag."""
        return ['--enable-data-lineage']

    def __init__(self, glue_context_or_spark_session):
        """Bind to the JVM Job object reachable from the given context or session.

        Raises Exception when the argument is neither a GlueContext nor a
        SparkSession.
        """
        from pyspark.sql import SparkSession
        from awsglue.context import GlueContext

        source = glue_context_or_spark_session
        if isinstance(source, GlueContext):
            self._job = source._jvm.Job
            self._glue_context = source
            self._spark_session = source.sparkSession
            return
        if isinstance(source, SparkSession):
            # Import the Job class into the session's JVM view before using it.
            java_import(source._jvm, "com.amazonaws.services.glue.util.Job")
            self._job = source._jvm.Job
            self._glue_context = None
            self._spark_session = source
            return
        raise Exception("cannot init Job instance given input parameter type: " + str(type(source)))

    def init(self, job_name, args={}):
        """Initialise the JVM job with the job name, Java Spark session and args."""
        java_session = self._spark_session._jsparkSession
        self._job.init(job_name, java_session, args)

    def isInitialized(self):
        """Return the JVM job's ``isInitialized()`` result."""
        initialized = self._job.isInitialized()
        return initialized

    def commit(self):
        """Call ``commit()`` on the JVM job."""
        self._job.commit()
|
|
59
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|