AWSGlueDataplanePython 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsglue/README.md +37 -0
- awsglue/__init__.py +15 -0
- awsglue/context.py +690 -0
- awsglue/data_sink.py +49 -0
- awsglue/data_source.py +49 -0
- awsglue/dataframe_transforms/__init__.py +17 -0
- awsglue/dataframe_transforms/apply_mapping.py +76 -0
- awsglue/dataframereader.py +41 -0
- awsglue/dataframewriter.py +21 -0
- awsglue/devutils.py +236 -0
- awsglue/dynamicframe.py +669 -0
- awsglue/functions.py +31 -0
- awsglue/glue_shell.py +38 -0
- awsglue/gluetypes.py +461 -0
- awsglue/job.py +59 -0
- awsglue/scripts/__init__.py +12 -0
- awsglue/scripts/activate_etl_connector.py +362 -0
- awsglue/scripts/connector_activation_util.py +38 -0
- awsglue/scripts/crawler_redo_from_backup.py +75 -0
- awsglue/scripts/crawler_undo.py +121 -0
- awsglue/scripts/scripts_utils.py +106 -0
- awsglue/streaming_data_source.py +28 -0
- awsglue/transforms/__init__.py +47 -0
- awsglue/transforms/apply_mapping.py +72 -0
- awsglue/transforms/coalesce.py +66 -0
- awsglue/transforms/collection_transforms.py +155 -0
- awsglue/transforms/drop_nulls.py +85 -0
- awsglue/transforms/dynamicframe_filter.py +66 -0
- awsglue/transforms/dynamicframe_map.py +72 -0
- awsglue/transforms/errors_as_dynamicframe.py +45 -0
- awsglue/transforms/field_transforms.py +469 -0
- awsglue/transforms/relationalize.py +105 -0
- awsglue/transforms/repartition.py +61 -0
- awsglue/transforms/resolve_choice.py +85 -0
- awsglue/transforms/transform.py +92 -0
- awsglue/transforms/unbox.py +112 -0
- awsglue/transforms/union.py +66 -0
- awsglue/transforms/unnest_frame.py +75 -0
- awsglue/utils.py +159 -0
- awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
- awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
- awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
- awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
from awsglue.context import GlueContext
|
|
15
|
+
from awsglue.dynamicframe import DynamicFrame
|
|
16
|
+
from awsglue.transforms import get_transform
|
|
17
|
+
from pyspark.sql.types import *
|
|
18
|
+
from pyspark.sql.functions import *
|
|
19
|
+
|
|
20
|
+
COLLECT_RESULT_NAME = "collect_list(named_struct(NamePlaceholder(), unresolvedstar()))"
|
|
21
|
+
DEFAULT_CATALOG_ENDPOINT = 'daylight-gamma'
|
|
22
|
+
DEFAULT_GLUE_ENDPOINT = 'glue-beta'
|
|
23
|
+
DEFAULT_REGION = 'us-east-1'
|
|
24
|
+
|
|
25
|
+
def write_backup(data, database_name, backup_location, glue_context):
    """Nest the table and partition entities and write them to a backup location.

    ``data`` is indexed with the keys ``'table'`` and ``'partition'``.  Each
    value is passed through ``_order_columns_for_backup`` and then
    ``nest_data_frame``; the two nested frames are unioned and handed to
    ``write_df_to_s3`` together with ``glue_context`` and ``backup_location``.
    """
    tables = nest_data_frame(
        _order_columns_for_backup(data['table']), database_name, 'table')
    partitions = nest_data_frame(
        _order_columns_for_backup(data['partition']), database_name, 'partition')

    # Add a literal "table" column to the nested tables and select
    # table/items/database/type before unioning with the nested partitions.
    tables_with_placeholder = tables.withColumn("table", lit("empty"))
    aligned_tables = tables_with_placeholder.select(
        col("table"), "items", "database", "type")

    write_df_to_s3(glue_context, aligned_tables.union(partitions), backup_location)
|
|
33
|
+
|
|
34
|
+
def _order_columns_for_backup(dataframe):
    """Return ``dataframe`` projected onto the backup columns, in a fixed order."""
    backup_columns = (
        'name',
        'description',
        'owner',
        'createTime',
        'updateTime',
        'lastAccessTime',
        'lastAnalyzedTime',
        'retention',
        'storageDescriptor',
        'partitionKeys',
        'tableType',
        'parameters',
        'createdBy',
        'values',
        'namespaceName',
        'tableName',
        'table',
    )
    return dataframe.select(*[col(column_name) for column_name in backup_columns])
|
|
54
|
+
|
|
55
|
+
def nest_data_frame(data_frame, database_name, entity_type):
    """Collapse the rows of ``data_frame`` into an ``items`` array column.

    The grouping depends on the prefix of ``entity_type``:

    * ``"table..."``     -- all rows are aggregated into one row; ``database``
      and ``type`` columns are added.
    * ``"partition..."`` -- rows are aggregated per ``tableName``; ``database``
      and ``type`` columns are added and ``tableName`` is renamed to ``table``.
    * ``"database..."``  -- all rows are aggregated into one row; only a
      ``type`` column is added.

    :param data_frame: frame whose rows are collected as structs
    :param database_name: literal value for the ``database`` column
    :param entity_type: entity kind; also the literal value of the ``type`` column
    :return: the nested frame
    :raises Exception: if ``entity_type`` has none of the prefixes above
    """
    is_table = entity_type.startswith("table")
    is_partition = entity_type.startswith("partition")
    is_database = entity_type.startswith("database")
    if not (is_table or is_partition or is_database):
        raise Exception("entity_type %s is not recognized, your backup data may be corrupted..." % entity_type)

    # Name the aggregate explicitly.  Renaming it afterwards by its
    # auto-generated column name is fragile: withColumnRenamed does nothing
    # when that name does not match, leaving the frame without "items".
    items = collect_list(struct("*")).alias("items")

    if is_table:
        # Entity is a table
        return (data_frame.agg(items)
                .withColumn("database", lit(database_name))
                .withColumn("type", lit(entity_type)))

    if is_partition:
        # Entity is a partition
        return (data_frame.groupBy('tableName').agg(items)
                .withColumn("database", lit(database_name))
                .withColumn("type", lit(entity_type))
                .withColumnRenamed("tableName", "table"))

    # Entity is a database
    return data_frame.groupBy().agg(items).withColumn("type", lit(entity_type))
|
|
66
|
+
|
|
67
|
+
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    """Nest ``data_frame`` and write it through the ``'catalog'`` sink.

    Nothing is written when the frame has no rows.  The database name is
    read from ``options['catalog.database']`` and ``options`` is also passed
    as keyword arguments to ``glue_context.getSink``.
    """
    # The emptiness check goes through the underlying RDD.
    has_rows = not data_frame.rdd.isEmpty()
    if has_rows:
        nested = nest_data_frame(data_frame, options['catalog.database'], entity_type)
        nested_dynamic_frame = DynamicFrame.fromDF(nested, glue_context, entity_type)
        catalog_sink = glue_context.getSink('catalog', **options)
        catalog_sink.write(nested_dynamic_frame)
|
|
76
|
+
|
|
77
|
+
def catalog_dict(data_frame):
    """Split a nested catalog frame into one flattened frame per entity type.

    For every entity type the rows whose ``type`` column equals that type are
    selected, the ``items`` array is exploded, and the fields of each exploded
    struct are expanded into columns.  The result maps the entity type name to
    its frame.
    """
    def _items_of(entity_type):
        matching_rows = data_frame.filter("type = '%s'" % entity_type)
        exploded = matching_rows.select(explode(data_frame['items']))
        # explode() names its output column "col"; expand the struct's fields.
        return exploded.select(col("col.*"))

    entity_types = (
        'database',
        'table',
        'tableVersion',
        'partition',
        'tableToDelete',
        'partitionToDelete',
    )
    return {entity_type: _items_of(entity_type) for entity_type in entity_types}
|
|
92
|
+
|
|
93
|
+
def read_from_catalog(glue_context, options):
    """Read catalog entities via the DataCatalogConnection and split them by type.

    ``options`` is forwarded as ``connection_options``; the resulting dynamic
    frame is converted with ``toDF()`` and passed to ``catalog_dict``.
    """
    catalog_frame = glue_context.create_dynamic_frame_from_options(
        connection_type="com.amazonaws.services.glue.connections.DataCatalogConnection",
        connection_options=options)
    return catalog_dict(catalog_frame.toDF())
|
|
96
|
+
|
|
97
|
+
def write_df_to_s3(glue_context, data_frame, backup_location):
    """Write ``data_frame`` as JSON to ``backup_location`` through the ``"s3"`` sink."""
    frame_to_write = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    json_sink = glue_context.getSink("s3", path=backup_location)
    json_sink.setFormat("json")
    json_sink.write(frame_to_write)
|
|
102
|
+
|
|
103
|
+
def read_from_s3(glue_context, backup_location):
    """Read the JSON backup at ``backup_location`` and split it by entity type.

    The data is loaded through the ``"file"`` source, converted with
    ``toDF()`` and passed to ``catalog_dict``.
    """
    json_source = glue_context.getSource("file", paths=[backup_location])
    json_source.setFormat('json')
    backup_frame = json_source.getFrame()
    return catalog_dict(backup_frame.toDF())
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from awsglue.utils import makeOptions, callsite
|
|
14
|
+
from pyspark.sql import DataFrame
|
|
15
|
+
|
|
16
|
+
class StreamingDataSource(object):
    """Python-side wrapper around a streaming source object ``j_source``."""

    def __init__(self, j_source, sql_ctx, name):
        """Store the wrapped source, the SQL context and the source name."""
        self.name = name
        self._sql_ctx = sql_ctx
        self._jsource = j_source

    def setFormat(self, format, **options):
        """Set the format on the wrapped source.

        The keyword ``options`` are extended with a ``"callSite"`` entry and
        converted with ``makeOptions`` before being handed to the source.
        """
        format_options = options
        format_options["callSite"] = callsite()
        converted = makeOptions(self._sql_ctx._sc, format_options)
        self._jsource.setFormat(format, converted)

    def getFrame(self):
        """Return a ``DataFrame`` wrapping the source's ``getDataFrame()`` result."""
        return DataFrame(self._jsource.getDataFrame(), self._sql_ctx)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from .transform import GlueTransform
|
|
14
|
+
from .unbox import Unbox
|
|
15
|
+
from .unnest_frame import UnnestFrame
|
|
16
|
+
from .relationalize import Relationalize
|
|
17
|
+
from .field_transforms import RenameField, DropFields, SelectFields, SplitFields, SplitRows, Join, Spigot
|
|
18
|
+
from .collection_transforms import SelectFromCollection, MapToCollection, FlatMap
|
|
19
|
+
from .drop_nulls import DropNullFields
|
|
20
|
+
from .apply_mapping import ApplyMapping
|
|
21
|
+
from .repartition import Repartition
|
|
22
|
+
from .resolve_choice import ResolveChoice
|
|
23
|
+
from .errors_as_dynamicframe import ErrorsAsDynamicFrame
|
|
24
|
+
from .dynamicframe_filter import Filter
|
|
25
|
+
from .dynamicframe_map import Map
|
|
26
|
+
from .coalesce import Coalesce
|
|
27
|
+
from .union import Union
|
|
28
|
+
import json
|
|
29
|
+
|
|
30
|
+
# Set of every transform class registered by this package.
ALL_TRANSFORMS = {
    Unbox,
    RenameField,
    DropFields,
    SplitFields,
    SelectFields,
    SplitRows,
    UnnestFrame,
    Relationalize,
    SelectFromCollection,
    MapToCollection,
    ErrorsAsDynamicFrame,
    FlatMap,
    DropNullFields,
    Join,
    ApplyMapping,
    Repartition,
    ResolveChoice,
    Spigot,
    Filter,
    Map,
    Coalesce,
    Union,
}

# Names exported by a star-import: one per registered transform class.
__all__ = [transform_cls.__name__ for transform_cls in ALL_TRANSFORMS]
|
|
36
|
+
|
|
37
|
+
def get_transforms():
    """Return a set holding one new instance of every class in ALL_TRANSFORMS."""
    instances = set()
    for transform_cls in ALL_TRANSFORMS:
        instances.add(transform_cls())
    return instances
|
|
39
|
+
|
|
40
|
+
def get_transform(name):
    """Return the transform instance whose ``name()`` equals ``name``, ignoring case.

    Returns ``None`` when no transform matches.  More than one match raises
    ``ValueError`` from the single-element unpacking.
    """
    matches = []
    for candidate in get_transforms():
        if candidate.name().lower() == name.lower():
            matches.append(candidate)

    if not matches:
        return None

    # Exactly one match is expected; unpacking enforces it.
    (found,) = matches
    return found
|
|
43
|
+
|
|
44
|
+
def describe_transform(name):
    """Return the description of the named transform as pretty-printed JSON.

    When ``get_transform`` finds no transform, an empty JSON object is
    returned.
    """
    found = get_transform(name)
    if found:
        description = found.describe()
    else:
        description = {}
    return json.dumps(description, indent=4, sort_keys=True, separators=(',', ': '))
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from awsglue.transforms import DropFields, GlueTransform
|
|
14
|
+
|
|
15
|
+
class ApplyMapping(GlueTransform):
    """Transform that delegates to ``frame.apply_mapping``."""

    def __call__(self, frame, mappings, case_sensitive = False,
                 transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """Apply ``mappings`` to ``frame`` and return the result of ``frame.apply_mapping``."""
        return frame.apply_mapping(mappings, case_sensitive, transformation_ctx,
                                   info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        arg1 = {"name": "frame",
                "type": "DynamicFrame",
                "description": "DynamicFrame to transform",
                "optional": False,
                "defaultValue": None}
        # mappings is a list of tuples, not a DynamicFrame.
        arg2 = {"name": "mappings",
                "type": "List",
                "description": "List of mapping tuples (source col, source type, target col, target type)",
                "optional": False,
                "defaultValue": None}
        # The original description stopped after "Whether ".
        arg3 = {"name": "case_sensitive",
                "type": "Boolean",
                "description": "Whether to treat source columns as case sensitive",
                "optional": True,
                "defaultValue": "False"}
        arg4 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}
        arg5 = {"name": "info",
                "type": "String",
                "description": "Any string to be associated with errors in the transformation",
                "optional": True,
                "defaultValue": "\"\""}
        arg6 = {"name": "stageThreshold",
                "type": "Integer",
                "description": "Max number of errors in the transformation until processing will error out",
                "optional": True,
                "defaultValue": "0"}
        arg7 = {"name": "totalThreshold",
                "type": "Integer",
                "description": "Max number of errors total until processing will error out.",
                "optional": True,
                "defaultValue": "0"}

        return [arg1, arg2, arg3, arg4, arg5, arg6, arg7]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Apply a declarative mapping to this DynamicFrame."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrame",
                "description": "DynamicFrame after applying mappings."}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from awsglue.transforms import GlueTransform
|
|
14
|
+
|
|
15
|
+
class Coalesce(GlueTransform):
    """Transform that delegates to ``frame.coalesce``."""

    def __call__(self, frame, num_partitions, shuffle = False, transformation_ctx = "", info = "",
                 stageThreshold = 0, totalThreshold = 0):
        """Coalesce ``frame`` and return the result of ``frame.coalesce``."""
        return frame.coalesce(num_partitions, shuffle, transformation_ctx, info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        # __call__ takes the frame first; it was missing from this list.
        arg1 = {"name": "frame",
                "type": "DynamicFrame",
                "description": "The DynamicFrame to coalesce",
                "optional": False,
                "defaultValue": None}
        # num_partitions is a count, not a DynamicFrame.
        arg2 = {"name": "num_partitions",
                "type": "Integer",
                "description": "Number of partitions",
                "optional": False,
                "defaultValue": None}
        arg3 = {"name": "shuffle",
                "type": "Boolean",
                "description": "A boolean indicating whether shuffling enabled for the coalesce process",
                "optional": True,
                "defaultValue": False}
        arg4 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}
        arg5 = {"name": "info",
                "type": "String",
                "description": "Any string to be associated with errors in the transformation",
                "optional": True,
                "defaultValue": "\"\""}
        arg6 = {"name": "stageThreshold",
                "type": "Integer",
                "description": "Max number of errors in the transformation until processing will error out",
                "optional": True,
                "defaultValue": "0"}
        arg7 = {"name": "totalThreshold",
                "type": "Integer",
                "description": "Max number of errors total until processing will error out.",
                "optional": True,
                "defaultValue": "0"}

        return [arg1, arg2, arg3, arg4, arg5, arg6, arg7]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Coalesces a DynamicFrame."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrame",
                "description": "The coalesced DynamicFrame."}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from awsglue.transforms import GlueTransform
|
|
14
|
+
|
|
15
|
+
class SelectFromCollection(GlueTransform):
    """Transform that delegates to ``dfc.select``."""

    def __call__(self, dfc, key, transformation_ctx = ""):
        """Return the result of ``dfc.select(key, transformation_ctx)``."""
        return dfc.select(key, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        return [
            {"name": "dfc",
             "type": "DynamicFrameCollection",
             "description": "select one DynamicFrame from this DynamicFrameCollection",
             "optional": False,
             "defaultValue": None},
            {"name": "key",
             "type": "String",
             "description": "The key to select",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Select one DynamicFrame out from the DynamicFrameCollection"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrame",
                "description": "Dynamic Frame corresponding to name"}
|
|
54
|
+
|
|
55
|
+
class MapToCollection(GlueTransform):
    """Transform that delegates to ``dfc.map``."""

    def __call__(self, dfc, callable, transformation_ctx = ""):
        """Return the result of ``dfc.map(callable, transformation_ctx)``."""
        return dfc.map(callable, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        # The type was spelled "CollectionDynamicFrame"; the rest of this file
        # uses "DynamicFrameCollection" for the same kind of argument.
        arg1 = {"name": "dfc",
                "type": "DynamicFrameCollection",
                "description": "apply function on this DynamicFrameCollection",
                "optional": False,
                "defaultValue": None}

        arg2 = {"name": "callable",
                "type": "Callable",
                "description": "apply this Callable on DynamicFrameCollection",
                "optional": False,
                "defaultValue": None}

        arg3 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}

        return [arg1, arg2, arg3]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Apply a transform on each DynamicFrame of this DynamicFrameCollection"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrameCollection",
                "description": "A new DynamicFrameCollection after apply transform on each element"}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class FlatMap(GlueTransform):
    """Transform that applies another transform through ``dfc.flatmap``."""

    def __call__(self, dfc, BaseTransform, frame_name, transformation_ctx = "", **base_kwargs):
        """Call ``BaseTransform.apply`` through ``dfc.flatmap``.

        For each call made by ``dfc.flatmap``, the keyword arguments are
        ``base_kwargs`` plus the frame (under the name ``frame_name``) and the
        ``transformation_ctx`` supplied for that call.
        """
        def apply_inner(frame, transformation_ctx):
            # Later assignments override same-named entries of base_kwargs.
            call_kwargs = dict(base_kwargs)
            call_kwargs[frame_name] = frame
            call_kwargs["transformation_ctx"] = transformation_ctx
            return BaseTransform.apply(**call_kwargs)

        return dfc.flatmap(apply_inner, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        return [
            {"name": "dfc",
             "type": "DynamicFrameCollection",
             "description": "The collection over which to flatmap.",
             "optional": False,
             "defaultValue": None},
            {"name": "BaseTransform",
             "type": "GlueTransform",
             "description": "A GlueTransform to apply to each member of the collection.",
             "optional": False,
             "defaultValue": None},
            {"name": "frame_name",
             "type": "String",
             "description": "The argument name to which to pass the elements of the collection.",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "base_kwargs",
             "type": "dict",
             "description": "Arguments to pass to the base transform.",
             "optional": False,
             "defaultValue": None},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Applies a transform to each DynamicFrame in a collection and flattens the results."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrameCollection",
                "description": "A new DynamicFrameCollection after applying the transform on each element"}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function
|
|
14
|
+
from awsglue.transforms import DropFields, GlueTransform
|
|
15
|
+
from awsglue.gluetypes import ArrayType, NullType, StructType
|
|
16
|
+
|
|
17
|
+
class DropNullFields(GlueTransform):
    """Transform that drops every field whose schema type is ``NullType``."""

    def _find_null_fields(self, ctx, schema, path, output):
        """Append to ``output`` the path of each ``NullType`` found under ``schema``.

        Struct fields extend ``path`` with a dot and the quoted field name;
        arrays of structs are searched under the unchanged ``path``.
        """
        if isinstance(schema, StructType):
            prefix = path if path == "" else path + "."
            for field in schema:
                quoted_name = ctx._jvm.RecordUtils.quoteName(field.name)
                self._find_null_fields(ctx, field.dataType, prefix + quoted_name, output)
            return

        if isinstance(schema, ArrayType):
            # For the moment we only remove null fields in nested array columns.
            # We don't change ArrayType(NullType).
            element_type = schema.elementType
            if isinstance(element_type, StructType):
                self._find_null_fields(ctx, element_type, path, output)
            return

        if isinstance(schema, NullType):
            output.append(path)

        # Note: dropFields currently does not work through maps,
        # so neither does DropNullFields

    def __call__(self, frame, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """Find the null-typed field paths in ``frame``'s schema and drop them.

        The paths are printed and then passed to ``DropFields.apply``.
        """
        paths_to_drop = []
        self._find_null_fields(frame.glue_ctx, frame.schema(), "", paths_to_drop)
        print("null_fields", paths_to_drop)

        return DropFields.apply(frame, paths_to_drop, transformation_ctx,
                                info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        return [
            {"name": "frame",
             "type": "DynamicFrame",
             "description": "Drop all null fields in this DynamicFrame",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "info",
             "type": "String",
             "description": "Any string to be associated with errors in the transformation",
             "optional": True,
             "defaultValue": "\"\""},
            {"name": "stageThreshold",
             "type": "Integer",
             "description": "Max number of errors in the transformation until processing will error out",
             "optional": True,
             "defaultValue": "0"},
            {"name": "totalThreshold",
             "type": "Integer",
             "description": "Max number of errors total until processing will error out.",
             "optional": True,
             "defaultValue": "0"},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Drop all null fields in this DynamicFrame"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrame",
                "description": "DynamicFrame without null fields."}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from awsglue.transforms import GlueTransform
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Filter(GlueTransform):
    """Transform that delegates to ``frame.filter``."""

    def __call__(self, frame, f, transformation_ctx = "", info="", stageThreshold=0, totalThreshold=0):
        """Return the result of ``frame.filter`` called with predicate ``f``."""
        return frame.filter(f, transformation_ctx, info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one descriptor dict per ``__call__`` argument, in call order."""
        return [
            {"name": "frame",
             "type": "DynamicFrame",
             "description": "The DynamicFrame to apply the Filter function",
             "optional": False,
             "defaultValue": None},
            {"name": "f",
             "type": "Function",
             "description": "Predicate function to call on the DynamicFrame. The function takes DynamicRecord as the argument and returns True/False",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "info",
             "type": "String",
             "description": "Any string to be associated with errors in the transformation",
             "optional": True,
             "defaultValue": "\"\""},
            {"name": "stageThreshold",
             "type": "Integer",
             "description": "Max number of errors in the transformation until processing will error out",
             "optional": True,
             "defaultValue": "0"},
            {"name": "totalThreshold",
             "type": "Integer",
             "description": "Max number of errors total until processing will error out.",
             "optional": True,
             "defaultValue": "0"},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Builds a new DynamicFrame by selecting records from the input frame that satisfy the predicate function"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a descriptor dict for the transform's return value."""
        return {"type": "DynamicFrame",
                "description": "new DynamicFrame with DynamicRecords that matched the predicate"}
|