AWSGlueDataplanePython 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. awsglue/README.md +37 -0
  2. awsglue/__init__.py +15 -0
  3. awsglue/context.py +690 -0
  4. awsglue/data_sink.py +49 -0
  5. awsglue/data_source.py +49 -0
  6. awsglue/dataframe_transforms/__init__.py +17 -0
  7. awsglue/dataframe_transforms/apply_mapping.py +76 -0
  8. awsglue/dataframereader.py +41 -0
  9. awsglue/dataframewriter.py +21 -0
  10. awsglue/devutils.py +236 -0
  11. awsglue/dynamicframe.py +669 -0
  12. awsglue/functions.py +31 -0
  13. awsglue/glue_shell.py +38 -0
  14. awsglue/gluetypes.py +461 -0
  15. awsglue/job.py +59 -0
  16. awsglue/scripts/__init__.py +12 -0
  17. awsglue/scripts/activate_etl_connector.py +362 -0
  18. awsglue/scripts/connector_activation_util.py +38 -0
  19. awsglue/scripts/crawler_redo_from_backup.py +75 -0
  20. awsglue/scripts/crawler_undo.py +121 -0
  21. awsglue/scripts/scripts_utils.py +106 -0
  22. awsglue/streaming_data_source.py +28 -0
  23. awsglue/transforms/__init__.py +47 -0
  24. awsglue/transforms/apply_mapping.py +72 -0
  25. awsglue/transforms/coalesce.py +66 -0
  26. awsglue/transforms/collection_transforms.py +155 -0
  27. awsglue/transforms/drop_nulls.py +85 -0
  28. awsglue/transforms/dynamicframe_filter.py +66 -0
  29. awsglue/transforms/dynamicframe_map.py +72 -0
  30. awsglue/transforms/errors_as_dynamicframe.py +45 -0
  31. awsglue/transforms/field_transforms.py +469 -0
  32. awsglue/transforms/relationalize.py +105 -0
  33. awsglue/transforms/repartition.py +61 -0
  34. awsglue/transforms/resolve_choice.py +85 -0
  35. awsglue/transforms/transform.py +92 -0
  36. awsglue/transforms/unbox.py +112 -0
  37. awsglue/transforms/union.py +66 -0
  38. awsglue/transforms/unnest_frame.py +75 -0
  39. awsglue/utils.py +159 -0
  40. awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
  41. awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
  42. awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
  43. awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
  44. awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
  45. awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,106 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ import os
14
+ from awsglue.context import GlueContext
15
+ from awsglue.dynamicframe import DynamicFrame
16
+ from awsglue.transforms import get_transform
17
+ from pyspark.sql.types import *
18
+ from pyspark.sql.functions import *
19
+
20
+ COLLECT_RESULT_NAME = "collect_list(named_struct(NamePlaceholder(), unresolvedstar()))"
21
+ DEFAULT_CATALOG_ENDPOINT = 'daylight-gamma'
22
+ DEFAULT_GLUE_ENDPOINT = 'glue-beta'
23
+ DEFAULT_REGION = 'us-east-1'
24
+
25
def write_backup(data, database_name, backup_location, glue_context):
    """Nest the table and partition frames of ``data`` and write them to S3.

    :param data: mapping with at least the keys ``'table'`` and ``'partition'``,
        each holding a DataFrame
    :param database_name: value stored in the ``database`` column of the backup
    :param backup_location: destination passed on to ``write_df_to_s3``
    :param glue_context: GlueContext used for the write
    """
    tables_nested = nest_data_frame(
        _order_columns_for_backup(data['table']), database_name, 'table')
    partitions_nested = nest_data_frame(
        _order_columns_for_backup(data['partition']), database_name, 'partition')

    # The nested table frame has no "table" column, so add a placeholder and
    # put the columns in the (table, items, database, type) order that
    # nest_data_frame yields for partitions before the two are unioned.
    tables_aligned = tables_nested.withColumn("table", lit("empty")).select(
        col("table"), "items", "database", "type")
    combined = tables_aligned.union(partitions_nested)

    write_df_to_s3(glue_context, combined, backup_location)
33
+
34
def _order_columns_for_backup(dataframe):
    """Project ``dataframe`` onto the fixed column order used for backups.

    :param dataframe: DataFrame that contains every column listed below
    :return: the result of ``dataframe.select`` over those columns, in order
    """
    backup_columns = (
        'name',
        'description',
        'owner',
        'createTime',
        'updateTime',
        'lastAccessTime',
        'lastAnalyzedTime',
        'retention',
        'storageDescriptor',
        'partitionKeys',
        'tableType',
        'parameters',
        'createdBy',
        'values',
        'namespaceName',
        'tableName',
        'table',
    )
    return dataframe.select(*[col(column_name) for column_name in backup_columns])
54
+
55
def nest_data_frame(data_frame, database_name, entity_type):
    """Collapse the rows of ``data_frame`` into an ``items`` array column.

    The branch is chosen by the prefix of ``entity_type``, so e.g.
    ``"tableVersion"`` takes the table branch and ``"partitionToDelete"`` the
    partition branch.

    * ``table*``: one row with columns ``items``, ``database``, ``type``.
    * ``partition*``: one row per ``tableName`` with columns ``table``,
      ``items``, ``database``, ``type``.
    * ``database*``: one row with columns ``items``, ``type``.

    :param data_frame: DataFrame whose rows are to be nested
    :param database_name: literal stored in the ``database`` column
    :param entity_type: entity kind; also stored in the ``type`` column
    :return: the nested DataFrame
    :raises Exception: if ``entity_type`` matches none of the known prefixes
    """
    # Alias the aggregate directly rather than renaming Spark's auto-generated
    # column name afterwards: withColumnRenamed silently does nothing when the
    # generated name differs from the expected one, which would leave the
    # result without an "items" column.
    if entity_type.startswith("table"):
        # Entity is a table
        nested = data_frame.agg(collect_list(struct("*")).alias("items"))
        return nested.withColumn("database", lit(database_name)) \
                     .withColumn("type", lit(entity_type))

    if entity_type.startswith("partition"):
        # Entity is a partition
        nested = data_frame.groupBy('tableName').agg(collect_list(struct("*")).alias("items"))
        return nested.withColumn("database", lit(database_name)) \
                     .withColumn("type", lit(entity_type)) \
                     .withColumnRenamed("tableName", "table")

    if entity_type.startswith("database"):
        nested = data_frame.groupBy().agg(collect_list(struct("*")).alias("items"))
        return nested.withColumn("type", lit(entity_type))

    raise Exception("entity_type %s is not recognized, your backup data may be corrupted..." % entity_type)
66
+
67
def write_df_to_catalog(data_frame, entity_type, glue_context, options):
    """Nest ``data_frame`` and write it through the Glue ``'catalog'`` sink.

    Nothing is written when ``data_frame`` has no rows.

    :param data_frame: DataFrame of catalog entities
    :param entity_type: entity kind forwarded to ``nest_data_frame``
    :param glue_context: GlueContext providing ``getSink``
    :param options: sink options; must contain ``'catalog.database'``
    """
    # DataFrame offers no "empty" method here; probing the underlying RDD is
    # the closest available check.
    if not data_frame.rdd.isEmpty():
        nested = nest_data_frame(data_frame, options['catalog.database'], entity_type)
        frame_to_write = DynamicFrame.fromDF(nested, glue_context, entity_type)
        catalog_sink = glue_context.getSink('catalog', **options)
        catalog_sink.write(frame_to_write)
76
+
77
def catalog_dict(data_frame):
    """Split a nested catalog frame into one flat DataFrame per entity type.

    :param data_frame: DataFrame with a ``type`` column and an ``items`` array
        column
    :return: dict keyed by ``'database'``, ``'table'``, ``'tableVersion'``,
        ``'partition'``, ``'tableToDelete'`` and ``'partitionToDelete'``
    """
    entity_types = (
        'database',
        'table',
        'tableVersion',
        'partition',
        'tableToDelete',
        'partitionToDelete',
    )

    def _flatten(entity_type):
        # Keep only the rows of this type, explode the items array into one
        # row per element, then expand the exploded struct (selected as "col")
        # into top-level columns.
        rows_of_type = data_frame.filter("type = '%s'" % entity_type)
        exploded = rows_of_type.select(explode(data_frame['items']))
        return exploded.select(col("col.*"))

    return {entity_type: _flatten(entity_type) for entity_type in entity_types}
92
+
93
def read_from_catalog(glue_context, options):
    """Read through the DataCatalogConnection and split the result by entity type.

    :param glue_context: GlueContext used to create the dynamic frame
    :param options: connection options forwarded unchanged
    :return: the dict produced by ``catalog_dict``
    """
    catalog_frame = glue_context.create_dynamic_frame_from_options(
        connection_type="com.amazonaws.services.glue.connections.DataCatalogConnection",
        connection_options=options)
    return catalog_dict(catalog_frame.toDF())
96
+
97
def write_df_to_s3(glue_context, data_frame, backup_location):
    """Write ``data_frame`` as JSON to ``backup_location`` via the Glue ``"s3"`` sink.

    :param glue_context: GlueContext providing ``getSink``
    :param data_frame: DataFrame to write
    :param backup_location: path handed to the sink
    """
    frame_to_write = DynamicFrame.fromDF(data_frame, glue_context, "toS3")
    s3_sink = glue_context.getSink("s3", path=backup_location)
    s3_sink.setFormat("json")
    s3_sink.write(frame_to_write)
102
+
103
def read_from_s3(glue_context, backup_location):
    """Read a JSON backup from ``backup_location`` and split it by entity type.

    :param glue_context: GlueContext providing ``getSource``
    :param backup_location: path of the backup to read
    :return: the dict produced by ``catalog_dict``
    """
    backup_source = glue_context.getSource("file", paths=[backup_location])
    backup_source.setFormat('json')
    backup_df = backup_source.getFrame().toDF()
    return catalog_dict(backup_df)
@@ -0,0 +1,28 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from awsglue.utils import makeOptions, callsite
14
+ from pyspark.sql import DataFrame
15
+
16
class StreamingDataSource(object):
    """Python wrapper around a source object that yields a Spark DataFrame.

    NOTE(review): ``j_source`` is presumably a py4j handle to a JVM-side
    source; this file only shows that it exposes ``setFormat`` and
    ``getDataFrame``.
    """

    def __init__(self, j_source, sql_ctx, name):
        """Store the wrapped source, the SQL context and the source name."""
        self.name = name
        self._sql_ctx = sql_ctx
        self._jsource = j_source

    def setFormat(self, format, **options):
        """Set the format on the wrapped source.

        The current call site is recorded under the ``"callSite"`` key of
        ``options`` before the options are converted with ``makeOptions``.
        """
        options["callSite"] = callsite()
        converted_options = makeOptions(self._sql_ctx._sc, options)
        self._jsource.setFormat(format, converted_options)

    def getFrame(self):
        """Return the source's data wrapped as a ``pyspark.sql.DataFrame``."""
        return DataFrame(self._jsource.getDataFrame(), self._sql_ctx)
@@ -0,0 +1,47 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from .transform import GlueTransform
14
+ from .unbox import Unbox
15
+ from .unnest_frame import UnnestFrame
16
+ from .relationalize import Relationalize
17
+ from .field_transforms import RenameField, DropFields, SelectFields, SplitFields, SplitRows, Join, Spigot
18
+ from .collection_transforms import SelectFromCollection, MapToCollection, FlatMap
19
+ from .drop_nulls import DropNullFields
20
+ from .apply_mapping import ApplyMapping
21
+ from .repartition import Repartition
22
+ from .resolve_choice import ResolveChoice
23
+ from .errors_as_dynamicframe import ErrorsAsDynamicFrame
24
+ from .dynamicframe_filter import Filter
25
+ from .dynamicframe_map import Map
26
+ from .coalesce import Coalesce
27
+ from .union import Union
28
+ import json
29
+
30
# Registry of the transform classes exposed by this package. It is a set, so
# iteration order is not defined.
ALL_TRANSFORMS = {
    Unbox,
    RenameField,
    DropFields,
    SplitFields,
    SelectFields,
    SplitRows,
    UnnestFrame,
    Relationalize,
    SelectFromCollection,
    MapToCollection,
    ErrorsAsDynamicFrame,
    FlatMap,
    DropNullFields,
    Join,
    ApplyMapping,
    Repartition,
    ResolveChoice,
    Spigot,
    Filter,
    Map,
    Coalesce,
    Union,
}

# The public names of the package are the class names of the registry.
__all__ = [transform_cls.__name__ for transform_cls in ALL_TRANSFORMS]
36
+
37
def get_transforms():
    """Return a set with one new instance of every class in ALL_TRANSFORMS."""
    instances = set()
    for transform_cls in ALL_TRANSFORMS:
        instances.add(transform_cls())
    return instances
39
+
40
def get_transform(name):
    """Look up a transform instance by name, ignoring case.

    :param name: transform name to search for
    :return: the matching transform instance, or ``None`` if there is none
    :raises ValueError: if more than one transform matches ``name``
    """
    matches = [candidate for candidate in get_transforms()
               if candidate.name().lower() == name.lower()]
    if not matches:
        return None
    # Single-element unpacking: fails loudly when the name is ambiguous.
    (match,) = matches
    return match
43
+
44
def describe_transform(name):
    """Return the description of the named transform as pretty-printed JSON.

    :param name: transform name, resolved with ``get_transform``
    :return: JSON text with sorted keys and 4-space indent; ``"{}"`` when no
        transform is found
    """
    found = get_transform(name)
    if found:
        description = found.describe()
    else:
        description = {}
    return json.dumps(description, sort_keys=True, indent=4, separators=(',', ': '))
@@ -0,0 +1,72 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from awsglue.transforms import DropFields, GlueTransform
14
+
15
class ApplyMapping(GlueTransform):
    """Transform that applies a declarative field mapping to a DynamicFrame."""

    def __call__(self, frame, mappings, case_sensitive = False,
                 transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """Delegate to ``frame.apply_mapping`` with the given arguments.

        :param frame: DynamicFrame to transform
        :param mappings: list of mapping tuples
            (source col, source type, target col, target type)
        :param case_sensitive: forwarded to ``apply_mapping``
        :param transformation_ctx: string identifying stats / state information
        :param info: string associated with errors in the transformation
        :param stageThreshold: max errors in this transformation before failing
        :param totalThreshold: max errors in total before failing
        :return: whatever ``frame.apply_mapping`` returns
        """
        return frame.apply_mapping(mappings, case_sensitive, transformation_ctx,
                                   info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        arg1 = {"name": "frame",
                "type": "DynamicFrame",
                "description": "DynamicFrame to transform",
                "optional": False,
                "defaultValue": None}
        # mappings is a list of tuples, not a DynamicFrame.
        arg2 = {"name": "mappings",
                "type": "List",
                "description": "List of mapping tuples (source col, source type, target col, target type)",
                "optional": False,
                "defaultValue": None}
        arg3 = {"name": "case_sensitive",
                "type": "Boolean",
                "description": "Whether to treat source columns as case sensitive",
                "optional": True,
                "defaultValue": "False"}
        arg4 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}
        arg5 = {"name": "info",
                "type": "String",
                "description": "Any string to be associated with errors in the transformation",
                "optional": True,
                "defaultValue": "\"\""}
        arg6 = {"name": "stageThreshold",
                "type": "Integer",
                "description": "Max number of errors in the transformation until processing will error out",
                "optional": True,
                "defaultValue": "0"}
        arg7 = {"name": "totalThreshold",
                "type": "Integer",
                "description": "Max number of errors total until processing will error out.",
                "optional": True,
                "defaultValue": "0"}

        return [arg1, arg2, arg3, arg4, arg5, arg6, arg7]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Apply a declarative mapping to this DynamicFrame."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        return {"type": "DynamicFrame",
                "description": "DynamicFrame after applying mappings."}
@@ -0,0 +1,66 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from awsglue.transforms import GlueTransform
14
+
15
class Coalesce(GlueTransform):
    """Transform that coalesces a DynamicFrame into a given number of partitions."""

    def __call__(self, frame, num_partitions, shuffle = False, transformation_ctx = "", info = "",
                 stageThreshold = 0, totalThreshold = 0):
        """Delegate to ``frame.coalesce`` with the given arguments.

        :param frame: DynamicFrame to coalesce
        :param num_partitions: number of partitions
        :param shuffle: whether shuffling is enabled for the coalesce process
        :param transformation_ctx: string identifying stats / state information
        :param info: string associated with errors in the transformation
        :param stageThreshold: max errors in this transformation before failing
        :param totalThreshold: max errors in total before failing
        :return: whatever ``frame.coalesce`` returns
        """
        return frame.coalesce(num_partitions, shuffle, transformation_ctx, info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        # The required first argument of __call__ was missing from the
        # description; list it so the metadata matches the signature.
        arg_frame = {"name": "frame",
                     "type": "DynamicFrame",
                     "description": "The DynamicFrame to coalesce",
                     "optional": False,
                     "defaultValue": None}
        # num_partitions is a partition count, not a DynamicFrame.
        arg1 = {"name": "num_partitions",
                "type": "Integer",
                "description": "Number of partitions",
                "optional": False,
                "defaultValue": None}
        arg2 = {"name": "shuffle",
                "type": "Boolean",
                "description": "A boolean indicating whether shuffling enabled for the coalesce process",
                "optional": True,
                "defaultValue": False}
        arg3 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}
        arg4 = {"name": "info",
                "type": "String",
                "description": "Any string to be associated with errors in the transformation",
                "optional": True,
                "defaultValue": "\"\""}
        arg5 = {"name": "stageThreshold",
                "type": "Integer",
                "description": "Max number of errors in the transformation until processing will error out",
                "optional": True,
                "defaultValue": "0"}
        arg6 = {"name": "totalThreshold",
                "type": "Integer",
                "description": "Max number of errors total until processing will error out.",
                "optional": True,
                "defaultValue": "0"}

        return [arg_frame, arg1, arg2, arg3, arg4, arg5, arg6]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Coalesces a DynamicFrame."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        return {"type": "DynamicFrame",
                "description": "The coalesced DynamicFrame."}
@@ -0,0 +1,155 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from awsglue.transforms import GlueTransform
14
+
15
class SelectFromCollection(GlueTransform):
    """Transform that picks one DynamicFrame out of a DynamicFrameCollection."""

    def __call__(self, dfc, key, transformation_ctx = ""):
        """Return ``dfc.select(key, transformation_ctx)``."""
        return dfc.select(key, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        return [
            {"name": "dfc",
             "type": "DynamicFrameCollection",
             "description": "select one DynamicFrame from this DynamicFrameCollection",
             "optional": False,
             "defaultValue": None},
            {"name": "key",
             "type": "String",
             "description": "The key to select",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Select one DynamicFrame out from the DynamicFrameCollection"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        result = {"type": "DynamicFrame",
                  "description": "Dynamic Frame corresponding to name"}
        return result
54
+
55
class MapToCollection(GlueTransform):
    """Transform that applies a callable to a DynamicFrameCollection."""

    def __call__(self, dfc, callable, transformation_ctx = ""):
        """Return ``dfc.map(callable, transformation_ctx)``.

        :param dfc: DynamicFrameCollection to map over
        :param callable: callable handed to ``dfc.map``
        :param transformation_ctx: string identifying stats / state information
        """
        return dfc.map(callable, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        # The type name was spelled "CollectionDynamicFrame"; every other
        # description in this module uses "DynamicFrameCollection".
        arg1 = {"name": "dfc",
                "type": "DynamicFrameCollection",
                "description": "apply function on this DynamicFrameCollection",
                "optional": False,
                "defaultValue": None}

        arg2 = {"name": "callable",
                "type": "Callable",
                "description": "apply this Callable on DynamicFrameCollection",
                "optional": False,
                "defaultValue": None}

        arg3 = {"name": "transformation_ctx",
                "type": "String",
                "description": "A unique string that is used to identify stats / state information",
                "optional": True,
                "defaultValue": ""}

        return [arg1, arg2, arg3]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Apply a transform on each DynamicFrame of this DynamicFrameCollection"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        return {"type": "DynamicFrameCollection",
                "description": "A new DynamicFrameCollection after apply transform on each element"}
94
+
95
+
96
class FlatMap(GlueTransform):
    """Transform that applies another transform to each frame of a collection."""

    def __call__(self, dfc, BaseTransform, frame_name, transformation_ctx = "", **base_kwargs):
        """Return ``dfc.flatmap`` over a wrapper around ``BaseTransform.apply``.

        For each frame the wrapper calls ``BaseTransform.apply`` with
        ``base_kwargs`` plus the frame under the keyword ``frame_name`` and the
        per-frame ``transformation_ctx``.
        """
        def apply_inner(frame, transformation_ctx):
            # Per-call keyword set: base kwargs first, then the frame and the
            # context, which override same-named base kwargs.
            call_kwargs = dict(base_kwargs)
            call_kwargs[frame_name] = frame
            call_kwargs["transformation_ctx"] = transformation_ctx
            return BaseTransform.apply(**call_kwargs)

        return dfc.flatmap(apply_inner, transformation_ctx)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        return [
            {"name": "dfc",
             "type": "DynamicFrameCollection",
             "description": "The collection over which to flatmap.",
             "optional": False,
             "defaultValue": None},
            {"name": "BaseTransform",
             "type": "GlueTransform",
             "description": "A GlueTransform to apply to each member of the collection.",
             "optional": False,
             "defaultValue": None},
            {"name": "frame_name",
             "type": "String",
             "description": "The argument name to which to pass the elements of the collection.",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "base_kwargs",
             "type": "dict",
             "description": "Arguments to pass to the base transform.",
             "optional": False,
             "defaultValue": None},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Applies a transform to each DynamicFrame in a collection and flattens the results."

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        result = {"type": "DynamicFrameCollection",
                  "description": "A new DynamicFrameCollection after applying the transform on each element"}
        return result
@@ -0,0 +1,85 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from __future__ import print_function
14
+ from awsglue.transforms import DropFields, GlueTransform
15
+ from awsglue.gluetypes import ArrayType, NullType, StructType
16
+
17
class DropNullFields(GlueTransform):
    """Transform that drops every field whose schema type is NullType.

    Note: dropFields currently does not work through maps, so neither does
    DropNullFields.
    """

    def _find_null_fields(self, ctx, schema, path, output):
        """Append the path of every NullType field under ``schema`` to ``output``.

        :param ctx: context whose ``_jvm.RecordUtils.quoteName`` quotes field names
        :param schema: schema node being visited
        :param path: dotted path of ``schema``; ``""`` at the root
        :param output: list that receives the paths (mutated in place)
        """
        if isinstance(schema, StructType):
            # Children of a non-root struct are addressed as "<path>.<name>".
            prefix = path + "." if path != "" else path
            for field in schema:
                child_path = prefix + ctx._jvm.RecordUtils.quoteName(field.name)
                self._find_null_fields(ctx, field.dataType, child_path, output)

        elif isinstance(schema, ArrayType):
            # For the moment we only remove null fields in nested array
            # columns. We don't change ArrayType(NullType).
            element_type = schema.elementType
            if isinstance(element_type, StructType):
                self._find_null_fields(ctx, element_type, path, output)

        elif isinstance(schema, NullType):
            output.append(path)

    def __call__(self, frame, transformation_ctx = "", info = "", stageThreshold = 0, totalThreshold = 0):
        """Collect the null-typed field paths of ``frame`` and drop them.

        The collected paths are printed, then passed to ``DropFields.apply``.
        """
        null_fields = []
        self._find_null_fields(frame.glue_ctx, frame.schema(), "", null_fields)
        print("null_fields", null_fields)

        return DropFields.apply(frame, null_fields, transformation_ctx,
                                info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        return [
            {"name": "frame",
             "type": "DynamicFrame",
             "description": "Drop all null fields in this DynamicFrame",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "info",
             "type": "String",
             "description": "Any string to be associated with errors in the transformation",
             "optional": True,
             "defaultValue": "\"\""},
            {"name": "stageThreshold",
             "type": "Integer",
             "description": "Max number of errors in the transformation until processing will error out",
             "optional": True,
             "defaultValue": "0"},
            {"name": "totalThreshold",
             "type": "Integer",
             "description": "Max number of errors total until processing will error out.",
             "optional": True,
             "defaultValue": "0"},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Drop all null fields in this DynamicFrame"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        result = {"type": "DynamicFrame",
                  "description": "DynamicFrame without null fields."}
        return result
@@ -0,0 +1,66 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from awsglue.transforms import GlueTransform
14
+
15
+
16
class Filter(GlueTransform):
    """Transform that keeps the records of a DynamicFrame matching a predicate."""

    def __call__(self, frame, f, transformation_ctx = "", info="", stageThreshold=0, totalThreshold=0):
        """Return ``frame.filter`` called with the predicate and bookkeeping arguments."""
        return frame.filter(f, transformation_ctx, info, stageThreshold, totalThreshold)

    @classmethod
    def describeArgs(cls):
        """Return one metadata dict per ``__call__`` argument, in call order."""
        return [
            {"name": "frame",
             "type": "DynamicFrame",
             "description": "The DynamicFrame to apply the Filter function",
             "optional": False,
             "defaultValue": None},
            {"name": "f",
             "type": "Function",
             "description": "Predicate function to call on the DynamicFrame. The function takes DynamicRecord as the argument and returns True/False",
             "optional": False,
             "defaultValue": None},
            {"name": "transformation_ctx",
             "type": "String",
             "description": "A unique string that is used to identify stats / state information",
             "optional": True,
             "defaultValue": ""},
            {"name": "info",
             "type": "String",
             "description": "Any string to be associated with errors in the transformation",
             "optional": True,
             "defaultValue": "\"\""},
            {"name": "stageThreshold",
             "type": "Integer",
             "description": "Max number of errors in the transformation until processing will error out",
             "optional": True,
             "defaultValue": "0"},
            {"name": "totalThreshold",
             "type": "Integer",
             "description": "Max number of errors total until processing will error out.",
             "optional": True,
             "defaultValue": "0"},
        ]

    @classmethod
    def describeTransform(cls):
        """Return a one-line description of the transform."""
        return "Builds a new DynamicFrame by selecting records from the input frame that satisfy the predicate function"

    @classmethod
    def describeErrors(cls):
        """Return the list of described errors (none for this transform)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return metadata describing the transform's result."""
        result = {"type": "DynamicFrame",
                  "description": "new DynamicFrame with DynamicRecords that matched the predicate"}
        return result