AWSGlueDataplanePython 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. awsglue/README.md +37 -0
  2. awsglue/__init__.py +15 -0
  3. awsglue/context.py +690 -0
  4. awsglue/data_sink.py +49 -0
  5. awsglue/data_source.py +49 -0
  6. awsglue/dataframe_transforms/__init__.py +17 -0
  7. awsglue/dataframe_transforms/apply_mapping.py +76 -0
  8. awsglue/dataframereader.py +41 -0
  9. awsglue/dataframewriter.py +21 -0
  10. awsglue/devutils.py +236 -0
  11. awsglue/dynamicframe.py +669 -0
  12. awsglue/functions.py +31 -0
  13. awsglue/glue_shell.py +38 -0
  14. awsglue/gluetypes.py +461 -0
  15. awsglue/job.py +59 -0
  16. awsglue/scripts/__init__.py +12 -0
  17. awsglue/scripts/activate_etl_connector.py +362 -0
  18. awsglue/scripts/connector_activation_util.py +38 -0
  19. awsglue/scripts/crawler_redo_from_backup.py +75 -0
  20. awsglue/scripts/crawler_undo.py +121 -0
  21. awsglue/scripts/scripts_utils.py +106 -0
  22. awsglue/streaming_data_source.py +28 -0
  23. awsglue/transforms/__init__.py +47 -0
  24. awsglue/transforms/apply_mapping.py +72 -0
  25. awsglue/transforms/coalesce.py +66 -0
  26. awsglue/transforms/collection_transforms.py +155 -0
  27. awsglue/transforms/drop_nulls.py +85 -0
  28. awsglue/transforms/dynamicframe_filter.py +66 -0
  29. awsglue/transforms/dynamicframe_map.py +72 -0
  30. awsglue/transforms/errors_as_dynamicframe.py +45 -0
  31. awsglue/transforms/field_transforms.py +469 -0
  32. awsglue/transforms/relationalize.py +105 -0
  33. awsglue/transforms/repartition.py +61 -0
  34. awsglue/transforms/resolve_choice.py +85 -0
  35. awsglue/transforms/transform.py +92 -0
  36. awsglue/transforms/unbox.py +112 -0
  37. awsglue/transforms/union.py +66 -0
  38. awsglue/transforms/unnest_frame.py +75 -0
  39. awsglue/utils.py +159 -0
  40. awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
  41. awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
  42. awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
  43. awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
  44. awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
  45. awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,362 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+ """
13
+ This script is supposed to be invoked by the tape-run.sh or PrepareLaunch class within the Tape container. It iterates
14
+ the connections supplied to extract the ECR URL. Using the URL, the docker image will be downloaded in a per-layer
15
+ fashion and unpacked onto the container file system. Finally, the paths to the connector jars are written out to an
16
+ output file. Reference: https://rmannibucau.metawerx.net/post/docker-extracts-fileystem-with-bash
17
+ """
18
+
19
+ import argparse
20
+ import gzip
21
+ import logging
22
+ import os
23
+ import random
24
+ import re
25
+ import shutil
26
+ import string
27
+ import subprocess
28
+ import sys
29
+ from typing import Any, Dict, List, Optional, Tuple, Union
30
+ from urllib.parse import urlparse
31
+ from os import path
32
+
33
+ import boto3
34
+ import requests
35
+ from botocore.config import Config
36
+ from botocore.exceptions import ClientError, NoCredentialsError
37
+ from .connector_activation_util import boto_client_error
38
+
39
# Sub-directories (relative to a per-connection prefix) holding the downloaded
# layer archives (gz) and their unpacked contents (tar).
LAYER_TAR_DIR = "layers/tar"
LAYER_GZ_DIR = "layers/gz"

# Glue connection types this script knows how to fetch jars for.
MARKETPLACE = "MARKETPLACE"
CUSTOM = "CUSTOM"

# Names of the proxy environment variables removed by main().
HTTP_PROXY = "HTTP_PROXY"
HTTPS_PROXY = "HTTPS_PROXY"
NO_PROXY = "NO_PROXY"

# Host name of a private ECR registry: <12-digit account id>.dkr.ecr.<region>.amazonaws.com
# Group 1 captures the account id. The region part accepts every region shape
# (us-east-1, ap-southeast-2, eu-central-1, us-gov-west-1, ...) rather than only
# the two-letter/four-letter/one-digit form, and the China partition suffix ".cn".
# The pattern stays fully anchored so no other domain can match.
ECR_HOST_PATTERN = r"^([0-9]{12})\.dkr\.ecr\.[a-z]{2}(?:-[a-z]+)+-[0-9]+\.amazonaws\.com(?:\.cn)?$"

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
50
+
51
+
52
def add_stream_handler() -> None:
    """
    Attach a stdout handler to the module-level logger.

    Records at INFO level and above are written to sys.stdout, tagged with "Glue ETL Marketplace". According to the
    original author's note, this makes the logs visible in both the customer's log stream and the service's docker
    log stream.
    """
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - Glue ETL Marketplace - %(message)s")
    )
    logger.addHandler(handler)
62
+
63
+
64
def run_commands(commands: List[str]) -> Tuple[bytes, bytes]:
    """
    Run an external command (argv list, no shell) and return what it printed.

    :param commands: the program and its arguments, e.g. ["tar", "-xf", "layer"].
    :return: a (stdout, stderr) tuple of raw bytes.

    The exit status is not turned into an exception; a non-zero status is logged as a warning so that a failed
    command is visible in the logs.
    """
    process = subprocess.Popen(commands,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    # errors="replace": command output is not guaranteed to be valid UTF-8, and a logging statement must never
    # raise UnicodeDecodeError.
    logger.info(f"run_commands output - \"{' '.join(commands)}\"\n"
                f"stdout: {stdout.decode(errors='replace')}\n"
                f"stderr: {stderr.decode(errors='replace')}")
    if process.returncode != 0:
        logger.warning(f"run_commands - \"{' '.join(commands)}\" exited with status {process.returncode}")
    return stdout, stderr
76
+
77
+
78
def send_get_request(url: str, header: Dict[str, str], timeout: float = 60) -> requests.Response:
    """
    Send an HTTP GET request and return the response.

    :param url: the URL to fetch.
    :param header: HTTP headers to send; only the header names are logged, never the values.
    :param timeout: seconds to wait for the connection and for each read from the server before giving up. Without
        it, requests waits indefinitely on an unresponsive server.
    :return: the response object.
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status.
    """
    logger.debug(f"Sending GET request to {url} with {header.keys()} specified in header.")
    response = requests.get(url, headers=header, timeout=timeout)
    response.raise_for_status()
    return response
83
+
84
+
85
def parse_url(url: str) -> Tuple[str, str]:
    """
    Split a URL into its network location and its path.

    '#' is not treated as a fragment delimiter, and leading/trailing slashes are removed from the path.
    """
    parts = urlparse(url, allow_fragments=False)
    host = parts.netloc
    trimmed_path = parts.path.strip("/")
    return host, trimmed_path
88
+
89
+
90
def extract_ecr_region(ecr_root: str) -> Union[None, str]:
    """
    Extract AWS Region of the ECR registry from its root address
    e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com

    :param ecr_root: host name of the registry.
    :return: the region name, or None when no region can be found in the address.
    """
    # Read the region straight from a well-formed registry host name. This does not depend on the region list
    # bundled with the installed boto3, so regions that list does not know about are still recognised.
    host_match = re.search(r"\.dkr\.ecr\.([a-z0-9-]+)\.amazonaws\.com", ecr_root)
    if host_match:
        return host_match.group(1)

    # Fallback for addresses that do not have the usual shape: look for any region name known to boto3.
    session = boto3.session.Session()
    for region in session.get_available_regions("ecr"):
        if region in ecr_root:
            return region
    return None
100
+
101
+
102
def extract_registry_id(ecr_root: str) -> str:
    """
    Return the AWS account id that prefixes an ECR registry address
    e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com

    Raises ValueError when the address does not match ECR_HOST_PATTERN.
    """
    found = re.match(ECR_HOST_PATTERN, ecr_root)
    if found is None:
        raise ValueError(f"Invalid ECR url supplied, couldn't find aws account from {ecr_root}.")
    return found.group(1)
112
+
113
+
114
@boto_client_error(logger)
def get_ecr_authorization_token(ecr_root: str) -> str:
    """
    Fetch the authorization token later used for the ECR HTTP API calls.

    The ECR client is created for the region parsed from the registry address; the original author notes that a
    token obtained for the wrong region makes ECR answer with HTTP 400, even though this is not clearly documented.
    """
    region = extract_ecr_region(ecr_root)
    registry_id = extract_registry_id(ecr_root)
    ecr_client = boto3.client(service_name="ecr", region_name=region)
    logger.info(f"Requesting ECR authorization token for registryIds={registry_id} and region_name={region}.")
    auth_entries = ecr_client.get_authorization_token(registryIds=[registry_id])["authorizationData"]
    # Exactly one registry id was requested, so the first entry is the one we need.
    return auth_entries[0]["authorizationToken"]
126
+
127
+
128
def parse_ecr_url(ecr_url: str) -> Tuple[str, str, str]:
    """
    Break an ECR image URL into (registry host, image name, tag).
    E.g. https://xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/salesforce:7.2.0-latest

    Raises ValueError when the host is not an ECR registry or when the path is not "<image>:<tag>".
    """
    ecr_root, repo = parse_url(ecr_url)

    registry_ok = re.match(ECR_HOST_PATTERN, ecr_root) is not None
    if not registry_ok:
        raise ValueError("malformed registry, correct pattern is https://aws_account_id.dkr.ecr.region.amazonaws.com")

    if re.match("^[^:]+:[^:]+$", repo) is None:
        raise ValueError("malformed image name, only one colon allowed to delimit image name and tag")

    # The check above guarantees exactly one colon, so partition yields name and tag.
    image_name, _, tag = repo.partition(":")
    return ecr_root, image_name, tag
140
+
141
+
142
def get_docker_manifest(ecr_url: str, header: Dict[str, str]) -> Dict[str, Any]:
    """
    Fetch the manifest of the given ECR image through the registry HTTP API and return it as parsed JSON.

    The caller reads the "layers" entries of the manifest to obtain the digest of each layer to download.
    """
    ecr_root, image_name, tag = parse_ecr_url(ecr_url)
    logger.info(f"Calling ECR HTTP API to get manifest of {ecr_url}.")
    response = send_get_request(f"https://{ecr_root}/v2/{image_name}/manifests/{tag}", header)
    return response.json()
152
+
153
+
154
def download_and_unpack_docker_layer(ecr_url: str, digest: str, dir_prefix: str, header: Dict[str, str]) -> None:
    """
    Download one image layer from ECR and unpack its file system below {dir_prefix}/layers/tar/.

    Per the original author's notes: neither the docker cli nor the docker daemon is available in the Glue Python
    Shell runtime, so the layers that make up the image are fetched one by one over HTTP. A layer is a gzip
    compressed tar archive, hence the three steps: store the blob as .gz, gunzip it to a tar file, then extract the
    tar file. Extraction uses the 'tar' command line tool because the tarfile library ran into permission issues.
    """
    logger.info(f"Download/unpacking {digest} layer of image: {ecr_url}.")
    # A digest looks like "<algorithm>:<hex>"; the hex part names the local files.
    layer_id = digest.split(":")[1]
    logger.info(f"Preparing layer url and gz file path to store layer {layer_id}.")
    tar_dir = f"{dir_prefix}/{LAYER_TAR_DIR}"
    gz_file = f"{dir_prefix}/{LAYER_GZ_DIR}/{layer_id}.gz"
    tar_file = f"{tar_dir}/{layer_id}"
    ecr_root, image_name, _tag = parse_ecr_url(ecr_url)
    blob_url = f"https://{ecr_root}/v2/{image_name}/blobs/{digest}"

    logger.info(f"Getting the layer file {layer_id} and store it as gz.")
    blob = send_get_request(blob_url, header)
    with open(gz_file, "wb") as gz_out:
        gz_out.write(blob.content)

    logger.info(f"Unzipping the {layer_id} layer and store as tar file.")
    with gzip.open(gz_file, "rb") as compressed, open(tar_file, "wb") as archive:
        shutil.copyfileobj(compressed, archive)

    logger.info(f"Unarchiving {layer_id} layer as tar file.")
    run_commands(["tar", "-C", f"{tar_dir}/", "-xf", tar_file])
181
+
182
+
183
def _split_connection_names(raw: str) -> List[str]:
    """Split a comma separated list of names, trimming whitespace and dropping empty entries."""
    return [name.strip() for name in raw.split(",") if name.strip()]


def parse_args(args: List[str]) -> List[Any]:
    """
    Parse the command line of the connector activation script.

    :param args: the argument vector without the program name (i.e. sys.argv[1:]).
    :return: [connections, result_path, region, endpoint, proxy] where connections is a list of connection names
        and proxy is None when --proxy is not given.

    Connection names are trimmed and empty entries are dropped, so "a, b," yields ["a", "b"] instead of passing
    " b" and "" on as connection names.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--connections",
                            required=True,
                            type=_split_connection_names,
                            help="a list of connection names we'll use to download jars for")
    arg_parser.add_argument("--result_path",
                            required=True,
                            help="file path to store the jar downloading result")
    arg_parser.add_argument("--region",
                            required=True,
                            help="aws region of the connections supplied")
    arg_parser.add_argument("--endpoint",
                            required=True,
                            help="endpoint to use to talk with Glue service")
    arg_parser.add_argument("--proxy",
                            default=None,
                            help="proxy to talk to Glue backend in case of VPC job")
    parsed_args = arg_parser.parse_args(args)
    return [parsed_args.connections, parsed_args.result_path, parsed_args.region,
            parsed_args.endpoint, parsed_args.proxy]
204
+
205
+
206
def id_generator(size: int = 5, chars: str = string.ascii_uppercase + string.digits) -> str:
    """
    Build a random identifier of {size} characters, each drawn from {chars}
    (by default "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789").
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return "".join(picked)
211
+
212
+
213
def get_connection(region: str, endpoint: str, conn: str, proxy: Optional[str] = None) -> Union[Dict, None]:
    """
    Get catalog connection metadata by calling Boto3 get_connection API, supports custom supplied region and endpoint.

    :param region: AWS region used for the Glue client.
    :param endpoint: endpoint URL used for the Glue client.
    :param conn: name of the connection to look up.
    :param proxy: optional HTTPS proxy used only by this Glue client.
    :return: the GetConnection response, or None when the call fails with ClientError or NoCredentialsError (the
        failure is logged with its traceback and jar downloading is skipped for this connection).
    """
    # Hand the proxy to the Config constructor rather than assigning the attribute afterwards: the constructor is
    # the supported way to set options and needs no type-checker suppression.
    config = Config(proxies={'https': proxy}) if proxy else Config()
    glue = boto3.Session().client(
        service_name="glue",
        region_name=region,
        endpoint_url=endpoint,
        config=config
    )
    logger.info(f"using region: {region}, proxy: {proxy} and glue endpoint: {endpoint} to get connection: {conn}")
    try:
        return glue.get_connection(Name=conn)
    except ClientError:
        logger.exception(f"Failed to get connection detail for {conn}, skip jar downloading for it")
    except NoCredentialsError:
        logger.exception(f"Unable to get credential to call GetConnection for {conn}, skip jar downloading for it."
                         f" Check if the IAM role has the right permission or if you need to increase IMDS retry.")
    return None
235
+
236
+
237
def collect_files_by_suffix(input_dir: str, suffix: str) -> List[str]:
    """
    Walk {input_dir} recursively and return the absolute path of every file whose name ends with {suffix}.
    """
    return [
        os.path.abspath(os.path.join(folder, name))
        for folder, _, names in os.walk(input_dir)
        for name in names
        if name.endswith(suffix)
    ]
251
+
252
+
253
@boto_client_error(logger, "Failed to download jars for custom connection from S3...")
def download_custom_jars(conn: Dict[str, Any], dest_folder: str = "/tmp/custom_connection_jars") -> List[str]:
    """
    Download the connector jars of a CUSTOM connection from S3.

    :param conn: GetConnection response; its CONNECTOR_URL property is a comma separated list of S3 URLs.
    :param dest_folder: local directory (created when missing) that receives the jars.
    :return: local paths of the downloaded jars, each named "etl-<original file name>".

    Entries that are not "s3://...jar" URLs are reported with an error log and skipped.
    """
    os.makedirs(dest_folder, exist_ok=True)
    s3_urls: List[str] = conn["Connection"]["ConnectionProperties"]["CONNECTOR_URL"].split(",")
    s3 = boto3.client("s3")
    res = []

    for raw_url in s3_urls:
        url = raw_url.strip()
        if url.startswith("s3://") and url.endswith(".jar"):
            bucket, key = parse_url(url)
            file_path = f"{dest_folder}/etl-{key.split('/')[-1]}"
            s3.download_file(bucket, key, file_path)
            res.append(file_path)
        else:
            logger.error("custom connection can only have S3 urls end with '.jar' as connector url.")
    # Log the connection name only: the full GetConnection response carries every connection property and must
    # not end up in the logs.
    conn_name = conn["Connection"].get("Name")
    logger.info(f"collected jar paths: {res} for connection: {conn_name}.")
    return res
270
+
271
+
272
def _write_oem_config(oem_key_path: str, driver_name: str, conf_path: str = "/tmp/glue-marketplace.conf") -> None:
    """Append the OEM key/value pair read from {oem_key_path} to {conf_path}, keyed by the connector class name."""
    with open(oem_key_path, 'r') as oem_file:
        # The first line is the key and the second the value. readline() keeps the trailing newline of each line,
        # and that newline is what separates the two settings in the text rendered below.
        oem_key = oem_file.readline()
        oem_value = oem_file.readline()
    # The continuation lines of this literal are kept flush-left, as in the text available for review; the
    # whitespace inside the literal is written to the configuration file as is.
    output = """marketplace_oem = {
%s = {
oem_key = %s oem_value = %s
}
}\n""" % (driver_name, oem_key, oem_value)
    with open(conf_path, 'a') as opened_file:
        opened_file.write(output)
    logger.info(f"OEM information is written.")


def download_jars_per_connection(conn: str, region: str, endpoint: str, proxy: Optional[str] = None) -> List[str]:
    """
    Download the connector jars for one Glue connection and return their local paths.

    :param conn: name of the connection.
    :param region: AWS region used to look the connection up.
    :param endpoint: Glue endpoint used to look the connection up.
    :param proxy: optional proxy for the Glue client.
    :return: paths of the jars; an empty list when the connection cannot be fetched or is neither a CUSTOM nor a
        MARKETPLACE connection.

    CUSTOM connections are served from S3. For MARKETPLACE connections the docker image referenced by
    CONNECTOR_URL is downloaded layer by layer from ECR into a randomly named working directory, the jars below
    its "jars" directory are collected and, when the image ships an "oem/oem.txt" file, the OEM information is
    appended to /tmp/glue-marketplace.conf.
    """
    # validate connection type
    connection = get_connection(region, endpoint, conn, proxy)
    if connection is None:
        return []

    connection_type = connection["Connection"]["ConnectionType"]
    # download jars from S3 in case of custom connection
    if connection_type == CUSTOM:
        logger.info(f"Connection {conn} is a Custom connection, try to download jars for it from S3.")
        return download_custom_jars(connection)
    # return empty list in case of non-marketplace connection
    if connection_type != MARKETPLACE:
        logger.warning(f"Connection {conn} is not a Marketplace connection, skip jar downloading for it")
        return []

    properties = connection["Connection"]["ConnectionProperties"]
    # The class name is optional; it is only needed when OEM information has to be written.
    driver_name = properties.get("CONNECTOR_CLASS_NAME")

    # get the connection ecr url
    ecr_url = properties["CONNECTOR_URL"]
    ecr_root, _, _ = parse_ecr_url(ecr_url)

    # download the jars
    token = get_ecr_authorization_token(ecr_root)
    http_header = {"Authorization": f"Basic {token}"}

    manifest = get_docker_manifest(ecr_url, http_header)

    # make directory for the jars of the given connection
    dir_prefix = id_generator()
    os.makedirs(f"{dir_prefix}/{LAYER_TAR_DIR}", exist_ok=True)
    os.makedirs(f"{dir_prefix}/{LAYER_GZ_DIR}", exist_ok=True)

    for layer in manifest["layers"]:
        download_and_unpack_docker_layer(ecr_url, layer["digest"], dir_prefix, http_header)

    # return the jar paths
    res = collect_files_by_suffix(f"{dir_prefix}/{LAYER_TAR_DIR}/jars", ".jar")
    logger.info(f"Container paths are: {res}")

    # Write OEM key to /tmp/glue-marketplace.conf
    oem_key_path = f"{dir_prefix}/{LAYER_TAR_DIR}/oem/oem.txt"
    if path.exists(oem_key_path):
        if driver_name is None:
            # Previously this case failed with an unbound local variable; the jars are still usable, so only the
            # OEM step is skipped.
            logger.warning(f"Connection {conn} has no CONNECTOR_CLASS_NAME, skip writing OEM information for it")
        else:
            _write_oem_config(oem_key_path, driver_name)

    if not res:
        logger.warning(f"found no connector jars from {ecr_url} provided by {conn}, please contact AWS support of"
                       f" the Connector product owner to debug the issue.")
    else:
        logger.info(f"collected jar paths: {res} for connection: {conn}")
    return res
333
+
334
+
335
def main():
    """
    Entry point: download the connector jars of every connection given on the command line and write their paths,
    comma separated, to the result file.
    """
    # In the VPC case the proxy is configured directly on the Glue client, so the proxy environment variables are
    # removed here to keep clients for other AWS services from going through Glue's proxy. The removal is local to
    # this process and does not affect later aws cli usage.
    for proxy_var in (HTTP_PROXY, HTTPS_PROXY, NO_PROXY):
        os.environ.pop(proxy_var, None)

    connections, result_path, region, endpoint, proxy = parse_args(sys.argv[1:])
    add_stream_handler()

    jar_paths = []
    for conn in connections:
        logger.info(f"Start downloading connector jars for connection: {conn}")
        jar_paths.extend(download_jars_per_connection(conn, region, endpoint, proxy))

    # join the jar paths into one string and write it out to result_path
    with open(result_path, "w") as out_file:
        out_file.write(",".join(jar_paths))

    logger.info(f"successfully wrote jar paths to \"{result_path}\"")


if __name__ == "__main__":
    main()
@@ -0,0 +1,38 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from logging import Logger
14
+
15
+ from botocore.exceptions import ClientError, NoCredentialsError
16
+
17
+
18
def boto_client_error(logger: Logger, message: str = ""):
    """
    Build a decorator that logs boto3 failures of the wrapped function and re-raises them.

    :param logger: logger that receives the error records.
    :param message: optional hint appended to the log record of a ClientError other than 'InternalError'.
    :return: a decorator; the decorated function keeps its name and docstring.

    ClientError and NoCredentialsError are logged and then re-raised unchanged; any other exception passes through
    untouched.
    """
    import functools  # local import so this block does not depend on a change to the file's import section

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except ClientError as error:
                if error.response['Error']['Code'] == 'InternalError':  # Generic error
                    # We grab the message, request ID, and HTTP code to give to customer support
                    logger.error('Error Message: {}'.format(error.response['Error']['Message']))
                    logger.error('Request ID: {}'.format(error.response['ResponseMetadata']['RequestId']))
                    logger.error('Http code: {}'.format(error.response['ResponseMetadata']['HTTPStatusCode']))
                else:
                    # Separate function name, error and hint so the record stays readable.
                    hint = f" {message}" if message else ""
                    logger.error(f"boto3 clientError raised in function {func.__name__}: {repr(error)}{hint}")
                raise
            except NoCredentialsError as error:
                logger.error(f"boto3 NoCredentialsError raised in function {func.__name__}: {repr(error)}. "
                             f"Check if the IAM role has the right permission or if you need to increase IMDS retry.")
                raise

        return wrapper
    return decorator
@@ -0,0 +1,75 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from __future__ import print_function
14
+
15
+ import sys
16
+ import argparse
17
+ from awsglue.context import GlueContext
18
+ from pyspark.context import SparkContext
19
+ from awsglue.dynamicframe import DynamicFrame
20
+ from awsglue.transforms import get_transform
21
+ from pyspark.sql.types import *
22
+ from .scripts_utils import *
23
+
24
def crawler_redo_from_backup(glue_context, **options):
    """
    Restore catalog entities from a backup stored in S3.

    :param glue_context: the GlueContext used for reading the backup and writing to the catalog.
    :param options: must contain 's3.backup_location'; the whole mapping is passed on to write_df_to_catalog.
    """
    backup_location = options['s3.backup_location']

    # Read from s3
    data = read_from_s3(glue_context, backup_location)

    # Write to Catalog
    for entity_type in ['table', 'tableToDelete', 'partition', 'partitionToDelete']:
        write_df_to_catalog(data[entity_type], entity_type, glue_context, options)
34
+
35
def crawler_redo_from_backup_options(args):
    """
    Turn command line arguments into the option mapping expected by crawler_redo_from_backup.

    Unknown arguments are ignored. When no database name is given, the crawler is looked up through the Glue API
    and its target database is used.
    """
    parser = argparse.ArgumentParser(description='This script allows you to restore a namespace to a specific backup.')
    parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to restore.')
    parser.add_argument('-b', '--backup-location', required=True, help='Location of the backup to use.')
    parser.add_argument('-d', '--database-name', required=False, help='Database to back up. If not specified, '
                                                                      'the database target of the crawler is used instead.')
    parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional service endpoint region.')

    options, _ignored = parser.parse_known_args(args)

    database_name = options.database_name
    if database_name is None:
        # boto3 is imported only on the path that actually needs it.
        import boto3
        endpoint_url = "https://%s.%s.amazonaws.com" % (DEFAULT_GLUE_ENDPOINT, options.region)
        glue = boto3.client('glue', endpoint_url=endpoint_url)
        database_name = glue.get_crawler(Name=options.crawler_name)['Crawler']['DatabaseName']

    return {
        "catalog.name": DEFAULT_CATALOG_ENDPOINT,
        "catalog.region": options.region,
        "catalog.database": database_name,
        "crawler.name": options.crawler_name,
        "s3.backup_location": options.backup_location,
    }
63
+
64
def main():
    """Create the Spark/Glue contexts and run the restore with the options parsed from the command line."""
    # spark env
    glue_context = GlueContext(SparkContext())

    options = crawler_redo_from_backup_options(sys.argv[1:])
    crawler_redo_from_backup(glue_context, **options)


if __name__ == '__main__':
    main()
@@ -0,0 +1,121 @@
1
+ # Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # Licensed under the Amazon Software License (the "License"). You may not use
3
+ # this file except in compliance with the License. A copy of the License is
4
+ # located at
5
+ #
6
+ # http://aws.amazon.com/asl/
7
+ #
8
+ # or in the "license" file accompanying this file. This file is distributed
9
+ # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
10
+ # or implied. See the License for the specific language governing
11
+ # permissions and limitations under the License.
12
+
13
+ from __future__ import print_function
14
+
15
+ import sys
16
+ import argparse
17
+ from awsglue.context import GlueContext
18
+ from pyspark.context import SparkContext
19
+
20
+ from awsglue.dynamicframe import DynamicFrame
21
+ from awsglue.transforms import get_transform
22
+ from pyspark.sql.types import *
23
+ from .scripts_utils import *
24
+ from pyspark.sql.functions import col
25
+
26
def crawler_backup(glue_context, data, options):
    """
    Narrow the catalog data down to one crawler and optionally back it up to S3.

    :param glue_context: the GlueContext passed on to write_backup.
    :param data: mapping with at least the 'table' and 'partition' frames. It is modified in place: afterwards both
        entries only contain rows belonging to tables updated by the crawler, and callers see that change.
    :param options: must contain 'crawler.name', 's3.backup_location' and 'catalog.database'. No backup is written
        when 's3.backup_location' is None.
    """
    crawler_name = options['crawler.name']
    backup_location = options['s3.backup_location']
    database_name = options['catalog.database']

    # Only get data for this crawler. The comparison is a Column expression rather than an interpolated SQL
    # string, so a crawler name containing a quote character cannot break or alter the filter.
    data['table'] = data['table'].filter(col("parameters.UPDATED_BY_CRAWLER") == crawler_name)
    data['partition'] = data['partition'].join(data['table'].withColumn('tableName', col('name')), 'tableName', 'leftsemi')

    if backup_location is not None:
        # Backup the contents of the catalog at an s3 location
        write_backup(data, database_name, backup_location, glue_context)
38
+
39
def crawler_undo(glue_context, **options):
    """
    Roll the catalog back to the state before a crawler changed it.

    :param glue_context: the GlueContext used for catalog reads and writes.
    :param options: must contain 'crawler.name', 'catalog.database', 's3.backup_location' and 'timestamp' (an
        integer, formatted with %d below); the mapping is also passed to read_from_catalog and write_df_to_catalog.

    Steps: read the catalog including table versions, back up the crawler's current data (see crawler_backup),
    then write back the latest table version at or before the timestamp, delete the crawler's tables that have no
    such version, and delete the selected partitions.
    """
    # Local import: the module only imports `col` from pyspark.sql.functions.
    from pyspark.sql import functions as F

    crawler_name = options['crawler.name']
    timestamp = options['timestamp']
    options["catalog.tableVersions"] = True

    data = read_from_catalog(glue_context, options)

    # Note: this also narrows data['table'] and data['partition'] to the crawler's tables.
    crawler_backup(glue_context, data, options)

    # Find all the table versions for this crawler
    crawler_tables = data['tableVersion'].select(
        col("table.updateTime").alias("updateTime"),
        col("table"),
        col('table.parameters.UPDATED_BY_CRAWLER'),
    ).filter(col("UPDATED_BY_CRAWLER") == crawler_name)

    # Find the latest previous version of tables for this crawler that were updated or deleted since the last timestamp.
    filtered = crawler_tables.filter("updateTime <= %d" % timestamp).withColumn("filtered_name", col("table.name"))
    # Name the aggregate explicitly instead of renaming Spark's auto-generated column name, whose exact text is
    # an implementation detail.
    update_times = filtered.groupBy("table.name").agg(F.max("table.updateTime").alias("time"))
    joined = filtered.join(update_times, (col("filtered_name") == col("name")) & (col("updateTime") == col("time")), 'inner')
    tables_to_write = joined.select(col("table.*"))

    # Find the tables that were created since the last timestamp
    names = crawler_tables.select(col("table.name")).distinct()
    present_before_timestamp = joined.select(col("table.name"))
    tables_to_delete = names.subtract(present_before_timestamp)

    # Find the partitions that were created since the last timestamp
    # NOTE(review): the filter keeps partitions with creationTime *before* the timestamp, which does not match the
    # comment above. Left unchanged because the unit of creationTime cannot be confirmed from this file - verify.
    partitions_to_delete = data['partition'].withColumn('name', col('tableName')).join(
        crawler_tables.withColumn('name', col('table.name')), 'name', 'leftsemi'
    ).filter("creationTime < %d" % timestamp)

    # Write to Catalog
    write_df_to_catalog(tables_to_write, "table", glue_context, options)
    write_df_to_catalog(tables_to_delete, "tableToDelete", glue_context, options)
    write_df_to_catalog(partitions_to_delete, "partitionToDelete", glue_context, options)
71
+
72
def crawler_undo_options(args):
    """
    Turn command line arguments into the option mapping expected by crawler_undo.

    :param args: the argument vector without the program name; unknown arguments are ignored.
    :return: mapping with the catalog name/region/database, crawler name, backup location and the rollback
        timestamp as an integer number of milliseconds since epoch.
    :raises ValueError: when no timestamp is given and the crawler has no recorded last crawl.

    The crawler is looked up through the Glue API only when the database name or the timestamp is missing.
    """
    # arguments
    parser = argparse.ArgumentParser(description='This script allows you to rollback the effects of a crawler.')
    parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to rollback.')
    parser.add_argument('-b', '--backup-location', required=False, help='Location of the backup to use. If not specified, no backup is used.')
    parser.add_argument('-d', '--database-name', required=False, help='Database to roll back. If not specified, '
                                                                      'the database target of the crawler is used instead.')
    parser.add_argument('-t', '--timestamp', required=False, help='Timestamp to rollback to, in milliseconds since epoch. If not specified, '
                                                                  'the start timestamp of the crawler is used instead.')
    parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional DataCatalog service endpoint region.')

    options, unknown = parser.parse_known_args(args)

    if not (options.database_name is not None and options.timestamp is not None):
        import boto3  # Import is done here to ensure script does not fail in case boto3 is not required.
        glue_endpoint = DEFAULT_GLUE_ENDPOINT
        glue = boto3.client('glue', endpoint_url="https://%s.%s.amazonaws.com" % (glue_endpoint, options.region))
        crawler = glue.get_crawler(Name=options.crawler_name)['Crawler']

    if options.database_name is not None:
        database_name = options.database_name
    else:
        database_name = crawler['DatabaseName']

    if options.timestamp is not None:
        timestamp = options.timestamp
    else:
        # boto3 returns the last crawl under 'LastCrawl'; the previously used key 'LastCrawlInfo' is still
        # accepted as a fallback.
        last_crawl = crawler.get('LastCrawl') or crawler.get('LastCrawlInfo')
        if not last_crawl or 'StartTime' not in last_crawl:
            raise ValueError("crawler %s has no recorded last crawl, please pass --timestamp explicitly"
                             % options.crawler_name)
        start_time = last_crawl['StartTime']
        # boto3 returns StartTime as a datetime, which int() cannot convert; turn it into milliseconds since
        # epoch. A plain number is passed through unchanged.
        if hasattr(start_time, 'timestamp'):
            timestamp = start_time.timestamp() * 1000
        else:
            timestamp = start_time

    return {
        "catalog.name": DEFAULT_CATALOG_ENDPOINT,
        "catalog.region": options.region,
        "catalog.database": database_name,
        "crawler.name" : options.crawler_name,
        "s3.backup_location" : options.backup_location,
        "timestamp": int(timestamp)
    }
109
+
110
def main():
    """Create the Spark/Glue contexts and run the rollback with the options parsed from the command line."""
    # spark env
    glue_context = GlueContext(SparkContext())

    options = crawler_undo_options(sys.argv[1:])
    crawler_undo(glue_context, **options)


if __name__ == '__main__':
    main()