AWSGlueDataplanePython 5.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- awsglue/README.md +37 -0
- awsglue/__init__.py +15 -0
- awsglue/context.py +690 -0
- awsglue/data_sink.py +49 -0
- awsglue/data_source.py +49 -0
- awsglue/dataframe_transforms/__init__.py +17 -0
- awsglue/dataframe_transforms/apply_mapping.py +76 -0
- awsglue/dataframereader.py +41 -0
- awsglue/dataframewriter.py +21 -0
- awsglue/devutils.py +236 -0
- awsglue/dynamicframe.py +669 -0
- awsglue/functions.py +31 -0
- awsglue/glue_shell.py +38 -0
- awsglue/gluetypes.py +461 -0
- awsglue/job.py +59 -0
- awsglue/scripts/__init__.py +12 -0
- awsglue/scripts/activate_etl_connector.py +362 -0
- awsglue/scripts/connector_activation_util.py +38 -0
- awsglue/scripts/crawler_redo_from_backup.py +75 -0
- awsglue/scripts/crawler_undo.py +121 -0
- awsglue/scripts/scripts_utils.py +106 -0
- awsglue/streaming_data_source.py +28 -0
- awsglue/transforms/__init__.py +47 -0
- awsglue/transforms/apply_mapping.py +72 -0
- awsglue/transforms/coalesce.py +66 -0
- awsglue/transforms/collection_transforms.py +155 -0
- awsglue/transforms/drop_nulls.py +85 -0
- awsglue/transforms/dynamicframe_filter.py +66 -0
- awsglue/transforms/dynamicframe_map.py +72 -0
- awsglue/transforms/errors_as_dynamicframe.py +45 -0
- awsglue/transforms/field_transforms.py +469 -0
- awsglue/transforms/relationalize.py +105 -0
- awsglue/transforms/repartition.py +61 -0
- awsglue/transforms/resolve_choice.py +85 -0
- awsglue/transforms/transform.py +92 -0
- awsglue/transforms/unbox.py +112 -0
- awsglue/transforms/union.py +66 -0
- awsglue/transforms/unnest_frame.py +75 -0
- awsglue/utils.py +159 -0
- awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
- awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
- awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
- awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
- awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
"""
|
|
13
|
+
This script is supposed to be invoked by the tape-run.sh or PrepareLaunch class within the Tape container. It iterates
|
|
14
|
+
the connections supplied to extract the ECR URL. Using the URL, the docker image will be downloaded in a per-layer
|
|
15
|
+
fashion and unpacked onto the container file system. Finally, the paths to the connector jars are written out to an
|
|
16
|
+
output file. Reference: https://rmannibucau.metawerx.net/post/docker-extracts-fileystem-with-bash
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import argparse
|
|
20
|
+
import gzip
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import random
|
|
24
|
+
import re
|
|
25
|
+
import shutil
|
|
26
|
+
import string
|
|
27
|
+
import subprocess
|
|
28
|
+
import sys
|
|
29
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
30
|
+
from urllib.parse import urlparse
|
|
31
|
+
from os import path
|
|
32
|
+
|
|
33
|
+
import boto3
|
|
34
|
+
import requests
|
|
35
|
+
from botocore.config import Config
|
|
36
|
+
from botocore.exceptions import ClientError, NoCredentialsError
|
|
37
|
+
from .connector_activation_util import boto_client_error
|
|
38
|
+
|
|
39
|
+
# Sub-directories (relative to a per-connection working directory) that hold the
# unpacked layer contents and the downloaded gzip archives respectively.
LAYER_TAR_DIR = "layers/tar"
LAYER_GZ_DIR = "layers/gz"
# Glue connection types handled by this script.
MARKETPLACE = "MARKETPLACE"
CUSTOM = "CUSTOM"
# Proxy environment variables that main() removes from the process environment.
HTTP_PROXY = "HTTP_PROXY"
HTTPS_PROXY = "HTTPS_PROXY"
NO_PROXY = "NO_PROXY"
# Host part of an ECR registry URL: <12-digit account>.dkr.ecr.<region>.amazonaws.com.
# Group 1 captures the account id. The region part accepts any AWS-style region
# name (e.g. us-east-1, ap-southeast-1, eu-central-1, us-gov-west-1); the previous
# pattern only accepted regions whose middle segment had exactly four letters.
ECR_HOST_PATTERN = r"^([0-9]{12})\.dkr\.ecr\.[a-z]{2}(?:-[a-z]+)+-[0-9]+\.amazonaws\.com$"

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def add_stream_handler() -> None:
    """
    Attach a stdout stream handler to the module-level logger.

    The handler emits records at INFO level and above, formatted with a
    timestamp, logger name, level and a "Glue ETL Marketplace" tag.
    """
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(
        logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - Glue ETL Marketplace - %(message)s")
    )
    handler.setLevel(logging.INFO)
    logger.addHandler(handler)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def run_commands(commands: List[str]) -> Tuple[bytes, bytes]:
    """
    Run a command (argument list, no shell) and return its raw output.

    Both streams are captured, logged at INFO level after decoding, and
    returned as ``(stdout, stderr)`` bytes. The exit status is not inspected.
    """
    with subprocess.Popen(commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as proc:
        out, err = proc.communicate()
    command_line = ' '.join(commands)
    logger.info(f"run_commands output - \"{command_line}\"\n"
                f"stdout: {out.decode()}\n"
                f"stderr: {err.decode()}")
    return out, err
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def send_get_request(url: str, header: Dict[str, str],
                     timeout: Union[float, Tuple[float, float]] = (10, 300)) -> requests.Response:
    """
    Send an HTTP GET request and return the response.

    :param url: address to request.
    :param header: HTTP headers to send; only the header names are logged.
    :param timeout: passed to ``requests.get``; either a single number or a
        ``(connect, read)`` tuple in seconds. Without a timeout a stalled
        connection would block the caller indefinitely.
    :return: the response object.
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status.
    :raises requests.Timeout: when the connect or read timeout elapses.
    """
    logger.debug(f"Sending GET request to {url} with {header.keys()} specified in header.")
    response = requests.get(url, headers=header, timeout=timeout)
    response.raise_for_status()
    return response
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def parse_url(url: str) -> Tuple[str, str]:
    """
    Split a URL into its network location and its path.

    Fragments are not split off, so a ``#`` stays part of the path. Leading
    and trailing slashes are removed from the returned path.
    """
    parts = urlparse(url, allow_fragments=False)
    location = parts.netloc
    trimmed_path = parts.path.strip("/")
    return location, trimmed_path
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def extract_ecr_region(ecr_root: str) -> Union[None, str]:
    """
    Find the AWS Region named in an ECR registry root address,
    e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com

    Returns the first region reported by boto3 for the "ecr" service whose
    name occurs in ``ecr_root``, or None when no region name occurs in it.
    """
    known_regions = boto3.session.Session().get_available_regions("ecr")
    return next((name for name in known_regions if name in ecr_root), None)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_registry_id(ecr_root: str) -> str:
    """
    Return the AWS account id at the start of an ECR registry root address,
    e.g. xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com

    :raises ValueError: when ``ecr_root`` does not match ECR_HOST_PATTERN.
    """
    matched = re.match(ECR_HOST_PATTERN, ecr_root)
    if not matched:
        raise ValueError(f"Invalid ECR url supplied, couldn't find aws account from {ecr_root}.")
    return matched.group(1)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@boto_client_error(logger)
def get_ecr_authorization_token(ecr_root: str) -> str:
    """
    Fetch the authorization token used for subsequent ECR HTTP API calls.

    The ECR client is created for the region found in ``ecr_root``; the
    original author notes that a token obtained for the wrong region makes
    ECR answer with HTTP 400.
    """
    aws_region = extract_ecr_region(ecr_root)
    account_id = extract_registry_id(ecr_root)
    ecr_client = boto3.client(service_name="ecr", region_name=aws_region)
    logger.info(f"Requesting ECR authorization token for registryIds={account_id} and region_name={aws_region}.")
    auth_entries = ecr_client.get_authorization_token(registryIds=[account_id])["authorizationData"]
    return auth_entries[0]["authorizationToken"]
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def parse_ecr_url(ecr_url: str) -> Tuple[str, str, str]:
    """
    Break an ECR image URL into (registry root, image name, tag).
    E.g. https://xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/salesforce:7.2.0-latest

    :raises ValueError: when the host does not match ECR_HOST_PATTERN, or
        when the path does not contain exactly one colon with text on both sides.
    """
    registry_host, image_ref = parse_url(ecr_url)
    if re.match(ECR_HOST_PATTERN, registry_host) is None:
        raise ValueError("malformed registry, correct pattern is https://aws_account_id.dkr.ecr.region.amazonaws.com")
    if re.match("^[^:]+:[^:]+$", image_ref) is None:
        raise ValueError("malformed image name, only one colon allowed to delimit image name and tag")
    # The check above guarantees exactly one colon, so partition splits name from tag.
    image_name, _, tag = image_ref.partition(":")
    return registry_host, image_name, tag
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def get_docker_manifest(ecr_url: str, header: Dict[str, str]) -> Dict[str, Any]:
    """
    Fetch the image manifest for ``ecr_url`` from the ECR HTTP API.

    The manifest URL is built from the registry root, image name and tag
    parsed out of ``ecr_url``; the JSON-decoded response body is returned.
    Callers read its "layers" entries to obtain each layer digest.
    """
    registry_host, image_name, tag = parse_ecr_url(ecr_url)
    logger.info(f"Calling ECR HTTP API to get manifest of {ecr_url}.")
    response = send_get_request(f"https://{registry_host}/v2/{image_name}/manifests/{tag}", header)
    return response.json()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def download_and_unpack_docker_layer(ecr_url: str, digest: str, dir_prefix: str, header: Dict[str, str]) -> None:
    """
    Download one image layer from ECR and unpack it under ``dir_prefix``.

    Steps: fetch the blob for ``digest`` and store it as
    ``<dir_prefix>/layers/gz/<id>.gz``, gunzip it to ``<dir_prefix>/layers/tar/<id>``,
    then extract that archive into ``<dir_prefix>/layers/tar/`` with the ``tar``
    command line tool. Per the original author, the docker CLI is not available
    in this runtime and the tarfile library hit permission issues, hence the
    manual download and the external ``tar`` call.
    """
    logger.info(f"Download/unpacking {digest} layer of image: {ecr_url}.")
    # A digest looks like "<algorithm>:<hex>"; the hex part names the local files.
    layer_id = digest.split(":")[1]
    logger.info(f"Preparing layer url and gz file path to store layer {layer_id}.")
    gz_path = f"{dir_prefix}/{LAYER_GZ_DIR}/{layer_id}.gz"
    registry_host, image_name, _ = parse_ecr_url(ecr_url)
    blob_url = f"https://{registry_host}/v2/{image_name}/blobs/{digest}"

    logger.info(f"Getting the layer file {layer_id} and store it as gz.")
    blob = send_get_request(blob_url, header)
    with open(gz_path, "wb") as gz_file:
        gz_file.write(blob.content)

    logger.info(f"Unzipping the {layer_id} layer and store as tar file.")
    tar_dir = f"{dir_prefix}/{LAYER_TAR_DIR}"
    tar_path = f"{tar_dir}/{layer_id}"
    with gzip.open(gz_path, "rb") as compressed, open(tar_path, "wb") as archive:
        shutil.copyfileobj(compressed, archive)

    logger.info(f"Unarchiving {layer_id} layer as tar file.")
    run_commands(["tar", "-C", f"{tar_dir}/", "-xf", tar_path])
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def parse_args(args: List[str]) -> List[str]:
    """
    Parse the command line of this script.

    Returns ``[connections, result_path, region, endpoint, proxy]`` where
    ``connections`` is the comma-separated --connections value split into a
    list and ``proxy`` is None when --proxy is not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--connections",
                        required=True,
                        type=lambda raw: raw.split(","),
                        help="a list of connection names we'll use to download jars for")
    # The remaining mandatory options are plain strings and differ only in help text.
    mandatory_options = (
        ("--result_path", "file path to store the jar downloading result"),
        ("--region", "aws region of the connections supplied"),
        ("--endpoint", "endpoint to use to talk with Glue service"),
    )
    for flag, help_text in mandatory_options:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--proxy",
                        default=None,
                        help="proxy to talk to Glue backend in case of VPC job")
    namespace = parser.parse_args(args)
    field_order = ("connections", "result_path", "region", "endpoint", "proxy")
    return [getattr(namespace, field) for field in field_order]
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def id_generator(size: int = 5, chars: str = string.ascii_uppercase + string.digits) -> str:
    """
    Build a random identifier of ``size`` characters drawn from ``chars``
    (upper-case ASCII letters and digits by default).
    """
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def get_connection(region: str, endpoint: str, conn: str, proxy: Optional[str] = None) -> Union[Dict, None]:
    """
    Look up the catalog connection ``conn`` through the boto3 Glue client.

    The client is built for the given region and endpoint; when ``proxy`` is
    set it is used for https traffic. Returns the GetConnection response, or
    None when the call fails with ClientError or NoCredentialsError (the
    failure is logged with its traceback).
    """
    client_config = Config()
    if proxy:
        client_config.proxies = {'https': proxy}  # type: ignore
    glue_client = boto3.Session().client(
        service_name="glue",
        region_name=region,
        endpoint_url=endpoint,
        config=client_config,
    )
    logger.info(f"using region: {region}, proxy: {proxy} and glue endpoint: {endpoint} to get connection: {conn}")
    response = None
    try:
        response = glue_client.get_connection(Name=conn)
    except ClientError:
        logger.exception(f"Failed to get connection detail for {conn}, skip jar downloading for it")
    except NoCredentialsError:
        logger.exception(f"Unable to get credential to call GetConnection for {conn}, skip jar downloading for it."
                         f" Check if the IAM role has the right permission or if you need to increase IMDS retry.")
    return response
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def collect_files_by_suffix(input_dir: str, suffix: str) -> List[str]:
    """
    Walk ``input_dir`` recursively and return the absolute paths of all files
    whose name ends with ``suffix``.
    """
    return [
        os.path.abspath(os.path.join(folder, name))
        for folder, _, names in os.walk(input_dir)
        for name in names
        if name.endswith(suffix)
    ]
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@boto_client_error(logger, "Failed to download jars for custom connection from S3...")
def download_custom_jars(conn: Dict[str, Any], dest_folder: str = "/tmp/custom_connection_jars") -> List[str]:
    """
    Download the connector jars of a custom connection from S3.

    :param conn: GetConnection response; its
        ``Connection.ConnectionProperties.CONNECTOR_URL`` holds a comma-separated
        list of S3 URLs.
    :param dest_folder: local directory the jars are stored in (created if missing).
    :return: local paths of the downloaded jars, each named ``etl-<original file name>``.

    Entries that are not ``s3://...jar`` URLs are reported with an error log and skipped.
    """
    os.makedirs(dest_folder, exist_ok=True)
    connection = conn["Connection"]
    s3_urls: List[str] = [raw.strip() for raw in connection["ConnectionProperties"]["CONNECTOR_URL"].split(",")]
    s3 = boto3.client("s3")
    res = []

    for url in s3_urls:
        if url.startswith("s3://") and url.endswith(".jar"):
            bucket, key = parse_url(url)
            file_path = f"{dest_folder}/etl-{key.split('/')[-1]}"
            s3.download_file(bucket, key, file_path)
            res.append(file_path)
        else:
            logger.error("custom connection can only have S3 urls end with '.jar' as connector url.")
    # Log only the connection name: the full response carries ConnectionProperties,
    # which may contain credentials and must not end up in log streams.
    logger.info(f"collected jar paths: {res} for connection: {connection.get('Name')}.")
    return res
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def download_jars_per_connection(conn: str, region: str, endpoint: str, proxy: Optional[str] = None) -> List[str]:
    """
    Download the connector jars for one Glue connection and return their paths.

    :param conn: name of the Glue connection.
    :param region: AWS region used for the Glue client.
    :param endpoint: Glue endpoint URL.
    :param proxy: optional https proxy for the Glue client.
    :return: absolute paths of the jars found; an empty list when the connection
        cannot be fetched or is neither a CUSTOM nor a MARKETPLACE connection.

    CUSTOM connections are delegated to download_custom_jars. For MARKETPLACE
    connections the image behind CONNECTOR_URL is downloaded layer by layer into
    a randomly named working directory, jars are collected from its ``jars``
    folder, and an ``oem/oem.txt`` file, if present, is appended to
    /tmp/glue-marketplace.conf.
    """
    # validate connection type
    connection = get_connection(region, endpoint, conn, proxy)
    if connection is None:
        return []

    connection_type = connection["Connection"]["ConnectionType"]
    # download jars from S3 in case of custom connection
    if connection_type == CUSTOM:
        logger.info(f"Connection {conn} is a Custom connection, try to download jars for it from S3.")
        return download_custom_jars(connection)
    # return empty list in case of non-marketplace connection
    if connection_type != MARKETPLACE:
        logger.warning(f"Connection {conn} is not a Marketplace connection, skip jar downloading for it")
        return []

    properties = connection["Connection"]["ConnectionProperties"]

    # get the connection classname; None when the property is absent, so the
    # OEM step below can skip cleanly instead of hitting an unbound variable
    driver_name = properties.get("CONNECTOR_CLASS_NAME")

    # get the connection ecr url
    ecr_url = properties["CONNECTOR_URL"]
    ecr_root, _, _ = parse_ecr_url(ecr_url)

    # download the jars
    token = get_ecr_authorization_token(ecr_root)
    http_header = {"Authorization": f"Basic {token}"}

    manifest = get_docker_manifest(ecr_url, http_header)

    # make directory for the jars of the given connection
    dir_prefix = id_generator()
    os.makedirs(f"{dir_prefix}/{LAYER_TAR_DIR}", exist_ok=True)
    os.makedirs(f"{dir_prefix}/{LAYER_GZ_DIR}", exist_ok=True)

    for layer in manifest["layers"]:
        download_and_unpack_docker_layer(ecr_url, layer["digest"], dir_prefix, http_header)

    # return the jar paths
    res = collect_files_by_suffix(f"{dir_prefix}/{LAYER_TAR_DIR}/jars", ".jar")
    logger.info(f"Container paths are: {res}")

    # Write OEM key to /tmp/glue-marketplace.conf
    oem_key_path = f"{dir_prefix}/{LAYER_TAR_DIR}/oem/oem.txt"
    if path.exists(oem_key_path):
        if driver_name is None:
            logger.warning(f"Connection {conn} has no CONNECTOR_CLASS_NAME property, skip writing OEM information.")
        else:
            with open(oem_key_path, 'r') as oem_file:
                # readline keeps the trailing newline, which separates the two
                # assignments in the emitted configuration block
                oem_key = oem_file.readline()
                oem_value = oem_file.readline()
            output = (
                "marketplace_oem = {\n"
                "%s = {\n"
                "oem_key = %s oem_value = %s\n"
                "}\n"
                "}\n"
            ) % (driver_name, oem_key, oem_value)
            with open("/tmp/glue-marketplace.conf", 'a') as opened_file:
                opened_file.write(output)
            logger.info("OEM information is written.")

    if not res:
        logger.warning(f"found no connector jars from {ecr_url} provided by {conn}, please contact AWS support of"
                       f" the Connector product owner to debug the issue.")
    else:
        logger.info(f"collected jar paths: {res} for connection: {conn}")
    return res
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def main():
    """
    Script entry point: download jars for every connection given on the command
    line and write the comma-joined jar paths to the result file.
    """
    # In case of VPC the proxy is set directly on the Glue client config, so the
    # proxy environment variables are dropped here to keep clients for other AWS
    # services from going through Glue's proxy. The removal only affects this
    # process and not subsequent aws cli usage.
    for proxy_variable in (HTTP_PROXY, HTTPS_PROXY, NO_PROXY):
        os.environ.pop(proxy_variable, None)

    connections, result_path, region, endpoint, proxy = parse_args(sys.argv[1:])
    add_stream_handler()

    jar_paths = []
    for conn in connections:
        logger.info(f"Start downloading connector jars for connection: {conn}")
        jar_paths.extend(download_jars_per_connection(conn, region, endpoint, proxy))

    # concatenate the jar paths as a string and write it out to result_path
    with open(result_path, "w") as result_file:
        result_file.write(",".join(jar_paths))

    logger.info(f"successfully wrote jar paths to \"{result_path}\"")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from logging import Logger
|
|
14
|
+
|
|
15
|
+
from botocore.exceptions import ClientError, NoCredentialsError
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def boto_client_error(logger: Logger, message: str = ""):
    """
    Build a decorator that logs boto3 errors raised by the wrapped function.

    :param logger: logger the error details are written to.
    :param message: extra text appended to the log entry for ClientError.
    :return: a decorator; the wrapped function keeps its name and docstring.

    ClientError and NoCredentialsError are logged and then re-raised unchanged;
    any other exception passes through untouched.
    """
    import functools  # local import keeps this block self-contained

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except ClientError as error:
                if error.response['Error']['Code'] == 'InternalError':  # Generic error
                    # We grab the message, request ID, and HTTP code to give to customer support
                    logger.error('Error Message: {}'.format(error.response['Error']['Message']))
                    logger.error('Request ID: {}'.format(error.response['ResponseMetadata']['RequestId']))
                    logger.error('Http code: {}'.format(error.response['ResponseMetadata']['HTTPStatusCode']))
                    if message:
                        logger.error(message)
                else:
                    logger.error(f"boto3 clientError raised in function {func.__name__}: {error!r}. {message}")
                raise
            except NoCredentialsError as error:
                logger.error(f"boto3 NoCredentialsError raised in function {func.__name__}: {error!r}. "
                             f"Check if the IAM role has the right permission or if you need to increase IMDS retry.")
                raise

        return wrapper
    return decorator
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import argparse
|
|
17
|
+
from awsglue.context import GlueContext
|
|
18
|
+
from pyspark.context import SparkContext
|
|
19
|
+
from awsglue.dynamicframe import DynamicFrame
|
|
20
|
+
from awsglue.transforms import get_transform
|
|
21
|
+
from pyspark.sql.types import *
|
|
22
|
+
from .scripts_utils import *
|
|
23
|
+
|
|
24
|
+
def crawler_redo_from_backup(glue_context, **options):
    """
    Restore catalog entities from the backup stored at
    ``options['s3.backup_location']``.

    The backup is read with read_from_s3 and each entity type is written back
    to the catalog with write_df_to_catalog.
    """
    # The value is not used below; the attribute access is kept as in the original.
    spark_ctxt = glue_context._instantiatedContext

    # Read from s3
    backup = read_from_s3(glue_context, options['s3.backup_location'])

    # Write to Catalog
    entity_types = ('table', 'tableToDelete', 'partition', 'partitionToDelete')
    for entity_type in entity_types:
        write_df_to_catalog(backup[entity_type], entity_type, glue_context, options)
|
|
34
|
+
|
|
35
|
+
def crawler_redo_from_backup_options(args):
    """
    Parse the command line into the options dict consumed by
    crawler_redo_from_backup.

    Unknown arguments are ignored. When no database name is given, the
    crawler is looked up through the Glue API and its DatabaseName is used.
    """
    # arguments
    parser = argparse.ArgumentParser(description='This script allows you to restore a namespace to a specific backup.')
    parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to restore.')
    parser.add_argument('-b', '--backup-location', required=True, help='Location of the backup to use.')
    parser.add_argument('-d', '--database-name', required=False, help='Database to back up. If not specified, '
                                                                      'the database target of the crawler is used instead.')
    parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional service endpoint region.')

    parsed, _ = parser.parse_known_args(args)

    database_name = parsed.database_name
    if database_name is None:
        import boto3
        endpoint_url = "https://%s.%s.amazonaws.com" % (DEFAULT_GLUE_ENDPOINT, parsed.region)
        glue = boto3.client('glue', endpoint_url=endpoint_url)
        database_name = glue.get_crawler(Name=parsed.crawler_name)['Crawler']['DatabaseName']

    result = {
        "catalog.name": DEFAULT_CATALOG_ENDPOINT,
        "catalog.region": parsed.region,
        "catalog.database": database_name,
        "crawler.name": parsed.crawler_name,
        "s3.backup_location": parsed.backup_location,
    }
    return result
|
|
63
|
+
|
|
64
|
+
def main():
    """Create the Spark and Glue contexts and run the restore with the CLI options."""
    # spark env
    spark_context = SparkContext()
    context = GlueContext(spark_context)

    restore_options = crawler_redo_from_backup_options(sys.argv[1:])
    crawler_redo_from_backup(context, **restore_options)


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
|
|
2
|
+
# Licensed under the Amazon Software License (the "License"). You may not use
|
|
3
|
+
# this file except in compliance with the License. A copy of the License is
|
|
4
|
+
# located at
|
|
5
|
+
#
|
|
6
|
+
# http://aws.amazon.com/asl/
|
|
7
|
+
#
|
|
8
|
+
# or in the "license" file accompanying this file. This file is distributed
|
|
9
|
+
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
|
|
10
|
+
# or implied. See the License for the specific language governing
|
|
11
|
+
# permissions and limitations under the License.
|
|
12
|
+
|
|
13
|
+
from __future__ import print_function
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import argparse
|
|
17
|
+
from awsglue.context import GlueContext
|
|
18
|
+
from pyspark.context import SparkContext
|
|
19
|
+
|
|
20
|
+
from awsglue.dynamicframe import DynamicFrame
|
|
21
|
+
from awsglue.transforms import get_transform
|
|
22
|
+
from pyspark.sql.types import *
|
|
23
|
+
from .scripts_utils import *
|
|
24
|
+
from pyspark.sql.functions import col
|
|
25
|
+
|
|
26
|
+
def crawler_backup(glue_context, data, options):
    """
    Narrow ``data`` to the entities of one crawler and optionally back them up.

    :param glue_context: Glue context handed on to write_backup.
    :param data: dict of DataFrames; the 'table' and 'partition' entries are
        replaced in place by their filtered versions.
    :param options: must provide 'crawler.name', 's3.backup_location' and
        'catalog.database'.

    Nothing is written when 's3.backup_location' is None.
    """
    crawler_name = options['crawler.name']
    backup_location = options['s3.backup_location']
    database_name = options['catalog.database']

    # Only get data for this crawler. A Column comparison is used instead of a
    # formatted SQL string so that crawler names containing quotes cannot break
    # (or alter) the filter expression.
    data['table'] = data['table'].filter(col("parameters.UPDATED_BY_CRAWLER") == crawler_name)
    data['partition'] = data['partition'].join(data['table'].withColumn('tableName', col('name')), 'tableName', 'leftsemi')

    if backup_location is not None:
        # Backup the contents of the catalog at an s3 location
        write_backup(data, database_name, backup_location, glue_context)
|
|
38
|
+
|
|
39
|
+
def crawler_undo(glue_context, **options):
    """
    Roll the catalog back to the state before a crawler run.

    :param glue_context: Glue context used for catalog reads and writes.
    :param options: must provide 'crawler.name', 'catalog.database',
        'timestamp' (an int, compared against updateTime/creationTime) and
        's3.backup_location'.

    The current catalog state for the crawler is backed up first (see
    crawler_backup). Then, from the table versions updated by this crawler,
    the newest version at or before the timestamp is written back, tables
    without such a version are deleted, and the selected partitions are deleted.
    """
    spark_ctxt = glue_context._instantiatedContext
    crawler_name = options['crawler.name']
    database_name = options['catalog.database']
    timestamp = options['timestamp']
    options["catalog.tableVersions"] = True

    data = read_from_catalog(glue_context, options)

    crawler_backup(glue_context, data, options)

    # Find all the table versions for this crawler. The comparison is a Column
    # expression rather than a formatted SQL string so that crawler names
    # containing quotes cannot break the filter.
    crawler_tables = data['tableVersion'].select(col("table.updateTime").alias("updateTime"), col("table"), col('table.parameters.UPDATED_BY_CRAWLER')).filter(col("UPDATED_BY_CRAWLER") == crawler_name)

    # Find the latest previous version of tables for this crawler that were updated or deleted since the last timestamp.
    filtered = crawler_tables.filter("updateTime <= %d" % timestamp).withColumn("filtered_name", col("table.name"))
    # The rename below depends on the exact column name Spark generates for the aggregate.
    update_times = filtered.groupBy("table.name").max("table.updateTime").withColumnRenamed("max(table.updateTime AS `updateTime`)","time")
    joined = filtered.join(update_times, (col("filtered_name") == col("name")) & (col("updateTime") == col("time")), 'inner')
    tables_to_write = joined.select(col("table.*"))

    # Find the tables that were created since the last timestamp
    names = crawler_tables.select(col("table.name")).distinct()
    present_before_timestamp = joined.select(col("table.name"))
    tables_to_delete = names.subtract(present_before_timestamp)

    # Find the partitions that were created since the last timestamp
    # NOTE(review): the filter keeps partitions with creationTime *before* the
    # timestamp, which looks inverted relative to the comment above — confirm
    # the intended direction and the unit of creationTime before changing it.
    partitions_to_delete = data['partition'].withColumn('name', col('tableName')).join(crawler_tables.withColumn('name', col('table.name')), 'name', 'leftsemi').filter("creationTime < %d" % timestamp)

    # Write to Catalog
    write_df_to_catalog(tables_to_write, "table", glue_context, options)
    write_df_to_catalog(tables_to_delete, "tableToDelete", glue_context, options)
    write_df_to_catalog(partitions_to_delete, "partitionToDelete", glue_context, options)
|
|
71
|
+
|
|
72
|
+
def crawler_undo_options(args):
    """
    Parse the command line into the options dict consumed by crawler_undo.

    :param args: argument list (without the program name); unknown arguments
        are ignored.
    :return: dict with catalog name/region/database, crawler name, backup
        location and an integer 'timestamp' in milliseconds since epoch.
    :raises KeyError: when the timestamp must be taken from the crawler but the
        crawler response carries no last-crawl information.

    The Glue API is only called when the database name or the timestamp is
    missing from the command line.
    """
    # arguments
    parser = argparse.ArgumentParser(description='This script allows you to rollback the effects of a crawler.')
    parser.add_argument('-c', '--crawler-name', required=True, help='Name of the crawler to rollback.')
    parser.add_argument('-b', '--backup-location', required=False, help='Location of the backup to use. If not specified, no backup is used.')
    parser.add_argument('-d', '--database-name', required=False, help='Database to roll back. If not specified, '
                                                                      'the database target of the crawler is used instead.')
    parser.add_argument('-t', '--timestamp', required=False, help='Timestamp to rollback to, in milliseconds since epoch. If not specified, '
                                                                  'the start timestamp of the crawler is used instead.')
    parser.add_argument('-r', '--region', required=False, default=DEFAULT_REGION, help='Optional DataCatalog service endpoint region.')

    options, unknown = parser.parse_known_args(args)

    database_name = options.database_name
    timestamp = options.timestamp

    if database_name is None or timestamp is None:
        import boto3  # Import is done here to ensure script does not fail in case boto3 is not required.
        glue_endpoint = DEFAULT_GLUE_ENDPOINT
        glue = boto3.client('glue', endpoint_url="https://%s.%s.amazonaws.com" % (glue_endpoint, options.region))
        crawler = glue.get_crawler(Name=options.crawler_name)['Crawler']

        if database_name is None:
            database_name = crawler['DatabaseName']

        if timestamp is None:
            # boto3 exposes the last crawl under 'LastCrawl'; the key used by the
            # original code ('LastCrawlInfo') is kept as a fallback.
            last_crawl = crawler.get('LastCrawl') or crawler['LastCrawlInfo']
            timestamp = last_crawl['StartTime']
            # boto3 returns StartTime as a datetime, which int() cannot convert;
            # turn it into milliseconds since epoch, the unit --timestamp uses.
            if hasattr(timestamp, 'timestamp'):
                timestamp = timestamp.timestamp() * 1000

    return {
        "catalog.name": DEFAULT_CATALOG_ENDPOINT,
        "catalog.region": options.region,
        "catalog.database": database_name,
        "crawler.name" : options.crawler_name,
        "s3.backup_location" : options.backup_location,
        "timestamp": int(timestamp)
    }
|
|
109
|
+
|
|
110
|
+
def main():
    """Create the Spark and Glue contexts and run the rollback with the CLI options."""
    # spark env
    spark_context = SparkContext()
    context = GlueContext(spark_context)

    undo_options = crawler_undo_options(sys.argv[1:])
    crawler_undo(context, **undo_options)


if __name__ == '__main__':
    main()
|