PyPI - AWSGlueDataplanePython - Versions diffs - 5.0.0__py3-none-any.whl - Mend

AWSGlueDataplanePython 5.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

awsglue/README.md +37 -0
awsglue/__init__.py +15 -0
awsglue/context.py +690 -0
awsglue/data_sink.py +49 -0
awsglue/data_source.py +49 -0
awsglue/dataframe_transforms/__init__.py +17 -0
awsglue/dataframe_transforms/apply_mapping.py +76 -0
awsglue/dataframereader.py +41 -0
awsglue/dataframewriter.py +21 -0
awsglue/devutils.py +236 -0
awsglue/dynamicframe.py +669 -0
awsglue/functions.py +31 -0
awsglue/glue_shell.py +38 -0
awsglue/gluetypes.py +461 -0
awsglue/job.py +59 -0
awsglue/scripts/__init__.py +12 -0
awsglue/scripts/activate_etl_connector.py +362 -0
awsglue/scripts/connector_activation_util.py +38 -0
awsglue/scripts/crawler_redo_from_backup.py +75 -0
awsglue/scripts/crawler_undo.py +121 -0
awsglue/scripts/scripts_utils.py +106 -0
awsglue/streaming_data_source.py +28 -0
awsglue/transforms/__init__.py +47 -0
awsglue/transforms/apply_mapping.py +72 -0
awsglue/transforms/coalesce.py +66 -0
awsglue/transforms/collection_transforms.py +155 -0
awsglue/transforms/drop_nulls.py +85 -0
awsglue/transforms/dynamicframe_filter.py +66 -0
awsglue/transforms/dynamicframe_map.py +72 -0
awsglue/transforms/errors_as_dynamicframe.py +45 -0
awsglue/transforms/field_transforms.py +469 -0
awsglue/transforms/relationalize.py +105 -0
awsglue/transforms/repartition.py +61 -0
awsglue/transforms/resolve_choice.py +85 -0
awsglue/transforms/transform.py +92 -0
awsglue/transforms/unbox.py +112 -0
awsglue/transforms/union.py +66 -0
awsglue/transforms/unnest_frame.py +75 -0
awsglue/utils.py +159 -0
awsgluedataplanepython-5.0.0.dist-info/METADATA +178 -0
awsgluedataplanepython-5.0.0.dist-info/RECORD +45 -0
awsgluedataplanepython-5.0.0.dist-info/WHEEL +5 -0
awsgluedataplanepython-5.0.0.dist-info/licenses/LICENSE.txt +96 -0
awsgluedataplanepython-5.0.0.dist-info/licenses/NOTICE.txt +3 -0
awsgluedataplanepython-5.0.0.dist-info/top_level.txt +1 -0

awsglue/data_sink.py ADDED Viewed

@@ -0,0 +1,49 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from awsglue.dynamicframe import DynamicFrame, DynamicFrameCollection
+from awsglue.utils import makeOptions, callsite
+from pyspark.sql import DataFrame
class DataSink(object):
    """Python-side wrapper around a JVM sink object.

    Every method delegates to the wrapped ``j_sink``; values returned by the
    JVM are re-wrapped in the matching Python frame type.
    """

    def __init__(self, j_sink, sql_ctx):
        """Keep the JVM sink handle and the SQL context.

        :param j_sink: JVM sink object that all calls are forwarded to.
        :param sql_ctx: context object; its ``_sc`` attribute is passed to
            ``makeOptions`` and it is used to wrap returned DataFrames.
        """
        self._jsink = j_sink
        self._sql_ctx = sql_ctx

    def setFormat(self, format, **options):
        """Set the sink format; keyword arguments are converted with
        ``makeOptions`` and passed along as format options."""
        self._jsink.setFormat(format, makeOptions(self._sql_ctx._sc, options))

    def setAccumulableSize(self, size):
        """Forward ``size`` to the JVM sink's ``setAccumulableSize``."""
        self._jsink.setAccumulableSize(size)

    def setCatalogInfo(self, catalogDatabase, catalogTableName, catalogId = ""):
        """Forward the catalog database, table name and catalog id to the
        JVM sink."""
        self._jsink.setCatalogInfo(catalogDatabase, catalogTableName, catalogId)

    def writeFrame(self, dynamic_frame, info = ""):
        """Write one DynamicFrame through the JVM sink.

        :returns: a DynamicFrame wrapping whatever ``pyWriteDynamicFrame``
            returned, named ``<input name>_errors``.
        """
        j_result = self._jsink.pyWriteDynamicFrame(dynamic_frame._jdf, callsite(), info)
        return DynamicFrame(j_result, dynamic_frame.glue_ctx, dynamic_frame.name + "_errors")

    def writeDataFrame(self, data_frame, glue_context, info = ""):
        """Write a Spark DataFrame through the JVM sink and wrap the returned
        JVM DataFrame using this sink's SQL context."""
        j_result = self._jsink.pyWriteDataFrame(
            data_frame._jdf, glue_context._glue_scala_context, callsite(), info)
        return DataFrame(j_result, self._sql_ctx)

    def write(self, dynamic_frame_or_dfc, info = ""):
        """Write a DynamicFrame, or every frame of a DynamicFrameCollection.

        :param dynamic_frame_or_dfc: the frame or collection to write.
        :param info: string forwarded to every underlying ``writeFrame`` call.
        :returns: the result of ``writeFrame`` for a single frame, or a
            DynamicFrameCollection built from the per-frame results.
        :raises TypeError: if the argument is neither supported type.
        """
        if isinstance(dynamic_frame_or_dfc, DynamicFrame):
            return self.writeFrame(dynamic_frame_or_dfc, info)
        if isinstance(dynamic_frame_or_dfc, DynamicFrameCollection):
            # ``info`` used to be dropped on this branch; pass it through so
            # both branches treat the argument the same way.
            res_frames = [self.writeFrame(frame, info)
                          for frame in dynamic_frame_or_dfc.values()]
            return DynamicFrameCollection(res_frames, self._sql_ctx)
        # Trailing space after "of" keeps the two literals from fusing into
        # "instance ofDynamicFrame".
        raise TypeError("dynamic_frame_or_dfc must be an instance of "
                        "DynamicFrame or DynamicFrameCollection. Got "
                        + str(type(dynamic_frame_or_dfc)))

awsglue/data_source.py ADDED Viewed

@@ -0,0 +1,49 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from awsglue.dynamicframe import DynamicFrame
+from awsglue.utils import makeOptions, callsite
+from pyspark.sql import DataFrame
class DataSource(object):
    """Python-side handle for a JVM data source.

    Frames obtained from the JVM source are wrapped as ``DynamicFrame`` or
    ``DataFrame`` using the stored SQL context.
    """

    def __init__(self, j_source, sql_ctx, name):
        """Remember the JVM source, the SQL context and the frame name."""
        self._jsource = j_source
        self._sql_ctx = sql_ctx
        self.name = name

    def setFormat(self, format, **options):
        """Set the source format; the caller's call site is recorded under
        the ``callSite`` option before the options are converted."""
        options.update(callSite=callsite())
        self._jsource.setFormat(format, makeOptions(self._sql_ctx._sc, options))

    def getFrame(self, **options):
        """Return the source as a DynamicFrame.

        ``minPartitions`` and ``targetPartitions`` may be given as keyword
        arguments; when only one is present it is used for both. If the
        resolved minimum is ``None`` the JVM source is asked for its frame
        without partition arguments.
        """
        # An explicit minPartitions wins; otherwise fall back to
        # targetPartitions (None when neither key is present).
        lower = options.get('minPartitions', options.get('targetPartitions'))
        upper = options.get('targetPartitions', lower)
        if lower is not None:
            jframe = self._jsource.getDynamicFrame(lower, upper)
        else:
            jframe = self._jsource.getDynamicFrame()
        return DynamicFrame(jframe, self._sql_ctx, self.name)

    def getSampleFrame(self, num, **options):
        """Return a DynamicFrame built from the JVM source's
        ``getSampleDynamicFrame(num, options)``."""
        java_options = makeOptions(self._sql_ctx._sc, options)
        sample = self._jsource.getSampleDynamicFrame(num, java_options)
        return DynamicFrame(sample, self._sql_ctx, self.name)

    def getDataFrame(self):
        """Return the source as a Spark DataFrame."""
        return DataFrame(self._jsource.getDataFrame(), self._sql_ctx)

awsglue/dataframe_transforms/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from .apply_mapping import ApplyMapping
# Every transform class this package exposes.
ALL_TRANSFORMS = {ApplyMapping}
# The public names are exactly the class names of the registered transforms.
__all__ = [transform_cls.__name__ for transform_cls in ALL_TRANSFORMS]

awsglue/dataframe_transforms/apply_mapping.py ADDED Viewed

@@ -0,0 +1,76 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from py4j.java_gateway import java_import # type: ignore
+from pyspark.sql.dataframe import DataFrame
class ApplyMapping():
    """DataFrame transform that applies a declarative mapping via the JVM
    ``com.amazonaws.services.glue.dataframeTransforms.ApplyMapping`` class."""

    @staticmethod
    def apply(frame, mappings):
        """Apply ``mappings`` to ``frame`` and return the resulting DataFrame.

        :param frame: DataFrame to transform; its ``sql_ctx._jvm`` and
            ``_jdf`` attributes are used to reach the JVM.
        :param mappings: a single mapping tuple or an iterable of mapping
            tuples, each of length 2, 3 or 4.
        :raises TypeError: if a mapping is not a tuple.
        :raises ValueError: if a mapping tuple has any other length.
        """
        jvm = frame.sql_ctx._jvm

        def _to_java_mapping(mapping_tup):
            """Convert one Python mapping tuple to the scala TupleN of the
            same arity."""
            if not isinstance(mapping_tup, tuple):
                # str() is required: concatenating the raw object raised an
                # unrelated "can only concatenate str" error instead.
                raise TypeError("Mapping must be specified as a tuple. Got " +
                                str(mapping_tup))
            arity = len(mapping_tup)
            if arity not in (2, 3, 4):
                raise ValueError("Mapping tuple must be of length 2, 3, or 4. "
                                 "Got tuple of length " + str(arity))
            scala_tuple = getattr(jvm.scala, "Tuple%d" % arity)
            return scala_tuple.apply(*mapping_tup)

        # A lone tuple is treated as a one-element list of mappings.
        if isinstance(mappings, tuple):
            mappings = [mappings]

        mappings_seq = jvm.PythonUtils.toSeq([_to_java_mapping(m) for m in mappings])

        java_import(jvm, "com.amazonaws.services.glue.dataframeTransforms.ApplyMapping")

        return DataFrame(jvm.ApplyMapping.apply(frame._jdf, mappings_seq), frame.sql_ctx)

    @classmethod
    def describeArgs(cls):
        """Return a list of dicts describing the arguments of ``apply``."""
        arg1 = {"name": "frame",
                "type": "DataFrame",
                "description": "DataFrame to transform",
                "optional": False,
                "defaultValue": None}
        # NOTE(review): "type" says DataFrame although the description talks
        # about a list of tuples; left unchanged pending confirmation.
        arg2 = {"name": "mappings",
                "type": "DataFrame",
                "description": "List of mapping tuples (source col, source type, target col, target type)",
                "optional": False,
                "defaultValue": None}
        return [arg1, arg2]

    @classmethod
    def describeTransform(cls):
        """Return a one-sentence description of the transform."""
        return "Apply a declarative mapping to this DataFrame."

    @classmethod
    def describeErrors(cls):
        """Return the list of documented errors (empty)."""
        return []

    @classmethod
    def describeReturn(cls):
        """Return a dict describing the value returned by ``apply``."""
        return {"type": "DataFrame",
                "description": "DataFrame after applying mappings."}

awsglue/dataframereader.py ADDED Viewed

@@ -0,0 +1,41 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
class DataFrameReader(object):
    """Reader facade that validates arguments and delegates to the
    ``create_data_frame_*`` methods of the wrapped glue context."""

    def __init__(self, glue_context):
        """Keep the glue context that performs the actual reads."""
        self._glue_context = glue_context

    def from_catalog(self, database = None, table_name = None, redshift_tmp_dir = "", transformation_ctx = "", push_down_predicate = "", additional_options = None, catalog_id = None, **kwargs):
        """Creates a DataFrame with the specified catalog name space and table name.

        The database may be given either as ``database`` or as the keyword
        ``name_space``, but not both. Remaining keyword arguments are
        forwarded to the glue context.

        :raises Exception: if both or neither of ``database``/``name_space``
            are supplied, or if ``table_name`` is missing.
        """
        # A fresh dict per call: a ``{}`` default would be one shared object
        # that any callee mutation would leak into later calls.
        if additional_options is None:
            additional_options = {}

        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            # pop so name_space is not forwarded a second time via **kwargs
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        return self._glue_context.create_data_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs)

    def from_options(self, connection_type, connection_options=None,
                     format=None, format_options=None, transformation_ctx="", push_down_predicate = "", **kwargs):
        """Creates a DataFrame with the specified connection and format.

        All arguments are forwarded positionally, in order, to the glue
        context's ``create_data_frame_from_options``; extra keyword arguments
        are passed through unchanged.
        """
        # Fresh dicts per call instead of shared mutable defaults.
        if connection_options is None:
            connection_options = {}
        if format_options is None:
            format_options = {}
        return self._glue_context.create_data_frame_from_options(connection_type,
                                                                    connection_options,
                                                                    format,
                                                                    format_options, transformation_ctx, push_down_predicate, **kwargs)

awsglue/dataframewriter.py ADDED Viewed

@@ -0,0 +1,21 @@
class DataFrameWriter(object):
    """Writer facade that validates arguments and delegates to the glue
    context's ``write_data_frame_from_catalog``."""

    def __init__(self, glue_context):
        """Keep the glue context that performs the actual writes."""
        self._glue_context = glue_context

    def from_catalog(self, frame, database=None, table_name=None, redshift_tmp_dir="", transformation_ctx="",
                     additional_options=None, catalog_id=None, **kwargs):
        """Writes a DataFrame with the specified catalog name space and table name.

        The database may be given either as ``database`` or as the keyword
        ``name_space``, but not both.

        :raises Exception: if both or neither of ``database``/``name_space``
            are supplied, or if ``table_name`` is missing.
        """
        # A fresh dict per call: a ``{}`` default would be one shared object
        # that any callee mutation would leak into later calls.
        if additional_options is None:
            additional_options = {}

        if database is not None and "name_space" in kwargs:
            raise Exception("Parameter name_space and database are both specified, choose one.")
        elif database is None and "name_space" not in kwargs:
            raise Exception("Parameter name_space or database is missing.")
        elif "name_space" in kwargs:
            db = kwargs.pop("name_space")
        else:
            db = database

        if table_name is None:
            raise Exception("Parameter table_name is missing.")

        # NOTE(review): any remaining **kwargs are accepted but not forwarded
        # to the glue context - confirm whether that is intentional.
        return self._glue_context.write_data_frame_from_catalog(frame, db, table_name, redshift_tmp_dir,
                                                                   transformation_ctx, additional_options, catalog_id)

awsglue/devutils.py ADDED Viewed

@@ -0,0 +1,236 @@
+# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# Licensed under the Amazon Software License (the "License"). You may not use
+# this file except in compliance with the License. A copy of the License is
+# located at
+#
+#  http://aws.amazon.com/asl/
+#
+# or in the "license" file accompanying this file. This file is distributed
+# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express
+# or implied. See the License for the specific language governing
+# permissions and limitations under the License.
+from __future__ import print_function
+import boto3
+import os
+import logging
+import copy
+from datetime import datetime
class ExecutionProperty:
    """Holder for a job's ``maxConcurrentRuns`` setting."""

    def __init__(self, maxConcurrentRuns=1):
        self.maxConcurrentRuns = maxConcurrentRuns

    def __repr__(self):
        # Dict-literal look-alike, assembled piecewise.
        pieces = ["{'maxConcurrentRuns': ", str(self.maxConcurrentRuns), "}"]
        return "".join(pieces)

    def as_dict(self):
        """Return the setting as a one-entry dict."""
        return dict(maxConcurrentRuns=self.maxConcurrentRuns)
class Command:
    """Holder for a job command: its name and script location."""

    def __init__(self, name, scriptLocation):
        self.name = name
        self.scriptLocation = scriptLocation

    def __repr__(self):
        # The closing quote of the name value belongs before the comma; it
        # used to be emitted after it ("'name': 'x,' 'scriptLocation'").
        return "{'name': '" + str(self.name) + "', 'scriptLocation': '" + str(self.scriptLocation) + "'}"

    def as_dict(self):
        """Return the command as a dict with ``name`` and ``scriptLocation``."""
        return {'name': self.name, 'scriptLocation': self.scriptLocation}
class Connections:
    """Holder for the list of connections attached to a job."""

    def __init__(self, connections=None):
        # A ``[]`` default would be stored as-is, so every default-constructed
        # instance would share (and mutate) the same list object.
        self.connections = [] if connections is None else connections

    def __repr__(self):
        return "{'connections': " + str(self.connections) + "}"

    def as_dict(self):
        """Return the connections as a one-entry dict."""
        return {'connections': self.connections}
class Job:
    """Mutable description of a job: identity, command, arguments,
    connections, retry/concurrency settings and timestamps."""

    def __init__(self):
        """Start from empty strings, default sub-objects and "now"
        timestamps."""
        self.name = ''
        self.description = ''
        self.logUri = ''
        self.role = ''
        self.executionProperty = ExecutionProperty()
        self.command = Command("glueetl", "UNKNOWN")
        self.defaultArguments = {}
        self.connections = Connections()
        self.maxRetries = 1
        self.allocatedCapacity = 1
        self.createdOn = datetime.now()
        self.lastModifiedOn = datetime.now()

    def __repr__(self):
        # One "'key': value" fragment per field; string-valued fields are
        # wrapped in single quotes. The rendering ends with ",\n}".
        fragments = [
            "'command': " + str(self.command),
            "'connections': " + str(self.connections),
            "'createdOn': " + str(self.createdOn),
            "'description': '" + str(self.description) + "'",
            "'defaultArguments': " + str(self.defaultArguments),
            "'executionProperty': " + str(self.executionProperty),
            "'lastModifiedOn': " + str(self.lastModifiedOn),
            "'logUri': '" + str(self.logUri) + "'",
            "'maxRetries': " + str(self.maxRetries),
            "'name': '" + str(self.name) + "'",
            "'role': '" + str(self.role) + "'",
        ]
        return "{" + ",\n".join(fragments) + ",\n}"

    def as_dict(self):
        """Return the job as a dict.

        ``connections`` and ``description`` are only included when they are
        non-empty; ``allocatedCapacity`` is not part of the result.
        """
        result = {'command': self.command.as_dict()}
        if len(self.connections.connections) > 0:
            result['connections'] = self.connections.as_dict()
        result['createdOn'] = self.createdOn
        if len(self.description) > 0:
            result['description'] = self.description
        result.update([
            ('defaultArguments', self.defaultArguments),
            ('executionProperty', self.executionProperty.as_dict()),
            ('lastModifiedOn', self.lastModifiedOn),
            ('logUri', self.logUri),
            ('maxRetries', self.maxRetries),
            ('name', self.name),
            ('role', self.role),
        ])
        return result

    def _copy_without(self, *dropped):
        # Deep copy first so the caller can never reach this job's own
        # containers (e.g. defaultArguments) through the returned dict.
        snapshot = copy.deepcopy(self.as_dict())
        for key in dropped:
            del snapshot[key]
        return snapshot

    def as_job_create_dict(self):
        """Return a deep copy of ``as_dict()`` without the timestamps."""
        return self._copy_without('createdOn', 'lastModifiedOn')

    def as_job_update_dict(self):
        """Return a deep copy of ``as_dict()`` without the name and the
        timestamps."""
        return self._copy_without('name', 'createdOn', 'lastModifiedOn')
class GlueJobUtils:
    """Helper around a boto3 Glue client and S3 resource for listing,
    fetching, creating, updating and deleting jobs."""

    def __init__(self, glue_context):
        """Build the S3 resource and Glue client.

        Proxy URL, Glue endpoint and region are read from the JVM-side
        ``AWSConnectionUtils`` reachable through ``glue_context._jvm``.
        """
        proxy_url = glue_context._jvm.AWSConnectionUtils.getGlueProxyUrl()
        glue_endpoint = glue_context._jvm.AWSConnectionUtils.getGlueEndpoint()
        region = glue_context._jvm.AWSConnectionUtils.getRegion()
        # s3 service calls are not allowed through the proxy for the moment, so we use the s3 vpc endpoint instead
        self.s3 = boto3.resource('s3')
        # Boto does not have an API to set proxy information. It uses environment variables to look up proxy information
        # The first 8 characters are skipped before the 'null' check -
        # presumably the "https://" scheme; TODO confirm.
        if not proxy_url[8:].startswith('null'):
            os.environ['https_proxy'] = proxy_url
        self.glue = boto3.client('glue', endpoint_url=glue_endpoint, region_name=region)

    def _glue_job_response_to_job(self, response_job):
        """Convert a job dict from a Glue response into a ``Job``.

        ``name`` is mandatory (``KeyError`` if absent). Every other field is
        optional: when it is missing a warning is logged and the ``Job``
        default is kept.
        """
        job = Job()
        job.name = response_job['name']

        def _warn_missing(field):
            logging.warning('%s is missing in job response for job %s', field, job.name)

        # Fields copied verbatim onto the Job.
        for field in ('description', 'defaultArguments', 'logUri', 'role'):
            try:
                setattr(job, field, response_job[field])
            except KeyError:
                _warn_missing(field)

        # Nested fields are rebuilt as their wrapper objects; a missing inner
        # key is reported the same way as a missing outer one.
        try:
            execution_property_dict = response_job['executionProperty']
            job.executionProperty = ExecutionProperty(execution_property_dict['maxConcurrentRuns'])
        except KeyError:
            _warn_missing('executionProperty')
        try:
            command_dict = response_job['command']
            job.command = Command(command_dict['name'], command_dict['scriptLocation'])
        except KeyError:
            _warn_missing('command')
        try:
            connections_dict = response_job['connections']
            job.connections = Connections(connections_dict['connections'])
        except KeyError:
            _warn_missing('connections')

        for field in ('maxRetries', 'createdOn', 'lastModifiedOn'):
            try:
                setattr(job, field, response_job[field])
            except KeyError:
                _warn_missing(field)
        return job

    def get_jobs(self, nextToken=''):
        """Fetch one page of jobs.

        :returns: dict with ``jobs`` (list of ``Job``) and, when the response
            carries one, ``NextToken``.
        """
        response = self.glue.get_jobs(nextToken=nextToken)
        list_jobs_response = {}
        try:
            list_jobs_response['NextToken'] = response['NextToken']
        except KeyError:
            logging.info('NextToken is not present in get_jobs response')
        list_jobs_response['jobs'] = [self._glue_job_response_to_job(j) for j in response['jobs']]
        return list_jobs_response

    def get_job(self, jobName):
        """Fetch a single job by name and return it as a ``Job``."""
        response = self.glue.get_job(jobName=jobName)
        return self._glue_job_response_to_job(response['job'])

    def _get_bucket_prefix_from_s3_url(self, s3_url):
        """Split ``s3://bucket/prefix`` into ``{'bucket': ..., 'prefix': ...}``.

        :raises Exception: if the URL does not start with ``s3://``, has no
            prefix part, or ends with ``/``.
        """
        if not s3_url.startswith('s3://'):
            raise Exception('s3 url for scriptLocation should start with s3:// but given %s' % s3_url)
        url_parts = s3_url[5:].split('/', 1)
        if not len(url_parts) == 2:
            raise Exception('s3 url for scriptLocation does not include a prefix: %s' % s3_url)
        if url_parts[1].endswith('/'):
            # The message is double-quoted so the inner '/' no longer ends the
            # literal; it used to evaluate ``str / str`` and raise TypeError.
            raise Exception("s3 url for scriptLocation should not end with '/': %s" % s3_url)
        return {'bucket': url_parts[0], 'prefix': url_parts[1]}

    def _upload_file_to_s3(self, s3_url, file):
        """Upload the local ``file`` to ``s3_url``; an empty ``file`` only
        logs a warning and uploads nothing."""
        if not file:
            logging.warning('script file is not specified, skipping upload of script to s3')
            return
        s3_parts = self._get_bucket_prefix_from_s3_url(s3_url)
        # Close the handle once the upload call returns instead of leaking it.
        with open(file, 'rb') as body:
            self.s3.Object(s3_parts['bucket'], s3_parts['prefix']).put(Body=body)

    def create_job(self, job, file=''):
        """Upload the script (if ``file`` is given) and create the job.

        Best effort: any failure is printed and logged, and ``None`` is
        returned instead of raising.
        """
        try:
            self._upload_file_to_s3(job.command.scriptLocation, file)
            return self.glue.create_job(**job.as_job_create_dict())
        except Exception as inst:
            print(inst)
            logging.error('Failed to create job')
            return None

    def update_job(self, job, file=''):
        """Upload the script (if ``file`` is given) and update the job.

        Best effort: any failure is printed and logged, and ``None`` is
        returned instead of raising.
        """
        try:
            self._upload_file_to_s3(job.command.scriptLocation, file)
            return self.glue.update_job(jobName=job.name, jobUpdate=job.as_job_update_dict())
        except Exception as inst:
            print(inst)
            logging.error('Failed to update job')
            return None

    def delete_job(self, jobName):
        """Delete the job with the given name and return the client response."""
        return self.glue.delete_job(jobName=jobName)