egis 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/egis.gemspec +30 -0
- data/lib/egis/aws_client_provider.rb +30 -0
- data/lib/egis/cartesian_product_generator.rb +17 -0
- data/lib/egis/client.rb +142 -0
- data/lib/egis/configuration.rb +9 -0
- data/lib/egis/database.rb +102 -0
- data/lib/egis/errors.rb +12 -0
- data/lib/egis/output_downloader.rb +21 -0
- data/lib/egis/output_parser.rb +24 -0
- data/lib/egis/partitions_generator.rb +55 -0
- data/lib/egis/query_output_location.rb +13 -0
- data/lib/egis/query_status.rb +76 -0
- data/lib/egis/s3_cleaner.rb +22 -0
- data/lib/egis/s3_location_parser.rb +14 -0
- data/lib/egis/standard_mode.rb +18 -0
- data/lib/egis/table.rb +163 -0
- data/lib/egis/table_data_wiper.rb +51 -0
- data/lib/egis/table_ddl_generator.rb +50 -0
- data/lib/egis/table_schema.rb +49 -0
- data/lib/egis/testing/testing_mode.rb +62 -0
- data/lib/egis/testing.rb +48 -0
- data/lib/egis/types/boolean_serializer.rb +53 -0
- data/lib/egis/types/default_serializer.rb +20 -0
- data/lib/egis/types/integer_serializer.rb +20 -0
- data/lib/egis/types/null_serializer.rb +36 -0
- data/lib/egis/types/string_serializer.rb +20 -0
- data/lib/egis/types/timestamp_serializer.rb +22 -0
- data/lib/egis/types.rb +30 -0
- data/lib/egis/version.rb +5 -0
- data/lib/egis.rb +62 -0
- metadata +106 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA256:
         | 
| 3 | 
            +
              metadata.gz: 1d2255a76aef464d0d8faee0c39be753e1a928133a25de5c4a86f553e51e92ca
         | 
| 4 | 
            +
              data.tar.gz: f18ae28053651576ccf941bd42f5a9bf40db32fed3c1c2025e1bc987844199f8
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 9b143dbc650f8c02ba39f1404a2d9b4ebe36c49182a36b947a1d091079b1c94839320c2d05dd9a0db1516d82c9b03694244e4fa467898302e888addb234e1291
         | 
| 7 | 
            +
              data.tar.gz: a1577d30cbfd63632dbd55052adf0f863f7a5f571e8a8f8a210649446acc786764d8bec6a15be7cd91742f4f7681bfc1d379a9099934f62b8a30379d12a4bd77
         | 
    
        data/egis.gemspec
    ADDED
    
    | @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            # Make `lib/` requirable so the version constant can be read while the spec is evaluated.
            lib_path = File.expand_path('lib', __dir__)
            $LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)
            require 'egis/version'

            Gem::Specification.new do |spec|
              # Identity
              spec.name = 'egis'
              spec.version = Egis::VERSION
              spec.authors = ['Agnieszka Czereba', 'Marek Mateja']
              spec.email = ['agnieszka.czereba@u2i.com', 'marek.mateja@u2i.com']

              # Description and links
              spec.summary = 'A handy wrapper for AWS Athena Ruby SDK.'
              spec.homepage = 'https://github.com/u2i/egis'
              spec.license = 'MIT'

              # Both URIs mirror the homepage; only the changelog has its own address.
              spec.metadata['homepage_uri'] = spec.homepage
              spec.metadata['source_code_uri'] = spec.homepage
              spec.metadata['changelog_uri'] = 'https://github.com/u2i/egis/blob/master/CHANGELOG.md'

              # Package only the gemspec and everything under lib/ that is tracked by git.
              # `git ls-files -z` prints NUL-separated paths, hence the split on "\x0".
              spec.files = Dir.chdir(File.expand_path(__dir__)) { `git ls-files -z egis.gemspec lib/`.split("\x0") }
              spec.require_paths = %w[lib]

              # Runtime dependencies
              spec.add_dependency 'aws-sdk-athena', '~> 1.0'
              spec.add_dependency 'aws-sdk-s3', '~> 1.0'
            end
         | 
| @@ -0,0 +1,30 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'aws-sdk-s3'
         | 
| 4 | 
            +
            require 'aws-sdk-athena'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            module Egis
         | 
| 7 | 
            +
              # @!visibility private
         | 
| 8 | 
            +
              class AwsClientProvider
                # Builds a new S3 client from the current Egis configuration.
                #
                # @return [Aws::S3::Client]
                def s3_client
                  Aws::S3::Client.new(client_config)
                end

                # Builds a new Athena client from the current Egis configuration.
                #
                # @return [Aws::Athena::Client]
                def athena_client
                  Aws::Athena::Client.new(client_config)
                end

                private

                # Collects the AWS options set on the Egis configuration. Options whose
                # value is nil or false are left out of the resulting hash entirely.
                #
                # @return [Hash{Symbol => Object}] options passed to the AWS client constructors
                def client_config
                  settings = Egis.configuration

                  {
                    region: settings.aws_region,
                    access_key_id: settings.aws_access_key_id,
                    secret_access_key: settings.aws_secret_access_key,
                    profile: settings.aws_profile
                  }.select { |_option, value| value }
                end
              end
         | 
| 30 | 
            +
            end
         | 
| @@ -0,0 +1,17 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
         | 
| 4 | 
            +
              # @!visibility private
         | 
| 5 | 
            +
              class CartesianProductGenerator
                # Builds every combination of values across the given keys.
                #
                # Each combination is an array of `[key, value]` pairs, ordered like the keys
                # of the input hash.
                #
                # @example
                #   cartesian_product(a: [1, 2], b: [3]) # => [[[:a, 1], [:b, 3]], [[:a, 2], [:b, 3]]]
                #   cartesian_product(a: [1, 2])         # => [[[:a, 1]], [[:a, 2]]]
                #
                # @param [Hash{Object => Array}] values_by_key possible values for each key
                # @return [Array<Array<Array>>] one list of `[key, value]` pairs per combination;
                #   empty when no keys are given or when any key has no values
                def cartesian_product(values_by_key)
                  # Without keys there is no first value list to call #product on.
                  return [] if values_by_key.empty?

                  keys = values_by_key.keys
                  head, *tail = values_by_key.values

                  # Array#product with no arguments wraps each element in a one-element array,
                  # so a single key goes through the same path as several keys.
                  head.product(*tail).map { |vals| keys.zip(vals) }
                end
              end
         | 
| 17 | 
            +
            end
         | 
    
        data/lib/egis/client.rb
    ADDED
    
    | @@ -0,0 +1,142 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
         | 
| 4 | 
            +
              ##
         | 
| 5 | 
            +
              # The most fundamental {Egis} class. Provides an interface for executing Athena queries.
         | 
| 6 | 
            +
              #
         | 
| 7 | 
            +
              # See configuration instructions {Egis.configure}.
         | 
| 8 | 
            +
              #
         | 
| 9 | 
            +
              # @see Egis.configure
         | 
| 10 | 
            +
              #
         | 
| 11 | 
            +
              # @example Create client and execute asynchronous query
         | 
| 12 | 
            +
              #   client = Egis::Client.new
         | 
| 13 | 
            +
              #   status = client.execute_query('SELECT * FROM my_table;')
         | 
| 14 | 
            +
              #
         | 
| 15 | 
            +
              #   while status.in_progress?
         | 
| 16 | 
            +
              #     # do something useful
         | 
| 17 | 
            +
              #     # ...
         | 
| 18 | 
            +
              #     status = client.query_status(status.id)
         | 
| 19 | 
            +
              #   end
         | 
| 20 | 
            +
              #
         | 
| 21 | 
            +
              #   status.output_location.url # s3://my-bucket/result/path
         | 
| 22 | 
            +
              #
         | 
| 23 | 
            +
              # @example Execute synchronous query and fetch results
         | 
| 24 | 
            +
              #   status = client.execute_query('SELECT MAX(time), MIN(id) FROM my_table;', async: false)
         | 
| 25 | 
            +
              #   status.fetch_result(schema: [:timestamp, :int]) # [[2020-05-04 11:19:03 +0200, 7]]
         | 
| 26 | 
            +
              #
         | 
| 27 | 
            +
              class Client
                # Translation of Athena execution states into Egis query statuses.
                QUERY_STATUS_MAPPING = {
                  'QUEUED' => Egis::QueryStatus::QUEUED,
                  'RUNNING' => Egis::QueryStatus::RUNNING,
                  'SUCCEEDED' => Egis::QueryStatus::FINISHED,
                  'FAILED' => Egis::QueryStatus::FAILED,
                  'CANCELLED' => Egis::QueryStatus::CANCELLED
                }.freeze

                # Seconds to wait before the n-th status poll (n starts at 1) when no custom
                # backoff is configured.
                DEFAULT_QUERY_STATUS_BACKOFF = lambda { |attempt| 1.5**attempt - 1 }

                private_constant :QUERY_STATUS_MAPPING, :DEFAULT_QUERY_STATUS_BACKOFF

                # @param [#athena_client] aws_client_provider source of the Athena client
                # @param [#parse_url] s3_location_parser splits an S3 URL into bucket and path
                def initialize(
                  aws_client_provider: Egis::AwsClientProvider.new,
                  s3_location_parser: Egis::S3LocationParser.new
                )
                  @aws_athena_client = aws_client_provider.athena_client
                  @s3_location_parser = s3_location_parser
                  @query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
                end

                ##
                # Builds an {Egis::Database} object for the given name. Calling this method does not
                # create the database in Athena.
                #
                # @param [String] database_name
                # @return [Egis::Database]
                def database(database_name)
                  Database.new(database_name, client: self)
                end

                ##
                # Executes an Athena query. Queries are asynchronous by default; the active
                # `Egis.mode` has the final say on whether the call returns immediately.
                #
                # @param [String] query SQL query to execute
                # @param [Boolean] async Whether to run the query asynchronously or block until it finishes
                # @param [String] work_group Athena work group the query will be executed in. Falls back to the
                #   work group from the Egis configuration.
                # @param [String] database Run the query in the context of a specific database (implicit table
                #   references are expected to be in the given database).
                # @param [String] output_location S3 url of the desired output location. By default, Athena uses
                #   the location defined by the work group.
                # @raise [Egis::Errors::QueryExecutionError] when a blocking query ends in any state other
                #   than finished
                # @return [Egis::QueryStatus]
                def execute_query(query, work_group: nil, database: nil, output_location: nil, async: true)
                  request = query_execution_params(query, work_group, database, output_location)
                  execution_id = aws_athena_client.start_query_execution(request).query_execution_id

                  return query_status(execution_id) if Egis.mode.async(async)

                  final_status = wait_for_query_to_finish(execution_id)
                  return final_status if final_status.finished?

                  raise Egis::Errors::QueryExecutionError, final_status.message
                end

                ##
                # Checks the status of a query execution.
                #
                # @param [String] query_id Query id from {Egis::QueryStatus} returned by {#execute_query}
                # @raise [KeyError] when Athena reports a state missing from the status mapping
                # @return [Egis::QueryStatus]
                def query_status(query_id)
                  execution = aws_athena_client.get_query_execution(query_execution_id: query_id).query_execution
                  execution_status = execution.status

                  Egis::QueryStatus.new(
                    execution.query_execution_id,
                    QUERY_STATUS_MAPPING.fetch(execution_status.state),
                    execution_status.state_change_reason,
                    parse_output_location(execution)
                  )
                end

                private

                attr_reader :aws_athena_client, :s3_location_parser, :query_status_backoff

                # Assembles the request for `start_query_execution`, adding only the optional
                # sections that were actually requested.
                def query_execution_params(query, work_group, database, output_location)
                  params = {query_string: query}

                  selected_work_group = work_group || Egis.configuration.work_group
                  params[:work_group] = selected_work_group if selected_work_group

                  if database
                    params[:query_execution_context] = {database: database_name(database)}
                  end

                  if output_location
                    params[:result_configuration] = {output_location: translate_path(output_location)}
                  end

                  params
                end

                # Polls the query status, sleeping according to the backoff before every poll,
                # until the query is neither queued nor running.
                def wait_for_query_to_finish(query_execution_id)
                  # Integer#step without a limit counts upwards forever; the loop ends via `return`.
                  1.step do |attempt|
                    sleep(query_status_backoff.call(attempt))

                    current = query_status(query_execution_id)
                    return current unless current.queued? || current.running?
                  end
                end

                # Wraps the output URL of a query execution together with its bucket and path.
                def parse_output_location(query_execution)
                  output_url = query_execution.result_configuration.output_location
                  bucket, path = s3_location_parser.parse_url(output_url)

                  QueryOutputLocation.new(output_url, bucket, path)
                end

                # Lets the active mode rewrite an S3 URL before it is sent to Athena.
                def translate_path(s3_url)
                  Egis.mode.s3_path(s3_url)
                end

                # Lets the active mode rewrite a database name before it is sent to Athena.
                def database_name(name)
                  Egis.mode.database_name(name)
                end
              end
         | 
| 142 | 
            +
            end
         | 
| @@ -0,0 +1,102 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
         | 
| 4 | 
            +
              ##
         | 
| 5 | 
            +
              # Interface for database manipulation and querying.
         | 
| 6 | 
            +
              #
         | 
| 7 | 
            +
              # Extends the interface of {Egis::Client} but all the queries scheduled using {Egis::Database} are executed
         | 
| 8 | 
            +
              # within the database's context. SQL table references without explicit database will implicitly refer to
         | 
| 9 | 
            +
              # the database they are executed from.
         | 
| 10 | 
            +
              #
         | 
| 11 | 
            +
              # It is recommended to create database objects using {Egis::Client#database} method.
         | 
| 12 | 
            +
              #
         | 
| 13 | 
            +
              class Database
                # @param [String] database_name
                # @param [#execute_query, #query_status] client client used to run every query
                # @param [#download] output_downloader reads query output; used by {#exists?}
                def initialize(database_name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
                  @client = client
                  @database_name = database_name
                  @output_downloader = output_downloader
                end

                ##
                # Creates {Egis::Table} object. Executing it doesn't create Athena table yet.
                #
                # @param [String] table_name
                # @param [Egis::TableSchema] table_schema
                # @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
                # @param [:tsv, :csv, :orc] format Table format (defaults to :tsv)
                # @return [Egis::Table]
                def table(table_name, table_schema, table_location, **options)
                  Table.new(self, table_name, table_schema, table_location, options: options)
                end

                ##
                # Creates database in Athena.
                #
                # @return [void]
                def create
                  client.execute_query("CREATE DATABASE IF NOT EXISTS #{translate_name(database_name)};", async: false)
                end

                ##
                # The same as {#create} but raising error if it already exists.
                #
                # @return [void]
                def create!
                  client.execute_query("CREATE DATABASE #{translate_name(database_name)};", async: false)
                end

                ##
                # Removes database in Athena.
                #
                # @return [void]
                def drop
                  client.execute_query("DROP DATABASE IF EXISTS #{translate_name(database_name)} CASCADE;", async: false)
                end

                ##
                # The same as {#drop} but raising error if the database does not exist.
                #
                # @return [void]
                def drop!
                  client.execute_query("DROP DATABASE #{translate_name(database_name)} CASCADE;", async: false)
                end

                ##
                # (see Egis::Client#execute_query)
                def execute_query(query, **options)
                  # The untranslated name is passed on purpose: the client translates the
                  # `database` option itself. Caller-supplied options win over the default.
                  client.execute_query(query, **{database: database_name, **options})
                end

                ##
                # (see Egis::Client#query_status)
                def query_status(query_id)
                  client.query_status(query_id)
                end

                ##
                # Checks whether database with such name exists in Athena.
                #
                # The lookup uses the mode-translated name, the same one {#create} and {#drop}
                # operate on, so the answer matches what those methods created or removed.
                #
                # @return [Boolean]
                def exists?
                  athena_name = translate_name(database_name)

                  query_status = client.execute_query("SHOW DATABASES LIKE '#{athena_name}';", async: false)
                  parsed_result = output_downloader.download(query_status.output_location)

                  # LIKE may match more than one database, so require an exact name in the output.
                  parsed_result.flatten.include?(athena_name)
                end

                private

                attr_reader :client, :database_name, :output_downloader

                # Lets the active mode rewrite the database name before it is used in SQL.
                def translate_name(name)
                  Egis.mode.database_name(name)
                end
              end
         | 
| 102 | 
            +
            end
         | 
    
        data/lib/egis/errors.rb
    ADDED
    
    | @@ -0,0 +1,12 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
         | 
| 4 | 
            +
              module Errors
                # Common ancestor of all Egis errors, so callers can rescue them in one clause.
                class Error < StandardError
                end

                # NOTE(review): presumably signals a table format Egis cannot handle; the raise
                # site is not part of this file.
                class UnsupportedTableFormat < Error
                end

                # Raised by a blocking query execution that ends in any state other than finished.
                class QueryExecutionError < Error
                end

                # Raised when partition values are missing while generating partition SQL.
                class PartitionError < Error
                end

                # Egis-specific type error. Inside this namespace the bare name `TypeError` refers to
                # this class rather than Ruby's built-in `::TypeError`.
                class TypeError < Error
                end
              end
         | 
| 12 | 
            +
            end
         | 
| @@ -0,0 +1,21 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'csv'
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            module Egis
         | 
| 6 | 
            +
              # @!visibility private
         | 
| 7 | 
            +
              class OutputDownloader
                # @param [#s3_client] aws_client_provider source of the S3 client used for downloads
                def initialize(aws_client_provider: Egis::AwsClientProvider.new)
                  @s3_client = aws_client_provider.s3_client
                end

                # Fetches the object at the given location from S3 and parses its whole body as CSV.
                #
                # @param [#bucket, #key] output_location location of the query output
                # @return [Array<Array<String, nil>>] parsed rows, in file order
                def download(output_location)
                  s3_object = s3_client.get_object(bucket: output_location.bucket, key: output_location.key)
                  raw_csv = s3_object.body.read

                  CSV.parse(raw_csv)
                end

                private

                attr_reader :s3_client
              end
         | 
| 21 | 
            +
            end
         | 
| @@ -0,0 +1,24 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
         | 
| 4 | 
            +
              # @!visibility private
         | 
| 5 | 
            +
              class OutputParser
                # Converts raw query output into typed rows.
                #
                # The first row of `output` is treated as the header and dropped from the result.
                # Column N of every remaining row is loaded with the serializer for `types[N]`;
                # columns without a type use the default serializer.
                #
                # @param [Array<Array<String, nil>>] output rows of the query output, header first
                # @param [Array] types column types, in column order; may be shorter than the header
                # @return [Array<Array>] deserialized rows, or an empty array when `output` has no
                #   header row at all
                def parse(output, types)
                  header, *content = output

                  # Output with no header row has no columns to type and no rows to convert.
                  return [] if header.nil?

                  column_serializers = serializers(header, types)

                  content.map do |row|
                    row.zip(column_serializers).map { |string, serializer| serializer.load(string) }
                  end
                end

                private

                # Picks one serializer per header column. The header only determines how many
                # serializers are built; columns beyond the end of `types` get the default one.
                def serializers(row, types)
                  row.zip(types).map { |_, type| type ? Types.serializer(type) : Types::DefaultSerializer.new }
                end
              end
         | 
| 24 | 
            +
            end
         | 
| @@ -0,0 +1,55 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
              # @!visibility private
              #
              # Builds the `ALTER TABLE ... ADD` SQL statement that registers every
              # combination of the supplied partition values.
              class PartitionsGenerator
                # @param cartesian_product_generator object responding to
                #   +cartesian_product(values_by_partition)+
                def initialize(cartesian_product_generator: Egis::CartesianProductGenerator.new)
                  @cartesian_product_generator = cartesian_product_generator
                end

                # Renders the statement for the given table and partition values.
                #
                # @param table_name name interpolated verbatim after +ALTER TABLE+
                # @param [Hash] values_by_partition partition name => collection of values
                # @param [Boolean] permissive when truthy, +IF NOT EXISTS+ is added
                # @return [String] SQL statement terminated with ";" and a newline
                # @raise [Errors::PartitionError] when the hash is nil, empty, or any
                #   partition has an empty collection of values
                def to_sql(table_name, values_by_partition, permissive: false)
                  validate_partition_values(values_by_partition)

                  header = "ALTER TABLE #{table_name} ADD #{permissive_statement(permissive)}"
                  "#{header}\n  #{partitions_definition(values_by_partition)};\n"
                end

                private

                attr_reader :cartesian_product_generator

                # Raises unless every partition comes with at least one value.
                def validate_partition_values(values_by_partition)
                  return unless partition_values_missing?(values_by_partition)

                  raise Errors::PartitionError, 'Partition value(s) missing'
                end

                # True when there is nothing to build partitions from.
                def partition_values_missing?(values_by_partition)
                  return true if values_by_partition.nil? || values_by_partition.empty?

                  values_by_partition.values.any? { |values| values.empty? }
                end

                # Returns the +IF NOT EXISTS+ modifier, or nil when not permissive.
                def permissive_statement(permissive)
                  permissive ? 'IF NOT EXISTS' : nil
                end

                # One PARTITION clause per combination of values, newline separated.
                def partitions_definition(values_by_partition)
                  combinations = cartesian_product_generator.cartesian_product(values_by_partition)
                  clauses = combinations.map { |combination| partition_values_clause(combination) }
                  clauses.join("\n")
                end

                def partition_values_clause(partition_values_combination)
                  assignments = partition_values(partition_values_combination).join(', ')
                  "PARTITION (#{assignments})"
                end

                # Renders +name = value+ pairs. String values are wrapped in single
                # quotes; every other value is interpolated as-is.
                # NOTE(review): string values are not escaped, so a value containing a
                # single quote produces malformed SQL.
                def partition_values(partition_values_combination)
                  partition_values_combination.map do |partition_name, value|
                    literal = value.is_a?(String) ? "'#{value}'" : value
                    "#{partition_name} = #{literal}"
                  end
                end
              end
            end
         | 
| @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
              ##
              # Describes where a query's output file is stored. Members are populated
              # positionally: +url+, +bucket+, +key+.
              #
              # @!attribute [r] url
              #   @return [String] URL of the query output file
              # @!attribute [r] bucket
              #   @return [String] S3 bucket holding the query output
              # @!attribute [r] key
              #   @return [String] S3 path of the query output

              QueryOutputLocation = Struct.new(:url, :bucket, :key)
            end
         | 
| @@ -0,0 +1,76 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
              ##
              # Snapshot of a query execution: its ID, status, message and output location.
              #
              # @!attribute [r] id
              #   @return [String] Athena query execution ID
              # @!attribute [r] status
              #   @return [:queued, :running, :finished, :failed, :cancelled]
              # @!attribute [r] message
              #   @return [String]
              # @!attribute [r] output_location
              #   @return [Egis::QueryOutputLocation]
              #
              class QueryStatus
                QUEUED = :queued
                RUNNING = :running
                FINISHED = :finished
                FAILED = :failed
                CANCELLED = :cancelled

                STATUSES = [QUEUED, RUNNING, FINISHED, FAILED, CANCELLED].freeze

                attr_reader :id, :status, :message, :output_location

                ##
                # @param [String] id query execution ID
                # @param [Symbol] status one of {STATUSES}
                # @param [String] message
                # @param [Egis::QueryOutputLocation] output_location
                # @param output_downloader object responding to +download(output_location)+
                # @param output_parser object responding to +parse(output, schema)+
                # @raise [ArgumentError] when +status+ is not one of {STATUSES}
                def initialize(id, status, message, output_location,
                               output_downloader: Egis::OutputDownloader.new,
                               output_parser: Egis::OutputParser.new)
                  raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)

                  @id = id
                  @status = status
                  @message = message
                  @output_location = output_location
                  @output_downloader = output_downloader
                  @output_parser = output_parser
                end

                # @return [Boolean] true when the status is +:finished+
                def finished?
                  status == FINISHED
                end

                # @return [Boolean] true when the status is +:failed+
                def failed?
                  status == FAILED
                end

                # @return [Boolean] true when the status is +:cancelled+
                def cancelled?
                  status == CANCELLED
                end

                # @return [Boolean] true when the status is +:queued+
                def queued?
                  status == QUEUED
                end

                # @return [Boolean] true when the status is +:running+
                def running?
                  status == RUNNING
                end

                # @return [Boolean] true while the query is either queued or running
                def in_progress?
                  [RUNNING, QUEUED].include?(status)
                end

                ##
                # Download query result.
                #
                # By default, Egis will just parse output CSV and return array of string arrays. Additionally, you
                # can pass expected query result column types to parse them into Ruby objects accordingly.
                #
                # The output is downloaded from {#output_location} regardless of the current status;
                # no check is made here that the query has finished.
                #
                # @param [Array] schema Array with expected query column types
                # @return [Array] Array of row values
                def fetch_result(schema: [])
                  output = output_downloader.download(output_location)
                  output_parser.parse(output, schema)
                end

                private

                attr_reader :output_downloader, :output_parser
              end
            end
         | 
| @@ -0,0 +1,22 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
              # @!visibility private
              #
              # Removes every S3 object stored under a given bucket prefix.
              class S3Cleaner
                # @param aws_client_provider object responding to +s3_client+
                def initialize(aws_client_provider: Egis::AwsClientProvider.new)
                  @s3_client = aws_client_provider.s3_client
                end

                # Deletes all objects whose key starts with +prefix+.
                #
                # A single listing response holds at most 1000 keys, so the listing is
                # followed page by page until it is no longer truncated; each page is
                # removed with its own +delete_objects+ call.
                #
                # @param [String] bucket
                # @param [String] prefix
                # @return [nil]
                def delete(bucket, prefix)
                  continuation_token = nil

                  loop do
                    page = list_page(bucket, prefix, continuation_token)
                    delete_page(bucket, page.contents)
                    break unless page.is_truncated

                    continuation_token = page.next_continuation_token
                  end
                end

                private

                attr_reader :s3_client

                # Fetches one listing page. The continuation token is only sent when
                # resuming, so the first request is identical to a plain listing.
                def list_page(bucket, prefix, continuation_token)
                  params = {bucket: bucket, prefix: prefix}
                  params[:continuation_token] = continuation_token if continuation_token
                  s3_client.list_objects_v2(**params)
                end

                # Deletes the objects of a single listing page; no request for an empty page.
                def delete_page(bucket, contents)
                  return if contents.empty?

                  objects_to_remove = contents.map { |content| {key: content.key} }
                  s3_client.delete_objects(bucket: bucket, delete: {objects: objects_to_remove})
                end
              end
            end
         | 
| @@ -0,0 +1,14 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            module Egis
              # @!visibility private
              #
              # Splits an +s3://bucket/key+ URL into its bucket and key parts.
              class S3LocationParser
                # Anchored with \A and \z so the whole string has to be the URL; ^ and $
                # would match per line and accept multi-line input.
                S3_URL_PATTERN = %r{\As3://(?<bucket>\S+?)/(?<key>\S+)\z}.freeze

                # @param [String] url S3 URL in the form +s3://bucket/key+
                # @return [Array<String>] two-element array: bucket, key
                # @raise [ArgumentError] when +url+ is not an S3 URL with a bucket and a key
                def parse_url(url)
                  matched_data = S3_URL_PATTERN.match(url)
                  raise ArgumentError, "Invalid S3 URL: #{url.inspect}" unless matched_data

                  [matched_data['bucket'], matched_data['key']]
                end
              end
            end
         |