egis 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1d2255a76aef464d0d8faee0c39be753e1a928133a25de5c4a86f553e51e92ca
4
+ data.tar.gz: f18ae28053651576ccf941bd42f5a9bf40db32fed3c1c2025e1bc987844199f8
5
+ SHA512:
6
+ metadata.gz: 9b143dbc650f8c02ba39f1404a2d9b4ebe36c49182a36b947a1d091079b1c94839320c2d05dd9a0db1516d82c9b03694244e4fa467898302e888addb234e1291
7
+ data.tar.gz: a1577d30cbfd63632dbd55052adf0f863f7a5f571e8a8f8a210649446acc786764d8bec6a15be7cd91742f4f7681bfc1d379a9099934f62b8a30379d12a4bd77
data/egis.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
# Gem specification for egis.

# Make the gem's own lib/ directory loadable so the version constant can be required below.
lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'egis/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'egis'
  spec.version  = Egis::VERSION
  spec.summary  = 'A handy wrapper for AWS Athena Ruby SDK.'
  spec.license  = 'MIT'
  spec.homepage = 'https://github.com/u2i/egis'

  # Maintainers
  spec.authors = ['Agnieszka Czereba', 'Marek Mateja']
  spec.email   = %w[agnieszka.czereba@u2i.com marek.mateja@u2i.com]

  # Links stored in the gem metadata
  {
    'homepage_uri' => spec.homepage,
    'source_code_uri' => spec.homepage,
    'changelog_uri' => 'https://github.com/u2i/egis/blob/master/CHANGELOG.md'
  }.each { |metadata_key, uri| spec.metadata[metadata_key] = uri }

  # Package only the gemspec and lib/ files tracked by git.
  # `git ls-files -z` separates paths with NUL bytes, hence the split on "\x0".
  gem_root = File.expand_path(__dir__)
  spec.files = Dir.chdir(gem_root) { `git ls-files -z egis.gemspec lib/`.split("\x0") }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'aws-sdk-athena', '~> 1.0'
  spec.add_dependency 'aws-sdk-s3', '~> 1.0'
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aws-sdk-s3'
4
+ require 'aws-sdk-athena'
5
+
6
module Egis
  # @!visibility private
  #
  # Builds AWS SDK clients configured from the values held by `Egis.configuration`.
  class AwsClientProvider
    # @return [Aws::S3::Client] a new S3 client
    def s3_client
      Aws::S3::Client.new(client_config)
    end

    # @return [Aws::Athena::Client] a new Athena client
    def athena_client
      Aws::Athena::Client.new(client_config)
    end

    private

    # Collects the client options that are actually set; unset (nil/false) settings are left out
    # of the hash entirely rather than passed as nil.
    #
    # @return [Hash] options hash handed to the client constructors
    def client_config
      settings = Egis.configuration

      {
        region: settings.aws_region,
        access_key_id: settings.aws_access_key_id,
        secret_access_key: settings.aws_secret_access_key,
        profile: settings.aws_profile
      }.select { |_option, value| value }
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Expands a hash of candidate values per key into every possible key/value combination.
  class CartesianProductGenerator
    ##
    # Builds the cartesian product of the value lists, keeping each value paired with its key.
    #
    # @example
    #   cartesian_product(a: [1, 2], b: ['x'])
    #   # => [[[:a, 1], [:b, 'x']], [[:a, 2], [:b, 'x']]]
    #
    # @param [Hash{Object => Array}] values_by_key candidate values for every key
    # @return [Array<Array<Array>>] one element per combination, each a list of `[key, value]` pairs
    #   in the hash's key order; an empty array when the hash has no keys
    def cartesian_product(values_by_key)
      # Without any key there is no first value list to call `product` on.
      return [] if values_by_key.empty?

      keys = values_by_key.keys
      head, *tail = values_by_key.values

      # For a single key `tail` is an empty array and `product` with no arguments wraps every value
      # in a one-element array, so no special case is needed.
      head.product(*tail).map { |combination| keys.zip(combination) }
    end
  end
end
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # The most fundamental {Egis} class: starts Athena queries and reports on their status.
  #
  # See configuration instructions {Egis.configure}.
  #
  # @see Egis.configure
  #
  # @example Create client and execute asynchronous query
  #   client = Egis::Client.new
  #   status = client.execute_query('SELECT * FROM my_table;')
  #
  #   while status.in_progress?
  #     # do something useful
  #     # ...
  #     status = client.query_status(status.id)
  #   end
  #
  #   status.output_location.url # s3://my-bucket/result/path
  #
  # @example Execute synchronous query and fetch results
  #   status = client.execute_query('SELECT MAX(time), MIN(id) FROM my_table;', async: false)
  #   status.fetch_result(schema: [:timestamp, :int]) # [[2020-05-04 11:19:03 +0200, 7]]
  #
  class Client
    # Athena execution state names mapped to Egis status symbols.
    QUERY_STATUS_MAPPING = {
      'QUEUED' => Egis::QueryStatus::QUEUED,
      'RUNNING' => Egis::QueryStatus::RUNNING,
      'SUCCEEDED' => Egis::QueryStatus::FINISHED,
      'FAILED' => Egis::QueryStatus::FAILED,
      'CANCELLED' => Egis::QueryStatus::CANCELLED
    }.freeze

    # Seconds to sleep before the n-th status check when no backoff is configured.
    DEFAULT_QUERY_STATUS_BACKOFF = ->(attempt) { 1.5**attempt - 1 }

    private_constant :QUERY_STATUS_MAPPING, :DEFAULT_QUERY_STATUS_BACKOFF

    def initialize(aws_client_provider: Egis::AwsClientProvider.new, s3_location_parser: Egis::S3LocationParser.new)
      @aws_athena_client = aws_client_provider.athena_client
      @s3_location_parser = s3_location_parser
      @query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
    end

    ##
    # Creates {Egis::Database} object with a given name. Executing it doesn't create Athena database yet.
    #
    # @param [String] database_name
    # @return [Egis::Database]
    def database(database_name)
      Database.new(database_name, client: self)
    end

    ##
    # Executes Athena query. By default, queries are executed asynchronously.
    #
    # @param [String] query SQL query to execute
    # @param [Boolean] async Decide whether you want to run query asynchronously or block execution until it finishes
    # @param [String] work_group Change Athena work group the query will be executed in.
    # @param [String] database Run query in the context of a specific database (implicit table references are expected
    #   to be in given database).
    # @param [String] output_location S3 url of the desired output location. By default, Athena uses the location
    #   defined by the work group.
    # @return [Egis::QueryStatus]
    # @raise [Egis::Errors::QueryExecutionError] when a blocking query ends in any state other than finished
    def execute_query(query, work_group: nil, database: nil, output_location: nil, async: true)
      request = query_execution_params(query, work_group, database, output_location)
      execution_id = aws_athena_client.start_query_execution(request).query_execution_id

      # The active mode has the final say on whether the call is asynchronous.
      return query_status(execution_id) if Egis.mode.async(async)

      final_status = wait_for_query_to_finish(execution_id)
      return final_status if final_status.finished?

      raise Egis::Errors::QueryExecutionError, final_status.message
    end

    ##
    # Check the status of asynchronous query execution.
    #
    # @param [String] query_id Query id from {Egis::QueryStatus} returned by {#execute_query} method
    # @return [Egis::QueryStatus]
    def query_status(query_id)
      execution = aws_athena_client.get_query_execution(query_execution_id: query_id).query_execution
      execution_state = execution.status

      Egis::QueryStatus.new(
        execution.query_execution_id,
        QUERY_STATUS_MAPPING.fetch(execution_state.state),
        execution_state.state_change_reason,
        parse_output_location(execution)
      )
    end

    private

    attr_reader :aws_athena_client, :s3_location_parser, :query_status_backoff

    # Builds the request for starting a query; optional entries are added only when a value is present.
    def query_execution_params(query, work_group, database, output_location)
      effective_work_group = work_group || Egis.configuration.work_group

      {query_string: query}.tap do |request|
        request[:work_group] = effective_work_group if effective_work_group
        request[:query_execution_context] = {database: database_name(database)} if database
        request[:result_configuration] = {output_location: translate_path(output_location)} if output_location
      end
    end

    # Polls the query status, sleeping for the backoff duration before every check (attempts count from 1),
    # and returns the first status that is neither queued nor running.
    def wait_for_query_to_finish(query_execution_id)
      1.step do |attempt|
        sleep(query_status_backoff.call(attempt))

        current_status = query_status(query_execution_id)
        return current_status unless current_status.queued? || current_status.running?
      end
    end

    # Wraps the execution's output URL together with its parsed bucket and key.
    def parse_output_location(query_execution)
      output_url = query_execution.result_configuration.output_location
      bucket, key = s3_location_parser.parse_url(output_url)

      QueryOutputLocation.new(output_url, bucket, key)
    end

    def translate_path(s3_url)
      Egis.mode.s3_path(s3_url)
    end

    def database_name(name)
      Egis.mode.database_name(name)
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Plain holder of Egis settings. Every attribute can be read and written; nothing is set by default.
  class Configuration
    # AWS connection settings.
    attr_accessor :aws_region, :aws_access_key_id, :aws_secret_access_key, :aws_profile

    # Query execution settings: default work group and the callable computing the delay between status checks.
    attr_accessor :work_group, :query_status_backoff

    # S3 bucket setting; judging by the name it is meant for the testing mode (its consumer is not in this file).
    attr_accessor :testing_s3_bucket
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Interface for database manipulation and querying.
  #
  # Extends the interface of {Egis::Client} but all the queries scheduled using {Egis::Database} are executed
  # within the database's context. SQL table references without explicit database will implicitly refer to
  # the database they are executed from.
  #
  # It is recommended to create database objects using {Egis::Client#database} method.
  #
  class Database
    def initialize(database_name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
      @client = client
      @database_name = database_name
      @output_downloader = output_downloader
    end

    ##
    # Creates {Egis::Table} object. Executing it doesn't create Athena table yet.
    #
    # @param [String] table_name
    # @param [Egis::TableSchema] table_schema
    # @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
    # @param [:tsv, :csv, :orc] format Table format (defaults to :tsv)
    # @return [Egis::Table]
    def table(table_name, table_schema, table_location, **options)
      Table.new(self, table_name, table_schema, table_location, options: options)
    end

    ##
    # Creates database in Athena.
    #
    # @return [void]
    def create
      client.execute_query("CREATE DATABASE IF NOT EXISTS #{translate_name(database_name)};", async: false)
    end

    ##
    # The same as {#create} but raising error if it already exists.
    #
    # @return [void]
    def create!
      client.execute_query("CREATE DATABASE #{translate_name(database_name)};", async: false)
    end

    ##
    # Removes database in Athena.
    #
    # @return [void]
    def drop
      client.execute_query("DROP DATABASE IF EXISTS #{translate_name(database_name)} CASCADE;", async: false)
    end

    ##
    # The same as {#drop} but raising error if the database does not exist.
    #
    # @return [void]
    def drop!
      client.execute_query("DROP DATABASE #{translate_name(database_name)} CASCADE;", async: false)
    end

    ##
    # (see Egis::Client#execute_query)
    def execute_query(query, **options)
      # The database context defaults to this database; an explicit :database option takes precedence.
      client.execute_query(query, database: database_name, **options)
    end

    ##
    # (see Egis::Client#query_status)
    def query_status(query_id)
      client.query_status(query_id)
    end

    ##
    # Checks whether database with such name exists in Athena.
    #
    # The name is passed through the active mode first, exactly as {#create} and {#drop} do, so the
    # check looks for the same database those methods create and remove.
    #
    # @return [Boolean]
    def exists?
      athena_name = translate_name(database_name)

      status = client.execute_query("SHOW DATABASES LIKE '#{athena_name}';", async: false)
      listed_names = output_downloader.download(status.output_location).flatten

      # LIKE is a pattern match, so confirm that the exact name is among the rows returned.
      listed_names.include?(athena_name)
    end

    private

    attr_reader :client, :database_name, :output_downloader

    # Maps the logical database name to the name used in Athena for the active mode.
    def translate_name(name)
      Egis.mode.database_name(name)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # Exception classes raised by Egis.
  module Errors
    # Common ancestor of every error below, so callers can rescue all of them at once.
    Error = Class.new(StandardError)

    UnsupportedTableFormat = Class.new(Error)
    QueryExecutionError = Class.new(Error)
    PartitionError = Class.new(Error)

    # NOTE: inside Egis::Errors this constant shadows Ruby's built-in ::TypeError.
    TypeError = Class.new(Error)
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Egis
  # @!visibility private
  #
  # Reads a query output object from S3 and parses its body as CSV.
  class OutputDownloader
    def initialize(aws_client_provider: Egis::AwsClientProvider.new)
      @s3_client = aws_client_provider.s3_client
    end

    # @param [#bucket, #key] output_location S3 coordinates of the object to read
    # @return [Array<Array<String>>] parsed CSV rows
    def download(output_location)
      bucket = output_location.bucket
      key = output_location.key

      raw_csv = s3_client.get_object(bucket: bucket, key: key).body.read
      CSV.parse(raw_csv)
    end

    private

    attr_reader :s3_client
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Converts raw query output rows (arrays of strings) into typed Ruby values.
  class OutputParser
    ##
    # The first row of `output` is treated as the header and is not part of the result. Each remaining
    # cell is passed through the serializer for its column position.
    #
    # @param [Array<Array<String>>] output raw rows, header first; may be empty or nil
    # @param [Array] types expected column types by position; columns without a type use the default serializer
    # @return [Array<Array>] parsed rows; an empty array when there is no header row
    def parse(output, types)
      header, *rows = output

      # An output without a header row has nothing to parse.
      return [] if header.nil?

      column_serializers = serializers(header, types)

      rows.map do |row|
        row.zip(column_serializers).map { |raw_value, serializer| serializer.load(raw_value) }
      end
    end

    private

    # Builds one serializer per header column; `types` may be shorter than the header.
    def serializers(row, types)
      row.zip(types).map { |_, type| type ? Types.serializer(type) : Types::DefaultSerializer.new }
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Renders the `ALTER TABLE ... ADD PARTITION` statement for every combination of partition values.
  class PartitionsGenerator
    def initialize(cartesian_product_generator: Egis::CartesianProductGenerator.new)
      @cartesian_product_generator = cartesian_product_generator
    end

    # @param [String] table_name table the partitions are added to
    # @param [Hash{Object => Array}] values_by_partition values to combine, keyed by partition name
    # @param [Boolean] permissive when true the statement includes `IF NOT EXISTS`
    # @return [String] the SQL statement, terminated with a semicolon and a newline
    # @raise [Egis::Errors::PartitionError] when the hash is nil, empty, or has a partition without values
    def to_sql(table_name, values_by_partition, permissive: false)
      validate_partition_values(values_by_partition)

      <<~SQL
        ALTER TABLE #{table_name} ADD #{permissive_statement(permissive)}
        #{partitions_definition(values_by_partition)};
      SQL
    end

    private

    attr_reader :cartesian_product_generator

    def validate_partition_values(values_by_partition)
      return unless partition_values_missing?(values_by_partition)

      raise Errors::PartitionError, 'Partition value(s) missing'
    end

    def partition_values_missing?(values_by_partition)
      return true if values_by_partition.nil? || values_by_partition.empty?

      values_by_partition.values.any?(&:empty?)
    end

    # nil (rendered as an empty string when interpolated) unless the permissive flag is set.
    def permissive_statement(permissive)
      permissive ? 'IF NOT EXISTS' : nil
    end

    # One PARTITION clause per value combination, one clause per line.
    def partitions_definition(values_by_partition)
      combinations = cartesian_product_generator.cartesian_product(values_by_partition)
      clauses = combinations.map { |combination| partition_values_clause(combination) }
      clauses.join("\n")
    end

    def partition_values_clause(partition_values_combination)
      assignments = partition_values(partition_values_combination)
      "PARTITION (#{assignments.join(', ')})"
    end

    # Strings are wrapped in single quotes; every other value is rendered through its string form.
    def partition_values(partition_values_combination)
      partition_values_combination.map do |partition_name, value|
        rendered_value = value.is_a?(String) ? "'#{value}'" : value.to_s
        "#{partition_name} = #{rendered_value}"
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Describes where the output of a query is stored in S3.
  #
  # @!attribute [r] url
  #   @return [String] URL of the query output file
  # @!attribute [r] bucket
  #   @return [String] S3 bucket holding the query output
  # @!attribute [r] key
  #   @return [String] S3 path (object key) of the query output
  QueryOutputLocation = Struct.new(:url, :bucket, :key)
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Snapshot of a query execution's state, with access to its result.
  #
  # @!attribute [r] id
  #   @return [String] Athena query execution ID
  # @!attribute [r] status
  #   @return [:queued, :running, :finished, :failed, :cancelled]
  # @!attribute [r] message
  #   @return [String]
  # @!attribute [r] output_location
  #   @return [Egis::QueryOutputLocation]
  #
  class QueryStatus
    QUEUED = :queued
    RUNNING = :running
    FINISHED = :finished
    FAILED = :failed
    CANCELLED = :cancelled

    # Every status value accepted by the constructor.
    STATUSES = [QUEUED, RUNNING, FINISHED, FAILED, CANCELLED].freeze

    attr_reader :id, :status, :message, :output_location

    # @raise [ArgumentError] when `status` is not one of {STATUSES}
    def initialize(id, status, message, output_location,
                   output_downloader: Egis::OutputDownloader.new,
                   output_parser: Egis::OutputParser.new)
      raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)

      @id = id
      @status = status
      @message = message
      @output_location = output_location
      @output_downloader = output_downloader
      @output_parser = output_parser
    end

    def queued?
      status == QUEUED
    end

    def running?
      status == RUNNING
    end

    def finished?
      status == FINISHED
    end

    def failed?
      status == FAILED
    end

    # True while the query is waiting to run or still running.
    def in_progress?
      running? || queued?
    end

    ##
    # Download query result.
    #
    # Without a schema the output CSV is parsed into an array of string arrays. Passing the expected
    # column types additionally converts the values into Ruby objects.
    #
    # @param [Array] schema Array with expected query column types
    # @return [Array] Array of row values
    def fetch_result(schema: [])
      raw_output = output_downloader.download(output_location)
      output_parser.parse(raw_output, schema)
    end

    private

    attr_reader :output_downloader, :output_parser
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Removes every S3 object stored under a given prefix.
  class S3Cleaner
    def initialize(aws_client_provider: Egis::AwsClientProvider.new)
      @s3_client = aws_client_provider.s3_client
    end

    ##
    # Deletes all objects whose key starts with `prefix`.
    #
    # A listing response may be truncated, so the listing is followed page by page using the
    # continuation token until the last page has been processed. Each page is deleted in its own batch.
    #
    # @param [String] bucket
    # @param [String] prefix
    # @return [Object, nil] response of the last delete request, or nil when nothing was deleted
    def delete(bucket, prefix)
      last_delete_response = nil
      continuation_token = nil

      loop do
        listing_params = {bucket: bucket, prefix: prefix}
        listing_params[:continuation_token] = continuation_token if continuation_token
        page = s3_client.list_objects_v2(**listing_params)

        objects_to_remove = page.contents.map { |content| {key: content.key} }
        unless objects_to_remove.empty?
          last_delete_response = s3_client.delete_objects(bucket: bucket, delete: {objects: objects_to_remove})
        end

        break unless page.is_truncated

        continuation_token = page.next_continuation_token
      end

      last_delete_response
    end

    private

    attr_reader :s3_client
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Splits an `s3://bucket/key` URL into its bucket and key parts.
  class S3LocationParser
    # Anchored with \A and \z so the whole string has to be the URL; ^ and $ would accept a
    # matching line anywhere inside a multi-line string.
    S3_URL_PATTERN = %r{\As3://(?<bucket>\S+?)/(?<key>\S+)\z}.freeze

    ##
    # @param [String] url S3 URL in the `s3://bucket/key` form
    # @return [Array(String, String)] bucket name and object key
    # @raise [ArgumentError] when `url` is not an S3 URL with both a bucket and a key
    def parse_url(url)
      matched_data = S3_URL_PATTERN.match(url)
      raise ArgumentError, "Invalid S3 URL: #{url.inspect}" unless matched_data

      [matched_data['bucket'], matched_data['key']]
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Mode object whose translations are all pass-through: every method returns its argument unchanged.
  class StandardMode
    # @param [String] s3_url
    # @return [String] the same S3 URL
    def s3_path(s3_url)
      s3_url
    end

    # @param [String] name
    # @return [String] the same database name
    def database_name(name)
      name
    end

    # @param [Boolean] async_flag
    # @return [Boolean] the same flag
    def async(async_flag)
      async_flag
    end
  end
end