egis 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 1d2255a76aef464d0d8faee0c39be753e1a928133a25de5c4a86f553e51e92ca
4
+ data.tar.gz: f18ae28053651576ccf941bd42f5a9bf40db32fed3c1c2025e1bc987844199f8
5
+ SHA512:
6
+ metadata.gz: 9b143dbc650f8c02ba39f1404a2d9b4ebe36c49182a36b947a1d091079b1c94839320c2d05dd9a0db1516d82c9b03694244e4fa467898302e888addb234e1291
7
+ data.tar.gz: a1577d30cbfd63632dbd55052adf0f863f7a5f571e8a8f8a210649446acc786764d8bec6a15be7cd91742f4f7681bfc1d379a9099934f62b8a30379d12a4bd77
data/egis.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
# Put the gem's own lib/ directory on the load path so the version file can be required below.
lib_dir = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'egis/version'

Gem::Specification.new do |gem|
  gem.name = 'egis'
  gem.version = Egis::VERSION
  gem.authors = ['Agnieszka Czereba', 'Marek Mateja']
  gem.email = ['agnieszka.czereba@u2i.com', 'marek.mateja@u2i.com']

  gem.summary = 'A handy wrapper for AWS Athena Ruby SDK.'
  gem.homepage = 'https://github.com/u2i/egis'
  gem.license = 'MIT'

  # Both the homepage and the source code links point at the same repository URL.
  gem.metadata['homepage_uri'] = gem.homepage
  gem.metadata['source_code_uri'] = gem.homepage
  gem.metadata['changelog_uri'] = 'https://github.com/u2i/egis/blob/master/CHANGELOG.md'

  # Package only what git tracks: the gemspec itself and everything under lib/.
  # `-z` makes git separate the paths with NUL bytes, hence the split below.
  gem.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z egis.gemspec lib/`.split("\x0")
  end
  gem.require_paths = ['lib']

  gem.add_dependency 'aws-sdk-athena', '~> 1.0'
  gem.add_dependency 'aws-sdk-s3', '~> 1.0'
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'aws-sdk-s3'
4
+ require 'aws-sdk-athena'
5
+
6
module Egis
  # @!visibility private
  #
  # Builds AWS SDK clients from the settings stored in {Egis.configuration}.
  class AwsClientProvider
    # @return [Aws::S3::Client] a new S3 client built from the current configuration
    def s3_client
      Aws::S3::Client.new(client_config)
    end

    # @return [Aws::Athena::Client] a new Athena client built from the current configuration
    def athena_client
      Aws::Athena::Client.new(client_config)
    end

    private

    # Collects the AWS options that have been set; unset (nil/false) options are
    # left out entirely so they are never passed to the SDK client constructor.
    #
    # @return [Hash{Symbol => Object}]
    def client_config
      settings = Egis.configuration

      {
        region: settings.aws_region,
        access_key_id: settings.aws_access_key_id,
        secret_access_key: settings.aws_secret_access_key,
        profile: settings.aws_profile
      }.select { |_option, value| value }
    end
  end
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Expands a hash of candidate values into every possible key/value combination.
  class CartesianProductGenerator
    ##
    # Builds the cartesian product of the value lists, pairing every value with its key.
    #
    # @example
    #   cartesian_product(year: [2019, 2020], country: ['pl'])
    #   # => [[[:year, 2019], [:country, 'pl']], [[:year, 2020], [:country, 'pl']]]
    #
    # @param [Hash{Object => Array}] values_by_key candidate values for each key
    # @return [Array<Array<Array>>] one entry per combination; each entry is a list of
    #   `[key, value]` pairs in the hash's key order. Empty when no keys are given.
    def cartesian_product(values_by_key)
      keys = values_by_key.keys
      # Without keys there is nothing to combine (and no first list to call `product` on).
      return [] if keys.empty?

      head, *tail = values_by_key.values

      # `tail` is always an Array (possibly empty), so a single key needs no special case:
      # `head.product` with no arguments wraps each value in a one-element combination.
      head.product(*tail).map { |combination| keys.zip(combination) }
    end
  end
end
@@ -0,0 +1,142 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # The most fundamental {Egis} class. Provides an interface for executing Athena queries.
  #
  # See configuration instructions {Egis.configure}.
  #
  # @see Egis.configure
  #
  # @example Create client and execute asynchronous query
  #   client = Egis::Client.new
  #   status = client.execute_query('SELECT * FROM my_table;')
  #
  #   while status.in_progress?
  #     # do something useful
  #     # ...
  #     status = client.query_status(status.id)
  #   end
  #
  #   status.output_location.url # s3://my-bucket/result/path
  #
  # @example Execute synchronous query and fetch results
  #   status = client.execute_query('SELECT MAX(time), MIN(id) FROM my_table;', async: false)
  #   status.fetch_result(schema: [:timestamp, :int]) # [[2020-05-04 11:19:03 +0200, 7]]
  #
  class Client
    # Maps the state strings reported by Athena onto {Egis::QueryStatus} statuses.
    QUERY_STATUS_MAPPING = {
      'QUEUED' => Egis::QueryStatus::QUEUED,
      'RUNNING' => Egis::QueryStatus::RUNNING,
      'SUCCEEDED' => Egis::QueryStatus::FINISHED,
      'FAILED' => Egis::QueryStatus::FAILED,
      'CANCELLED' => Egis::QueryStatus::CANCELLED
    }.freeze

    # Seconds to sleep before the given (1-based) status poll; grows exponentially.
    DEFAULT_QUERY_STATUS_BACKOFF = ->(attempt) { 1.5**attempt - 1 }

    private_constant :QUERY_STATUS_MAPPING, :DEFAULT_QUERY_STATUS_BACKOFF

    def initialize(aws_client_provider: Egis::AwsClientProvider.new, s3_location_parser: Egis::S3LocationParser.new)
      @aws_athena_client = aws_client_provider.athena_client
      @s3_location_parser = s3_location_parser
      @query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
    end

    ##
    # Creates {Egis::Database} object with a given name. Executing it doesn't create Athena database yet.
    #
    # @param [String] database_name
    # @return [Egis::Database]
    def database(database_name)
      Database.new(database_name, client: self)
    end

    ##
    # Executes Athena query. By default, queries are being executed asynchronously.
    #
    # @param [String] query SQL query to execute
    # @param [Boolean] async Decide whether you want to run query asynchronously or block execution until it finishes
    # @param [String] work_group Change Athena work group the query will be executed in.
    # @param [String] database Run query in the context of a specific database (implicit table references are expected
    #   to be in given database).
    # @param [String] output_location S3 url of the desired output location. By default, Athena uses location defined
    #   by workgroup.
    # @raise [Egis::Errors::QueryExecutionError] when a blocking query ends in any state other than finished
    # @return [Egis::QueryStatus]
    def execute_query(query, work_group: nil, database: nil, output_location: nil, async: true)
      request = query_execution_params(query, work_group, database, output_location)
      execution_id = aws_athena_client.start_query_execution(request).query_execution_id

      # The active mode has the final say on whether the call returns immediately.
      return query_status(execution_id) if Egis.mode.async(async)

      final_status = wait_for_query_to_finish(execution_id)
      return final_status if final_status.finished?

      raise Egis::Errors::QueryExecutionError, final_status.message
    end

    ##
    # Check the status of asynchronous query execution.
    #
    # @param [String] query_id Query id from {Egis::QueryStatus} returned by {#execute_query} method
    # @return [Egis::QueryStatus]
    def query_status(query_id)
      execution = aws_athena_client.get_query_execution(query_execution_id: query_id).query_execution

      Egis::QueryStatus.new(
        execution.query_execution_id,
        QUERY_STATUS_MAPPING.fetch(execution.status.state),
        execution.status.state_change_reason,
        parse_output_location(execution)
      )
    end

    private

    attr_reader :aws_athena_client, :s3_location_parser, :query_status_backoff

    # Assembles the request for `start_query_execution`, leaving out every option that was not given.
    # An explicitly passed work group wins over the configured one.
    def query_execution_params(query, work_group, database, output_location)
      effective_work_group = work_group || Egis.configuration.work_group

      request = {query_string: query}
      request[:work_group] = effective_work_group if effective_work_group
      request[:query_execution_context] = {database: database_name(database)} if database
      request[:result_configuration] = {output_location: translate_path(output_location)} if output_location
      request
    end

    # Polls the query status, sleeping for the backoff interval before every poll,
    # until the query is neither queued nor running.
    def wait_for_query_to_finish(query_execution_id)
      1.step do |attempt|
        sleep(query_status_backoff.call(attempt))
        current_status = query_status(query_execution_id)
        return current_status if !current_status.queued? && !current_status.running?
      end
    end

    # Wraps the output URL reported by Athena together with its parsed bucket and key.
    def parse_output_location(query_execution)
      output_url = query_execution.result_configuration.output_location
      bucket, key = s3_location_parser.parse_url(output_url)

      QueryOutputLocation.new(output_url, bucket, key)
    end

    def translate_path(s3_url)
      Egis.mode.s3_path(s3_url)
    end

    def database_name(name)
      Egis.mode.database_name(name)
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Plain holder for Egis settings; every attribute is nil until assigned.
  class Configuration
    # AWS connection settings, forwarded to the AWS SDK clients when set.
    attr_accessor :aws_region, :aws_access_key_id, :aws_secret_access_key, :aws_profile

    # Athena work group used for queries that do not specify one explicitly.
    attr_accessor :work_group

    # Callable receiving the poll attempt number and returning the seconds to sleep before that poll.
    attr_accessor :query_status_backoff

    # NOTE(review): presumably the S3 bucket used by the testing mode; confirm against its implementation.
    attr_accessor :testing_s3_bucket
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Interface for database manipulation and querying.
  #
  # Extends the interface of {Egis::Client} but all the queries scheduled using {Egis::Database} are executed
  # within the database's context. SQL table references without explicit database will implicitly refer to
  # the database they are executed from.
  #
  # It is recommended to create database objects using {Egis::Client#database} method.
  #
  class Database
    def initialize(database_name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
      @client = client
      @database_name = database_name
      @output_downloader = output_downloader
    end

    ##
    # Creates {Egis::Table} object. Executing it doesn't create Athena table yet.
    #
    # @param [String] table_name
    # @param [Egis::TableSchema] table_schema
    # @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
    # @param [Hash] options Table options handed over to {Egis::Table}
    # @option options [:tsv, :csv, :orc] :format Table format (defaults to :tsv)
    # @return [Egis::Table]
    def table(table_name, table_schema, table_location, **options)
      Table.new(self, table_name, table_schema, table_location, options: options)
    end

    ##
    # Creates database in Athena.
    #
    # @return [void]
    def create
      client.execute_query("CREATE DATABASE IF NOT EXISTS #{translate_name(database_name)};", async: false)
    end

    ##
    # The same as {#create} but raising error if it already exists.
    #
    # @return [void]
    def create!
      client.execute_query("CREATE DATABASE #{translate_name(database_name)};", async: false)
    end

    ##
    # Removes database in Athena.
    #
    # @return [void]
    def drop
      client.execute_query("DROP DATABASE IF EXISTS #{translate_name(database_name)} CASCADE;", async: false)
    end

    ##
    # The same as {#drop} but raising error if the database does not exist.
    #
    # @return [void]
    def drop!
      client.execute_query("DROP DATABASE #{translate_name(database_name)} CASCADE;", async: false)
    end

    ##
    # (see Egis::Client#execute_query)
    def execute_query(query, **options)
      # The untranslated name is passed on purpose: the client translates the database context itself.
      # A caller-supplied `database:` option overrides this database.
      client.execute_query(query, **{database: database_name}.merge(options))
    end

    ##
    # (see Egis::Client#query_status)
    def query_status(query_id)
      client.query_status(query_id)
    end

    ##
    # Checks whether database with such name exists in Athena.
    #
    # @return [Boolean]
    def exists?
      # Look for the same (mode-translated) name that {#create} and {#drop} operate on;
      # checking the raw name would miss the database whenever the mode renames it.
      athena_name = translate_name(database_name)

      query_status = client.execute_query("SHOW DATABASES LIKE '#{athena_name}';", async: false)
      parsed_result = output_downloader.download(query_status.output_location)
      parsed_result.flatten.include?(athena_name)
    end

    private

    attr_reader :client, :database_name, :output_downloader

    # Maps the logical database name to the name used in Athena by the active mode.
    def translate_name(name)
      Egis.mode.database_name(name)
    end
  end
end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Namespace for the exceptions defined by Egis.
  module Errors
    # Common ancestor of every Egis-specific error, so callers can rescue them all at once.
    Error = Class.new(StandardError)

    UnsupportedTableFormat = Class.new(Error)

    # Raised when a blocking query ends in a state other than finished.
    QueryExecutionError = Class.new(Error)

    # Raised when partition values required to build a statement are missing.
    PartitionError = Class.new(Error)

    # Egis' own error class; unrelated to Ruby's built-in ::TypeError.
    TypeError = Class.new(Error)
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'csv'
4
+
5
module Egis
  # @!visibility private
  #
  # Fetches a query output file from S3 and parses it as CSV.
  class OutputDownloader
    def initialize(aws_client_provider: Egis::AwsClientProvider.new)
      @s3_client = aws_client_provider.s3_client
    end

    # @param [#bucket, #key] output_location where the output file is stored
    # @return [Array<Array<String>>] parsed CSV rows, header row included
    def download(output_location)
      s3_object = s3_client.get_object(bucket: output_location.bucket, key: output_location.key)
      raw_csv = s3_object.body.read

      CSV.parse(raw_csv)
    end

    private

    attr_reader :s3_client
  end
end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Converts raw query output (rows of strings) into Ruby values.
  class OutputParser
    ##
    # Deserializes every data row of the output using one serializer per column.
    #
    # @param [Array<Array<String>>] output rows of the output file; the first row is the header
    # @param [Array] types expected column types, by position. Columns without a type
    #   (nil entry, or a list shorter than the header) use the default serializer.
    # @return [Array<Array>] data rows (header excluded) with deserialized values;
    #   empty when the output has no header row at all
    def parse(output, types)
      header, *rows = output

      # An empty output has no header to derive the serializers from and no rows to convert.
      return [] if header.nil?

      column_serializers = serializers(header, types)

      rows.map do |row|
        row.zip(column_serializers).map { |raw_value, serializer| serializer.load(raw_value) }
      end
    end

    private

    # Picks a serializer for each header column; the header is only used for the column count.
    def serializers(row, types)
      row.zip(types).map { |_, type| type ? Types.serializer(type) : Types::DefaultSerializer.new }
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Generates the SQL statement adding partitions to a table.
  class PartitionsGenerator
    def initialize(cartesian_product_generator: Egis::CartesianProductGenerator.new)
      @cartesian_product_generator = cartesian_product_generator
    end

    # Builds an `ALTER TABLE ... ADD PARTITION` statement with one partition per
    # combination of the given values.
    #
    # @param [String] table_name
    # @param [Hash{Object => Array}] values_by_partition values to add for each partition column
    # @param [Boolean] permissive adds `IF NOT EXISTS` to the statement when truthy
    # @raise [Egis::Errors::PartitionError] when the hash is nil or empty, or any value list is empty
    # @return [String]
    def to_sql(table_name, values_by_partition, permissive: false)
      validate_partition_values(values_by_partition)

      if_not_exists = permissive_statement(permissive)
      partitions = partitions_definition(values_by_partition)

      "ALTER TABLE #{table_name} ADD #{if_not_exists}\n#{partitions};\n"
    end

    private

    attr_reader :cartesian_product_generator

    def validate_partition_values(values_by_partition)
      return unless partition_values_missing?(values_by_partition)

      raise Errors::PartitionError, 'Partition value(s) missing'
    end

    def partition_values_missing?(values_by_partition)
      return true if values_by_partition.nil? || values_by_partition.empty?

      values_by_partition.values.any?(&:empty?)
    end

    def permissive_statement(permissive)
      permissive ? 'IF NOT EXISTS' : nil
    end

    # One `PARTITION (...)` clause per line, one line per value combination.
    def partitions_definition(values_by_partition)
      combinations = cartesian_product_generator.cartesian_product(values_by_partition)
      clauses = combinations.map { |combination| partition_values_clause(combination) }
      clauses.join("\n")
    end

    def partition_values_clause(partition_values_combination)
      assignments = partition_values(partition_values_combination)
      "PARTITION (#{assignments.join(', ')})"
    end

    # String values are wrapped in single quotes; any other value is interpolated as is.
    def partition_values(partition_values_combination)
      partition_values_combination.map do |partition_name, value|
        literal = value.is_a?(String) ? "'#{value}'" : value
        "#{partition_name} = #{literal}"
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Location of a query's output file.
  #
  # @!attribute [r] url
  #   @return [String] Query output file's URL
  # @!attribute [r] bucket
  #   @return [String] Query output's S3 bucket
  # @!attribute [r] key
  #   @return [String] Query output's S3 path
  QueryOutputLocation = Struct.new(*%i[url bucket key])
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Snapshot of a query execution's state, with access to its result.
  #
  # @!attribute [r] id
  #   @return [String] Athena query execution ID
  # @!attribute [r] status
  #   @return [:queued, :running, :finished, :failed, :cancelled]
  # @!attribute [r] message
  #   @return [String]
  # @!attribute [r] output_location
  #   @return [Egis::QueryOutputLocation]
  #
  class QueryStatus
    QUEUED = :queued
    RUNNING = :running
    FINISHED = :finished
    FAILED = :failed
    CANCELLED = :cancelled

    STATUSES = [QUEUED, RUNNING, FINISHED, FAILED, CANCELLED].freeze

    attr_reader :id, :status, :message, :output_location

    ##
    # @param [String] id Athena query execution ID
    # @param [Symbol] status one of {STATUSES}
    # @param [String] message
    # @param [Egis::QueryOutputLocation] output_location
    # @raise [ArgumentError] when the status is not one of {STATUSES}
    def initialize(id, status, message, output_location,
                   output_downloader: Egis::OutputDownloader.new,
                   output_parser: Egis::OutputParser.new)
      raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)

      @id = id
      @status = status
      @message = message
      @output_location = output_location
      @output_downloader = output_downloader
      @output_parser = output_parser
    end

    # @return [Boolean]
    def finished?
      status == FINISHED
    end

    # @return [Boolean]
    def failed?
      status == FAILED
    end

    # @return [Boolean]
    def cancelled?
      status == CANCELLED
    end

    # @return [Boolean]
    def queued?
      status == QUEUED
    end

    # @return [Boolean]
    def running?
      status == RUNNING
    end

    # @return [Boolean] true while the query is either queued or running
    def in_progress?
      [RUNNING, QUEUED].include?(status)
    end

    ##
    # Download query result.
    #
    # By default, Egis will just parse output CSV and return array of string arrays. Additionally, you
    # can pass expected query result column types to parse them into Ruby objects accordingly.
    #
    # @param [Array] schema Array with expected query column types
    # @return [Array] Array of row values
    def fetch_result(schema: [])
      output = output_downloader.download(output_location)
      output_parser.parse(output, schema)
    end

    private

    attr_reader :output_downloader, :output_parser
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Removes S3 objects stored under a given key prefix.
  class S3Cleaner
    def initialize(aws_client_provider: Egis::AwsClientProvider.new)
      @s3_client = aws_client_provider.s3_client
    end

    ##
    # Deletes every object in the bucket whose key starts with the prefix.
    #
    # A single listing call returns only one page of keys, so the listing is followed
    # page by page until it is no longer truncated; each page is deleted in its own request.
    #
    # @param [String] bucket
    # @param [String] prefix
    # @return [Object, nil] response of the last delete request, or nil when nothing matched the prefix
    def delete(bucket, prefix)
      last_delete_response = nil
      continuation_token = nil

      loop do
        listing = s3_client.list_objects_v2(**list_params(bucket, prefix, continuation_token))
        objects_to_remove = listing.contents.map { |content| {key: content.key} }

        unless objects_to_remove.empty?
          last_delete_response = s3_client.delete_objects(bucket: bucket, delete: {objects: objects_to_remove})
        end

        break unless listing.is_truncated

        continuation_token = listing.next_continuation_token
      end

      last_delete_response
    end

    private

    attr_reader :s3_client

    # Listing request for one page; the continuation token is only sent for follow-up pages.
    def list_params(bucket, prefix, continuation_token)
      params = {bucket: bucket, prefix: prefix}
      params[:continuation_token] = continuation_token if continuation_token
      params
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Splits an S3 URL into its bucket and key.
  class S3LocationParser
    # Anchored with \A and \z so the whole string has to be an S3 URL;
    # ^ and $ would also accept a URL on just one line of a multi-line string.
    S3_URL_PATTERN = %r{\As3://(?<bucket>\S+?)/(?<key>\S+)\z}.freeze

    ##
    # @param [String] url S3 URL in the `s3://bucket/key` form
    # @raise [ArgumentError] when the URL is not in that form
    # @return [Array(String, String)] bucket and key
    def parse_url(url)
      matched_data = S3_URL_PATTERN.match(url)

      # Fail with a message naming the offending value instead of a NoMethodError on nil.
      raise ArgumentError, "Invalid S3 URL: #{url.inspect}" if matched_data.nil?

      [matched_data['bucket'], matched_data['key']]
    end
  end
end
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Mode that hands every value back untouched: paths, database names and
  # the async flag are used exactly as given.
  class StandardMode
    # @param [String] s3_url
    # @return [String] the same URL
    def s3_path(s3_url)
      s3_url
    end

    # @param [String] name
    # @return [String] the same name
    def database_name(name)
      name
    end

    # @param [Boolean] async_flag
    # @return [Boolean] the same flag
    def async(async_flag)
      async_flag
    end
  end
end