egis 1.2.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/egis/aws_client_provider.rb +11 -13
- data/lib/egis/client.rb +16 -9
- data/lib/egis/configuration.rb +6 -0
- data/lib/egis/database.rb +4 -4
- data/lib/egis/output_downloader.rb +2 -2
- data/lib/egis/query_status.rb +6 -1
- data/lib/egis/s3_cleaner.rb +2 -2
- data/lib/egis/table.rb +33 -10
- data/lib/egis/table_ddl_generator.rb +17 -0
- data/lib/egis/testing/testing_mode.rb +1 -1
- data/lib/egis/testing.rb +1 -1
- data/lib/egis/version.rb +1 -1
- data/lib/egis.rb +2 -2
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ca7464c31cc32115edb77470e2d5f2abe99ab9f4a64eef2ae0820ed8d5b982b7
|
4
|
+
data.tar.gz: 8002e611e1c59635365a3ca2ff54191e1a9671f4b5498c940438c85b7ea45d28
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5971ac98ab6bfdcbcac23a4b9e0e2b8b31b14cb06987ea6f3891fc0c1787087ebfca85eae026c8cb1c2952f11e4659cf36827492b8d9fdf7d43d3a6943d3bd7f
|
7
|
+
data.tar.gz: 98d379a3ddb1b3796b08e9c5a4cc584826324fa03e24d64ddb604405ec57a6f0d75a3d2138cf9123f154e275b5202c592814c4cea6a00049d75eda6ec5d2de8b
|
data/lib/egis/aws_client_provider.rb
CHANGED
@@ -6,25 +6,23 @@ require 'aws-sdk-athena'
|
|
6
6
|
module Egis
|
7
7
|
# @!visibility private
|
8
8
|
class AwsClientProvider
|
9
|
-
def s3_client
|
10
|
-
Aws::S3::Client.new(client_config)
|
9
|
+
def s3_client(configuration)
|
10
|
+
Aws::S3::Client.new(client_config(configuration))
|
11
11
|
end
|
12
12
|
|
13
|
-
def athena_client
|
14
|
-
Aws::Athena::Client.new(client_config)
|
13
|
+
def athena_client(configuration)
|
14
|
+
Aws::Athena::Client.new(client_config(configuration))
|
15
15
|
end
|
16
16
|
|
17
17
|
private
|
18
18
|
|
19
|
-
def client_config
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
config[:profile] = configuration.aws_profile if configuration.aws_profile
|
27
|
-
config
|
19
|
+
def client_config(configuration)
|
20
|
+
{
|
21
|
+
region: configuration.aws_region,
|
22
|
+
access_key_id: configuration.aws_access_key_id,
|
23
|
+
secret_access_key: configuration.aws_secret_access_key,
|
24
|
+
profile: configuration.aws_profile
|
25
|
+
}.compact
|
28
26
|
end
|
29
27
|
end
|
30
28
|
end
|
data/lib/egis/client.rb
CHANGED
@@ -4,6 +4,9 @@ module Egis
|
|
4
4
|
##
|
5
5
|
# The most fundamental {Egis} class. Provides an interface for executing Athena queries.
|
6
6
|
#
|
7
|
+
# @yieldparam config [Egis::Configuration] Egis configuration block, if missing Egis will use global configuration
|
8
|
+
# provided by {Egis.configure}
|
9
|
+
#
|
7
10
|
# See configuration instructions {Egis.configure}.
|
8
11
|
#
|
9
12
|
# @see Egis.configure
|
@@ -33,14 +36,17 @@ module Egis
|
|
33
36
|
'CANCELLED' => Egis::QueryStatus::CANCELLED
|
34
37
|
}.freeze
|
35
38
|
|
36
|
-
|
39
|
+
private_constant :QUERY_STATUS_MAPPING
|
37
40
|
|
38
|
-
|
41
|
+
attr_reader :aws_s3_client
|
39
42
|
|
40
|
-
def initialize(aws_client_provider: Egis::AwsClientProvider.new,
|
41
|
-
|
43
|
+
def initialize(aws_client_provider: Egis::AwsClientProvider.new,
|
44
|
+
s3_location_parser: Egis::S3LocationParser.new,
|
45
|
+
&block)
|
46
|
+
@configuration = block_given? ? Egis.configuration.dup.configure(&block) : Egis.configuration
|
47
|
+
@aws_athena_client = aws_client_provider.athena_client(configuration)
|
48
|
+
@aws_s3_client = aws_client_provider.s3_client(configuration)
|
42
49
|
@s3_location_parser = s3_location_parser
|
43
|
-
@query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
|
44
50
|
end
|
45
51
|
|
46
52
|
##
|
@@ -99,16 +105,17 @@ module Egis
|
|
99
105
|
query_execution.query_execution_id,
|
100
106
|
QUERY_STATUS_MAPPING.fetch(query_status),
|
101
107
|
query_execution.status.state_change_reason,
|
102
|
-
parse_output_location(query_execution)
|
108
|
+
parse_output_location(query_execution),
|
109
|
+
client: self
|
103
110
|
)
|
104
111
|
end
|
105
112
|
|
106
113
|
private
|
107
114
|
|
108
|
-
attr_reader :
|
115
|
+
attr_reader :configuration, :aws_athena_client, :s3_location_parser
|
109
116
|
|
110
117
|
def query_execution_params(query, work_group, database, output_location)
|
111
|
-
work_group_params = work_group ||
|
118
|
+
work_group_params = work_group || configuration.work_group
|
112
119
|
|
113
120
|
params = {query_string: query}
|
114
121
|
params[:work_group] = work_group_params if work_group_params
|
@@ -128,7 +135,7 @@ module Egis
|
|
128
135
|
def wait_for_query_to_finish(query_id)
|
129
136
|
attempt = 1
|
130
137
|
loop do
|
131
|
-
sleep(query_status_backoff.call(attempt))
|
138
|
+
sleep(configuration.query_status_backoff.call(attempt))
|
132
139
|
status = query_status(query_id)
|
133
140
|
|
134
141
|
return status unless status.queued? || status.running?
|
data/lib/egis/configuration.rb
CHANGED
data/lib/egis/database.rb
CHANGED
@@ -14,10 +14,10 @@ module Egis
|
|
14
14
|
# @return [String] Athena database name
|
15
15
|
#
|
16
16
|
class Database
|
17
|
-
def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
|
17
|
+
def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new(client.aws_s3_client))
|
18
18
|
@client = client
|
19
|
-
@name = name
|
20
19
|
@output_downloader = output_downloader
|
20
|
+
@name = name
|
21
21
|
end
|
22
22
|
|
23
23
|
attr_reader :name
|
@@ -28,11 +28,11 @@ module Egis
|
|
28
28
|
# @param [String] table_name
|
29
29
|
# @param [Egis::TableSchema] table_schema
|
30
30
|
# @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
|
31
|
-
# @param [:tsv, :csv, :orc] format Table
|
31
|
+
# @param [:tsv, :csv, :orc, :orc_index_access, :json, String] format Table Format (defaults to :tsv)
|
32
32
|
# @return [Egis::Table]
|
33
33
|
|
34
34
|
def table(table_name, table_schema, table_location, **options)
|
35
|
-
Table.new(self, table_name, table_schema, table_location, options: options)
|
35
|
+
Table.new(self, table_name, table_schema, table_location, client: client, options: options)
|
36
36
|
end
|
37
37
|
|
38
38
|
##
|
data/lib/egis/output_downloader.rb
CHANGED
@@ -5,8 +5,8 @@ require 'csv'
|
|
5
5
|
module Egis
|
6
6
|
# @!visibility private
|
7
7
|
class OutputDownloader
|
8
|
-
def initialize(
|
9
|
-
@s3_client =
|
8
|
+
def initialize(aws_s3_client)
|
9
|
+
@s3_client = aws_s3_client
|
10
10
|
end
|
11
11
|
|
12
12
|
def download(output_location)
|
data/lib/egis/query_status.rb
CHANGED
@@ -23,7 +23,8 @@ module Egis
|
|
23
23
|
attr_reader :id, :status, :message, :output_location
|
24
24
|
|
25
25
|
def initialize(id, status, message, output_location,
|
26
|
-
|
26
|
+
client: Egis::Client.new,
|
27
|
+
output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
|
27
28
|
output_parser: Egis::OutputParser.new)
|
28
29
|
raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)
|
29
30
|
|
@@ -51,6 +52,10 @@ module Egis
|
|
51
52
|
status == RUNNING
|
52
53
|
end
|
53
54
|
|
55
|
+
def cancelled?
|
56
|
+
status == CANCELLED
|
57
|
+
end
|
58
|
+
|
54
59
|
def in_progress?
|
55
60
|
[RUNNING, QUEUED].include?(status)
|
56
61
|
end
|
data/lib/egis/s3_cleaner.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
module Egis
|
4
4
|
# @!visibility private
|
5
5
|
class S3Cleaner
|
6
|
-
def initialize(
|
7
|
-
@s3_client =
|
6
|
+
def initialize(aws_s3_client)
|
7
|
+
@s3_client = aws_s3_client
|
8
8
|
end
|
9
9
|
|
10
10
|
def delete(bucket, prefix)
|
data/lib/egis/table.rb
CHANGED
@@ -17,11 +17,13 @@ module Egis
|
|
17
17
|
DEFAULT_OPTIONS = {format: :tsv}.freeze
|
18
18
|
|
19
19
|
def initialize(database, name, schema, location, options: {},
|
20
|
+
client: Egis::Client.new,
|
20
21
|
partitions_generator: Egis::PartitionsGenerator.new,
|
21
22
|
table_ddl_generator: Egis::TableDDLGenerator.new,
|
22
|
-
output_downloader: Egis::OutputDownloader.new,
|
23
|
+
output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
|
23
24
|
output_parser: Egis::OutputParser.new,
|
24
|
-
|
25
|
+
s3_cleaner: Egis::S3Cleaner.new(client.aws_s3_client),
|
26
|
+
table_data_wiper: Egis::TableDataWiper.new(s3_cleaner: s3_cleaner))
|
25
27
|
@database = database
|
26
28
|
@name = name
|
27
29
|
@schema = schema
|
@@ -97,7 +99,19 @@ module Egis
|
|
97
99
|
##
|
98
100
|
# Insert data into the table. Mostly useful for testing purposes.
|
99
101
|
#
|
100
|
-
# @
|
102
|
+
# @example Insert with array of arrays
|
103
|
+
# table.upload_data([
|
104
|
+
# ['hello world', 'mx', 1],
|
105
|
+
# ['hello again', 'us', 2]
|
106
|
+
# ])
|
107
|
+
#
|
108
|
+
# @example Insert with array of hashes
|
109
|
+
# table.upload_data([
|
110
|
+
# {message: 'hello world', country: 'mx', type: 1},
|
111
|
+
# {message: 'hello again', country: 'us', type: 2}
|
112
|
+
# ])
|
113
|
+
#
|
114
|
+
# @param [Array] rows Array of arrays or hashes with row values
|
101
115
|
# @return [void]
|
102
116
|
|
103
117
|
def upload_data(rows)
|
@@ -128,7 +142,6 @@ module Egis
|
|
128
142
|
|
129
143
|
##
|
130
144
|
# @return Table data format
|
131
|
-
|
132
145
|
def format
|
133
146
|
options.fetch(:format)
|
134
147
|
end
|
@@ -149,23 +162,33 @@ module Egis
|
|
149
162
|
Egis.logger.info { "Creating table #{database.name}.#{name} located in #{location}" }
|
150
163
|
end
|
151
164
|
|
152
|
-
def
|
153
|
-
|
165
|
+
def column_types
|
166
|
+
all_columns.map(&:type)
|
154
167
|
end
|
155
168
|
|
156
|
-
def
|
157
|
-
|
169
|
+
def all_columns
|
170
|
+
schema.columns + schema.partitions
|
158
171
|
end
|
159
172
|
|
160
173
|
def data_insert_query(rows)
|
174
|
+
insert_values = rows.map { |row| row_literal_values(row) }
|
175
|
+
row_clause = insert_values.map { |row| row_values_statement(row) }.join(",\n")
|
176
|
+
|
161
177
|
<<~SQL
|
162
178
|
INSERT INTO #{name} VALUES
|
163
|
-
#{
|
179
|
+
#{row_clause}
|
164
180
|
SQL
|
165
181
|
end
|
166
182
|
|
183
|
+
def row_literal_values(row)
|
184
|
+
all_columns.map.with_index do |column, index|
|
185
|
+
value = row.is_a?(Hash) ? row[column.name] : row[index]
|
186
|
+
Egis::Types.serializer(column.type).literal(value)
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
167
190
|
def row_values_statement(row)
|
168
|
-
"(#{row.
|
191
|
+
"(#{row.join(', ')})"
|
169
192
|
end
|
170
193
|
end
|
171
194
|
end
|
data/lib/egis/table_ddl_generator.rb
CHANGED
@@ -35,13 +35,30 @@ module Egis
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def format_statement(format)
|
38
|
+
return format if format.is_a?(String)
|
39
|
+
|
40
|
+
format_preset(format)
|
41
|
+
end
|
42
|
+
|
43
|
+
def format_preset(format) # rubocop:disable Metrics/MethodLength
|
38
44
|
case format
|
39
45
|
when :csv
|
40
46
|
"ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"
|
41
47
|
when :tsv
|
42
48
|
"ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'"
|
43
49
|
when :orc
|
50
|
+
<<~SQL
|
51
|
+
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
|
52
|
+
WITH SERDEPROPERTIES (
|
53
|
+
'orc.column.index.access' = 'false'
|
54
|
+
)
|
55
|
+
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
|
56
|
+
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
|
57
|
+
SQL
|
58
|
+
when :orc_index_access
|
44
59
|
'STORED AS ORC'
|
60
|
+
when :json
|
61
|
+
"ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'"
|
45
62
|
else
|
46
63
|
raise Errors::UnsupportedTableFormat, format.to_s
|
47
64
|
end
|
data/lib/egis/testing/testing_mode.rb
CHANGED
@@ -6,7 +6,7 @@ module Egis
|
|
6
6
|
class TestingMode
|
7
7
|
def initialize(test_id, s3_bucket,
|
8
8
|
client: Egis::Client.new,
|
9
|
-
output_downloader: Egis::OutputDownloader.new,
|
9
|
+
output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
|
10
10
|
s3_location_parser: Egis::S3LocationParser.new)
|
11
11
|
@test_id = test_id
|
12
12
|
@s3_bucket = s3_bucket
|
data/lib/egis/testing.rb
CHANGED
data/lib/egis/version.rb
CHANGED
data/lib/egis.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: egis
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Agnieszka Czereba
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-12-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: aws-sdk-athena
|
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
99
99
|
- !ruby/object:Gem::Version
|
100
100
|
version: '0'
|
101
101
|
requirements: []
|
102
|
-
rubygems_version: 3.1.
|
102
|
+
rubygems_version: 3.1.6
|
103
103
|
signing_key:
|
104
104
|
specification_version: 4
|
105
105
|
summary: A handy wrapper for AWS Athena Ruby SDK.
|