egis 1.2.1 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1ca49c2a770af01b7ff35956785bd77ca5df10446e229620f7cb21892b65e1f
4
- data.tar.gz: 450b5de96addf904a43f7b14bc5b55c3f876c982977e6b8ae51a6e58a0466a04
3
+ metadata.gz: ca7464c31cc32115edb77470e2d5f2abe99ab9f4a64eef2ae0820ed8d5b982b7
4
+ data.tar.gz: 8002e611e1c59635365a3ca2ff54191e1a9671f4b5498c940438c85b7ea45d28
5
5
  SHA512:
6
- metadata.gz: f542064733b5d9805a27dd40b4628527e3faf932788f96b7911db0726f729f8f89835c06ba767d5532cab603633838146113ae46dfd8b1089d70ba3288806312
7
- data.tar.gz: d833d875b4cee9b4d435aa0394908b1466ff1588bb9620fe237c72b95e16965ebf34299a8e5a63d1ce70bf1642ebfd053f6089688ffd83320dadcbb10cc2ca8d
6
+ metadata.gz: 5971ac98ab6bfdcbcac23a4b9e0e2b8b31b14cb06987ea6f3891fc0c1787087ebfca85eae026c8cb1c2952f11e4659cf36827492b8d9fdf7d43d3a6943d3bd7f
7
+ data.tar.gz: 98d379a3ddb1b3796b08e9c5a4cc584826324fa03e24d64ddb604405ec57a6f0d75a3d2138cf9123f154e275b5202c592814c4cea6a00049d75eda6ec5d2de8b
@@ -6,25 +6,23 @@ require 'aws-sdk-athena'
6
6
  module Egis
7
7
  # @!visibility private
8
8
  class AwsClientProvider
9
- def s3_client
10
- Aws::S3::Client.new(client_config)
9
+ def s3_client(configuration)
10
+ Aws::S3::Client.new(client_config(configuration))
11
11
  end
12
12
 
13
- def athena_client
14
- Aws::Athena::Client.new(client_config)
13
+ def athena_client(configuration)
14
+ Aws::Athena::Client.new(client_config(configuration))
15
15
  end
16
16
 
17
17
  private
18
18
 
19
- def client_config
20
- configuration = Egis.configuration
21
-
22
- config = {}
23
- config[:region] = configuration.aws_region if configuration.aws_region
24
- config[:access_key_id] = configuration.aws_access_key_id if configuration.aws_access_key_id
25
- config[:secret_access_key] = configuration.aws_secret_access_key if configuration.aws_secret_access_key
26
- config[:profile] = configuration.aws_profile if configuration.aws_profile
27
- config
19
+ def client_config(configuration)
20
+ {
21
+ region: configuration.aws_region,
22
+ access_key_id: configuration.aws_access_key_id,
23
+ secret_access_key: configuration.aws_secret_access_key,
24
+ profile: configuration.aws_profile
25
+ }.compact
28
26
  end
29
27
  end
30
28
  end
data/lib/egis/client.rb CHANGED
@@ -4,6 +4,9 @@ module Egis
4
4
  ##
5
5
  # The most fundamental {Egis} class. Provides an interface for executing Athena queries.
6
6
  #
7
+ # @yieldparam config [Egis::Configuration] Egis configuration block; if missing, Egis will use the global configuration
8
+ # provided by {Egis.configure}
9
+ #
7
10
  # See configuration instructions {Egis.configure}.
8
11
  #
9
12
  # @see Egis.configure
@@ -33,14 +36,17 @@ module Egis
33
36
  'CANCELLED' => Egis::QueryStatus::CANCELLED
34
37
  }.freeze
35
38
 
36
- DEFAULT_QUERY_STATUS_BACKOFF = ->(attempt) { 1.5**attempt - 1 }
39
+ private_constant :QUERY_STATUS_MAPPING
37
40
 
38
- private_constant :QUERY_STATUS_MAPPING, :DEFAULT_QUERY_STATUS_BACKOFF
41
+ attr_reader :aws_s3_client
39
42
 
40
- def initialize(aws_client_provider: Egis::AwsClientProvider.new, s3_location_parser: Egis::S3LocationParser.new)
41
- @aws_athena_client = aws_client_provider.athena_client
43
+ def initialize(aws_client_provider: Egis::AwsClientProvider.new,
44
+ s3_location_parser: Egis::S3LocationParser.new,
45
+ &block)
46
+ @configuration = block_given? ? Egis.configuration.dup.configure(&block) : Egis.configuration
47
+ @aws_athena_client = aws_client_provider.athena_client(configuration)
48
+ @aws_s3_client = aws_client_provider.s3_client(configuration)
42
49
  @s3_location_parser = s3_location_parser
43
- @query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
44
50
  end
45
51
 
46
52
  ##
@@ -99,16 +105,17 @@ module Egis
99
105
  query_execution.query_execution_id,
100
106
  QUERY_STATUS_MAPPING.fetch(query_status),
101
107
  query_execution.status.state_change_reason,
102
- parse_output_location(query_execution)
108
+ parse_output_location(query_execution),
109
+ client: self
103
110
  )
104
111
  end
105
112
 
106
113
  private
107
114
 
108
- attr_reader :aws_athena_client, :s3_location_parser, :query_status_backoff
115
+ attr_reader :configuration, :aws_athena_client, :s3_location_parser
109
116
 
110
117
  def query_execution_params(query, work_group, database, output_location)
111
- work_group_params = work_group || Egis.configuration.work_group
118
+ work_group_params = work_group || configuration.work_group
112
119
 
113
120
  params = {query_string: query}
114
121
  params[:work_group] = work_group_params if work_group_params
@@ -128,7 +135,7 @@ module Egis
128
135
  def wait_for_query_to_finish(query_id)
129
136
  attempt = 1
130
137
  loop do
131
- sleep(query_status_backoff.call(attempt))
138
+ sleep(configuration.query_status_backoff.call(attempt))
132
139
  status = query_status(query_id)
133
140
 
134
141
  return status unless status.queued? || status.running?
@@ -8,6 +8,12 @@ module Egis
8
8
 
9
9
  def initialize
10
10
  @logger = Logger.new(STDOUT, level: :info)
11
+ @query_status_backoff = ->(attempt) { 1.5**attempt - 1 }
12
+ end
13
+
14
+ def configure
15
+ yield(self)
16
+ self
11
17
  end
12
18
  end
13
19
  end
data/lib/egis/database.rb CHANGED
@@ -14,10 +14,10 @@ module Egis
14
14
  # @return [String] Athena database name
15
15
  #
16
16
  class Database
17
- def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
17
+ def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new(client.aws_s3_client))
18
18
  @client = client
19
- @name = name
20
19
  @output_downloader = output_downloader
20
+ @name = name
21
21
  end
22
22
 
23
23
  attr_reader :name
@@ -28,11 +28,11 @@ module Egis
28
28
  # @param [String] table_name
29
29
  # @param [Egis::TableSchema] table_schema
30
30
  # @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
31
- # @param [:tsv, :csv, :orc] format Table format (defaults to :tsv)
31
+ # @param [:tsv, :csv, :orc, :orc_index_access, :json, String] format Table format (defaults to :tsv)
32
32
  # @return [Egis::Table]
33
33
 
34
34
  def table(table_name, table_schema, table_location, **options)
35
- Table.new(self, table_name, table_schema, table_location, options: options)
35
+ Table.new(self, table_name, table_schema, table_location, client: client, options: options)
36
36
  end
37
37
 
38
38
  ##
@@ -5,8 +5,8 @@ require 'csv'
5
5
  module Egis
6
6
  # @!visibility private
7
7
  class OutputDownloader
8
- def initialize(aws_client_provider: Egis::AwsClientProvider.new)
9
- @s3_client = aws_client_provider.s3_client
8
+ def initialize(aws_s3_client)
9
+ @s3_client = aws_s3_client
10
10
  end
11
11
 
12
12
  def download(output_location)
@@ -23,7 +23,8 @@ module Egis
23
23
  attr_reader :id, :status, :message, :output_location
24
24
 
25
25
  def initialize(id, status, message, output_location,
26
- output_downloader: Egis::OutputDownloader.new,
26
+ client: Egis::Client.new,
27
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
27
28
  output_parser: Egis::OutputParser.new)
28
29
  raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)
29
30
 
@@ -51,6 +52,10 @@ module Egis
51
52
  status == RUNNING
52
53
  end
53
54
 
55
+ def cancelled?
56
+ status == CANCELLED
57
+ end
58
+
54
59
  def in_progress?
55
60
  [RUNNING, QUEUED].include?(status)
56
61
  end
@@ -3,8 +3,8 @@
3
3
  module Egis
4
4
  # @!visibility private
5
5
  class S3Cleaner
6
- def initialize(aws_client_provider: Egis::AwsClientProvider.new)
7
- @s3_client = aws_client_provider.s3_client
6
+ def initialize(aws_s3_client)
7
+ @s3_client = aws_s3_client
8
8
  end
9
9
 
10
10
  def delete(bucket, prefix)
data/lib/egis/table.rb CHANGED
@@ -17,11 +17,13 @@ module Egis
17
17
  DEFAULT_OPTIONS = {format: :tsv}.freeze
18
18
 
19
19
  def initialize(database, name, schema, location, options: {},
20
+ client: Egis::Client.new,
20
21
  partitions_generator: Egis::PartitionsGenerator.new,
21
22
  table_ddl_generator: Egis::TableDDLGenerator.new,
22
- output_downloader: Egis::OutputDownloader.new,
23
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
23
24
  output_parser: Egis::OutputParser.new,
24
- table_data_wiper: Egis::TableDataWiper.new)
25
+ s3_cleaner: Egis::S3Cleaner.new(client.aws_s3_client),
26
+ table_data_wiper: Egis::TableDataWiper.new(s3_cleaner: s3_cleaner))
25
27
  @database = database
26
28
  @name = name
27
29
  @schema = schema
@@ -97,7 +99,19 @@ module Egis
97
99
  ##
98
100
  # Insert data into the table. Mostly useful for testing purposes.
99
101
  #
100
- # @param [Array] rows Array of arrays with row values
102
+ # @example Insert with array of arrays
103
+ # table.upload_data([
104
+ # ['hello world', 'mx', 1],
105
+ # ['hello again', 'us', 2]
106
+ # ])
107
+ #
108
+ # @example Insert with array of hashes
109
+ # table.upload_data([
110
+ # {message: 'hello world', country: 'mx', type: 1},
111
+ # {message: 'hello again', country: 'us', type: 2}
112
+ # ])
113
+ #
114
+ # @param [Array] rows Array of arrays or hashes with row values
101
115
  # @return [void]
102
116
 
103
117
  def upload_data(rows)
@@ -128,7 +142,6 @@ module Egis
128
142
 
129
143
  ##
130
144
  # @return Table data format
131
-
132
145
  def format
133
146
  options.fetch(:format)
134
147
  end
@@ -149,23 +162,33 @@ module Egis
149
162
  Egis.logger.info { "Creating table #{database.name}.#{name} located in #{location}" }
150
163
  end
151
164
 
152
- def column_serializers
153
- @column_serializers ||= column_types.map { |type| Egis::Types.serializer(type) }
165
+ def column_types
166
+ all_columns.map(&:type)
154
167
  end
155
168
 
156
- def column_types
157
- (schema.columns + schema.partitions).map(&:type)
169
+ def all_columns
170
+ schema.columns + schema.partitions
158
171
  end
159
172
 
160
173
  def data_insert_query(rows)
174
+ insert_values = rows.map { |row| row_literal_values(row) }
175
+ row_clause = insert_values.map { |row| row_values_statement(row) }.join(",\n")
176
+
161
177
  <<~SQL
162
178
  INSERT INTO #{name} VALUES
163
- #{rows.map { |row| row_values_statement(row) }.join(",\n")};
179
+ #{row_clause}
164
180
  SQL
165
181
  end
166
182
 
183
+ def row_literal_values(row)
184
+ all_columns.map.with_index do |column, index|
185
+ value = row.is_a?(Hash) ? row[column.name] : row[index]
186
+ Egis::Types.serializer(column.type).literal(value)
187
+ end
188
+ end
189
+
167
190
  def row_values_statement(row)
168
- "(#{row.zip(column_serializers).map { |value, serializer| serializer.literal(value) }.join(', ')})"
191
+ "(#{row.join(', ')})"
169
192
  end
170
193
  end
171
194
  end
@@ -35,13 +35,30 @@ module Egis
35
35
  end
36
36
 
37
37
  def format_statement(format)
38
+ return format if format.is_a?(String)
39
+
40
+ format_preset(format)
41
+ end
42
+
43
+ def format_preset(format) # rubocop:disable Metrics/MethodLength
38
44
  case format
39
45
  when :csv
40
46
  "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"
41
47
  when :tsv
42
48
  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'"
43
49
  when :orc
50
+ <<~SQL
51
+ ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
52
+ WITH SERDEPROPERTIES (
53
+ 'orc.column.index.access' = 'false'
54
+ )
55
+ STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
56
+ OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
57
+ SQL
58
+ when :orc_index_access
44
59
  'STORED AS ORC'
60
+ when :json
61
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'"
45
62
  else
46
63
  raise Errors::UnsupportedTableFormat, format.to_s
47
64
  end
@@ -6,7 +6,7 @@ module Egis
6
6
  class TestingMode
7
7
  def initialize(test_id, s3_bucket,
8
8
  client: Egis::Client.new,
9
- output_downloader: Egis::OutputDownloader.new,
9
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
10
10
  s3_location_parser: Egis::S3LocationParser.new)
11
11
  @test_id = test_id
12
12
  @s3_bucket = s3_bucket
data/lib/egis/testing.rb CHANGED
@@ -43,6 +43,6 @@ module Egis # rubocop:disable Style/Documentation
43
43
  yield
44
44
  ensure
45
45
  @mode = previous_mode
46
- test_mode.cleanup if test_mode
46
+ test_mode&.cleanup
47
47
  end
48
48
  end
data/lib/egis/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Egis
4
- VERSION = '1.2.1'
4
+ VERSION = '2.0.0'
5
5
  end
data/lib/egis.rb CHANGED
@@ -50,8 +50,8 @@ require 'egis/s3_location_parser'
50
50
  #
51
51
  module Egis
52
52
  class << self
53
- def configure
54
- yield(configuration)
53
+ def configure(&block)
54
+ configuration.configure(&block)
55
55
  end
56
56
 
57
57
  # @!visibility private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egis
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Agnieszka Czereba
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-05-29 00:00:00.000000000 Z
12
+ date: 2021-12-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: aws-sdk-athena
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
99
  - !ruby/object:Gem::Version
100
100
  version: '0'
101
101
  requirements: []
102
- rubygems_version: 3.1.2
102
+ rubygems_version: 3.1.6
103
103
  signing_key:
104
104
  specification_version: 4
105
105
  summary: A handy wrapper for AWS Athena Ruby SDK.