egis 1.2.1 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1ca49c2a770af01b7ff35956785bd77ca5df10446e229620f7cb21892b65e1f
4
- data.tar.gz: 450b5de96addf904a43f7b14bc5b55c3f876c982977e6b8ae51a6e58a0466a04
3
+ metadata.gz: ca7464c31cc32115edb77470e2d5f2abe99ab9f4a64eef2ae0820ed8d5b982b7
4
+ data.tar.gz: 8002e611e1c59635365a3ca2ff54191e1a9671f4b5498c940438c85b7ea45d28
5
5
  SHA512:
6
- metadata.gz: f542064733b5d9805a27dd40b4628527e3faf932788f96b7911db0726f729f8f89835c06ba767d5532cab603633838146113ae46dfd8b1089d70ba3288806312
7
- data.tar.gz: d833d875b4cee9b4d435aa0394908b1466ff1588bb9620fe237c72b95e16965ebf34299a8e5a63d1ce70bf1642ebfd053f6089688ffd83320dadcbb10cc2ca8d
6
+ metadata.gz: 5971ac98ab6bfdcbcac23a4b9e0e2b8b31b14cb06987ea6f3891fc0c1787087ebfca85eae026c8cb1c2952f11e4659cf36827492b8d9fdf7d43d3a6943d3bd7f
7
+ data.tar.gz: 98d379a3ddb1b3796b08e9c5a4cc584826324fa03e24d64ddb604405ec57a6f0d75a3d2138cf9123f154e275b5202c592814c4cea6a00049d75eda6ec5d2de8b
@@ -6,25 +6,23 @@ require 'aws-sdk-athena'
6
6
  module Egis
7
7
  # @!visibility private
8
8
  class AwsClientProvider
9
- def s3_client
10
- Aws::S3::Client.new(client_config)
9
+ def s3_client(configuration)
10
+ Aws::S3::Client.new(client_config(configuration))
11
11
  end
12
12
 
13
- def athena_client
14
- Aws::Athena::Client.new(client_config)
13
+ def athena_client(configuration)
14
+ Aws::Athena::Client.new(client_config(configuration))
15
15
  end
16
16
 
17
17
  private
18
18
 
19
- def client_config
20
- configuration = Egis.configuration
21
-
22
- config = {}
23
- config[:region] = configuration.aws_region if configuration.aws_region
24
- config[:access_key_id] = configuration.aws_access_key_id if configuration.aws_access_key_id
25
- config[:secret_access_key] = configuration.aws_secret_access_key if configuration.aws_secret_access_key
26
- config[:profile] = configuration.aws_profile if configuration.aws_profile
27
- config
19
+ def client_config(configuration)
20
+ {
21
+ region: configuration.aws_region,
22
+ access_key_id: configuration.aws_access_key_id,
23
+ secret_access_key: configuration.aws_secret_access_key,
24
+ profile: configuration.aws_profile
25
+ }.compact
28
26
  end
29
27
  end
30
28
  end
data/lib/egis/client.rb CHANGED
@@ -4,6 +4,9 @@ module Egis
4
4
  ##
5
5
  # The most fundamental {Egis} class. Provides an interface for executing Athena queries.
6
6
  #
7
+ # @yieldparam config [Egis::Configuration] Egis configuration block, if missing Egis will use global configuration
8
+ # provided by {Egis.configure}
9
+ #
7
10
  # See configuration instructions {Egis.configure}.
8
11
  #
9
12
  # @see Egis.configure
@@ -33,14 +36,17 @@ module Egis
33
36
  'CANCELLED' => Egis::QueryStatus::CANCELLED
34
37
  }.freeze
35
38
 
36
- DEFAULT_QUERY_STATUS_BACKOFF = ->(attempt) { 1.5**attempt - 1 }
39
+ private_constant :QUERY_STATUS_MAPPING
37
40
 
38
- private_constant :QUERY_STATUS_MAPPING, :DEFAULT_QUERY_STATUS_BACKOFF
41
+ attr_reader :aws_s3_client
39
42
 
40
- def initialize(aws_client_provider: Egis::AwsClientProvider.new, s3_location_parser: Egis::S3LocationParser.new)
41
- @aws_athena_client = aws_client_provider.athena_client
43
+ def initialize(aws_client_provider: Egis::AwsClientProvider.new,
44
+ s3_location_parser: Egis::S3LocationParser.new,
45
+ &block)
46
+ @configuration = block_given? ? Egis.configuration.dup.configure(&block) : Egis.configuration
47
+ @aws_athena_client = aws_client_provider.athena_client(configuration)
48
+ @aws_s3_client = aws_client_provider.s3_client(configuration)
42
49
  @s3_location_parser = s3_location_parser
43
- @query_status_backoff = Egis.configuration.query_status_backoff || DEFAULT_QUERY_STATUS_BACKOFF
44
50
  end
45
51
 
46
52
  ##
@@ -99,16 +105,17 @@ module Egis
99
105
  query_execution.query_execution_id,
100
106
  QUERY_STATUS_MAPPING.fetch(query_status),
101
107
  query_execution.status.state_change_reason,
102
- parse_output_location(query_execution)
108
+ parse_output_location(query_execution),
109
+ client: self
103
110
  )
104
111
  end
105
112
 
106
113
  private
107
114
 
108
- attr_reader :aws_athena_client, :s3_location_parser, :query_status_backoff
115
+ attr_reader :configuration, :aws_athena_client, :s3_location_parser
109
116
 
110
117
  def query_execution_params(query, work_group, database, output_location)
111
- work_group_params = work_group || Egis.configuration.work_group
118
+ work_group_params = work_group || configuration.work_group
112
119
 
113
120
  params = {query_string: query}
114
121
  params[:work_group] = work_group_params if work_group_params
@@ -128,7 +135,7 @@ module Egis
128
135
  def wait_for_query_to_finish(query_id)
129
136
  attempt = 1
130
137
  loop do
131
- sleep(query_status_backoff.call(attempt))
138
+ sleep(configuration.query_status_backoff.call(attempt))
132
139
  status = query_status(query_id)
133
140
 
134
141
  return status unless status.queued? || status.running?
@@ -8,6 +8,12 @@ module Egis
8
8
 
9
9
  def initialize
10
10
  @logger = Logger.new(STDOUT, level: :info)
11
+ @query_status_backoff = ->(attempt) { 1.5**attempt - 1 }
12
+ end
13
+
14
+ def configure
15
+ yield(self)
16
+ self
11
17
  end
12
18
  end
13
19
  end
data/lib/egis/database.rb CHANGED
@@ -14,10 +14,10 @@ module Egis
14
14
  # @return [String] Athena database name
15
15
  #
16
16
  class Database
17
- def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new)
17
+ def initialize(name, client: Egis::Client.new, output_downloader: Egis::OutputDownloader.new(client.aws_s3_client))
18
18
  @client = client
19
- @name = name
20
19
  @output_downloader = output_downloader
20
+ @name = name
21
21
  end
22
22
 
23
23
  attr_reader :name
@@ -28,11 +28,11 @@ module Egis
28
28
  # @param [String] table_name
29
29
  # @param [Egis::TableSchema] table_schema
30
30
  # @param [String] table_location S3 URL with table location (e.g. `s3://s3_bucket/table/location/`)
31
- # @param [:tsv, :csv, :orc] format Table format (defaults to :tsv)
31
+ # @param [:tsv, :csv, :orc, :orc_index_access, :json, String] format Table Format (defaults to :tsv)
32
32
  # @return [Egis::Table]
33
33
 
34
34
  def table(table_name, table_schema, table_location, **options)
35
- Table.new(self, table_name, table_schema, table_location, options: options)
35
+ Table.new(self, table_name, table_schema, table_location, client: client, options: options)
36
36
  end
37
37
 
38
38
  ##
@@ -5,8 +5,8 @@ require 'csv'
5
5
  module Egis
6
6
  # @!visibility private
7
7
  class OutputDownloader
8
- def initialize(aws_client_provider: Egis::AwsClientProvider.new)
9
- @s3_client = aws_client_provider.s3_client
8
+ def initialize(aws_s3_client)
9
+ @s3_client = aws_s3_client
10
10
  end
11
11
 
12
12
  def download(output_location)
@@ -23,7 +23,8 @@ module Egis
23
23
  attr_reader :id, :status, :message, :output_location
24
24
 
25
25
  def initialize(id, status, message, output_location,
26
- output_downloader: Egis::OutputDownloader.new,
26
+ client: Egis::Client.new,
27
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
27
28
  output_parser: Egis::OutputParser.new)
28
29
  raise ArgumentError, "Unsupported status #{status}" unless STATUSES.include?(status)
29
30
 
@@ -51,6 +52,10 @@ module Egis
51
52
  status == RUNNING
52
53
  end
53
54
 
55
+ def cancelled?
56
+ status == CANCELLED
57
+ end
58
+
54
59
  def in_progress?
55
60
  [RUNNING, QUEUED].include?(status)
56
61
  end
@@ -3,8 +3,8 @@
3
3
  module Egis
4
4
  # @!visibility private
5
5
  class S3Cleaner
6
- def initialize(aws_client_provider: Egis::AwsClientProvider.new)
7
- @s3_client = aws_client_provider.s3_client
6
+ def initialize(aws_s3_client)
7
+ @s3_client = aws_s3_client
8
8
  end
9
9
 
10
10
  def delete(bucket, prefix)
data/lib/egis/table.rb CHANGED
@@ -17,11 +17,13 @@ module Egis
17
17
  DEFAULT_OPTIONS = {format: :tsv}.freeze
18
18
 
19
19
  def initialize(database, name, schema, location, options: {},
20
+ client: Egis::Client.new,
20
21
  partitions_generator: Egis::PartitionsGenerator.new,
21
22
  table_ddl_generator: Egis::TableDDLGenerator.new,
22
- output_downloader: Egis::OutputDownloader.new,
23
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
23
24
  output_parser: Egis::OutputParser.new,
24
- table_data_wiper: Egis::TableDataWiper.new)
25
+ s3_cleaner: Egis::S3Cleaner.new(client.aws_s3_client),
26
+ table_data_wiper: Egis::TableDataWiper.new(s3_cleaner: s3_cleaner))
25
27
  @database = database
26
28
  @name = name
27
29
  @schema = schema
@@ -97,7 +99,19 @@ module Egis
97
99
  ##
98
100
  # Insert data into the table. Mostly useful for testing purposes.
99
101
  #
100
- # @param [Array] rows Array of arrays with row values
102
+ # @example Insert with array of arrays
103
+ # table.upload_data([
104
+ # ['hello world', 'mx', 1],
105
+ # ['hello again', 'us', 2]
106
+ # ])
107
+ #
108
+ # @example Insert with array of hashes
109
+ # table.upload_data([
110
+ # {message: 'hello world', country: 'mx', type: 1},
111
+ # {message: 'hello again', country: 'us', type: 2}
112
+ # ])
113
+ #
114
+ # @param [Array] rows Array of arrays or hashes with row values
101
115
  # @return [void]
102
116
 
103
117
  def upload_data(rows)
@@ -128,7 +142,6 @@ module Egis
128
142
 
129
143
  ##
130
144
  # @return Table data format
131
-
132
145
  def format
133
146
  options.fetch(:format)
134
147
  end
@@ -149,23 +162,33 @@ module Egis
149
162
  Egis.logger.info { "Creating table #{database.name}.#{name} located in #{location}" }
150
163
  end
151
164
 
152
- def column_serializers
153
- @column_serializers ||= column_types.map { |type| Egis::Types.serializer(type) }
165
+ def column_types
166
+ all_columns.map(&:type)
154
167
  end
155
168
 
156
- def column_types
157
- (schema.columns + schema.partitions).map(&:type)
169
+ def all_columns
170
+ schema.columns + schema.partitions
158
171
  end
159
172
 
160
173
  def data_insert_query(rows)
174
+ insert_values = rows.map { |row| row_literal_values(row) }
175
+ row_clause = insert_values.map { |row| row_values_statement(row) }.join(",\n")
176
+
161
177
  <<~SQL
162
178
  INSERT INTO #{name} VALUES
163
- #{rows.map { |row| row_values_statement(row) }.join(",\n")};
179
+ #{row_clause}
164
180
  SQL
165
181
  end
166
182
 
183
+ def row_literal_values(row)
184
+ all_columns.map.with_index do |column, index|
185
+ value = row.is_a?(Hash) ? row[column.name] : row[index]
186
+ Egis::Types.serializer(column.type).literal(value)
187
+ end
188
+ end
189
+
167
190
  def row_values_statement(row)
168
- "(#{row.zip(column_serializers).map { |value, serializer| serializer.literal(value) }.join(', ')})"
191
+ "(#{row.join(', ')})"
169
192
  end
170
193
  end
171
194
  end
@@ -35,13 +35,30 @@ module Egis
35
35
  end
36
36
 
37
37
  def format_statement(format)
38
+ return format if format.is_a?(String)
39
+
40
+ format_preset(format)
41
+ end
42
+
43
+ def format_preset(format) # rubocop:disable Metrics/MethodLength
38
44
  case format
39
45
  when :csv
40
46
  "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"
41
47
  when :tsv
42
48
  "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'"
43
49
  when :orc
50
+ <<~SQL
51
+ ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
52
+ WITH SERDEPROPERTIES (
53
+ 'orc.column.index.access' = 'false'
54
+ )
55
+ STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
56
+ OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
57
+ SQL
58
+ when :orc_index_access
44
59
  'STORED AS ORC'
60
+ when :json
61
+ "ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'"
45
62
  else
46
63
  raise Errors::UnsupportedTableFormat, format.to_s
47
64
  end
@@ -6,7 +6,7 @@ module Egis
6
6
  class TestingMode
7
7
  def initialize(test_id, s3_bucket,
8
8
  client: Egis::Client.new,
9
- output_downloader: Egis::OutputDownloader.new,
9
+ output_downloader: Egis::OutputDownloader.new(client.aws_s3_client),
10
10
  s3_location_parser: Egis::S3LocationParser.new)
11
11
  @test_id = test_id
12
12
  @s3_bucket = s3_bucket
data/lib/egis/testing.rb CHANGED
@@ -43,6 +43,6 @@ module Egis # rubocop:disable Style/Documentation
43
43
  yield
44
44
  ensure
45
45
  @mode = previous_mode
46
- test_mode.cleanup if test_mode
46
+ test_mode&.cleanup
47
47
  end
48
48
  end
data/lib/egis/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Egis
4
- VERSION = '1.2.1'
4
+ VERSION = '2.0.0'
5
5
  end
data/lib/egis.rb CHANGED
@@ -50,8 +50,8 @@ require 'egis/s3_location_parser'
50
50
  #
51
51
  module Egis
52
52
  class << self
53
- def configure
54
- yield(configuration)
53
+ def configure(&block)
54
+ configuration.configure(&block)
55
55
  end
56
56
 
57
57
  # @!visibility private
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: egis
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Agnieszka Czereba
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-05-29 00:00:00.000000000 Z
12
+ date: 2021-12-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: aws-sdk-athena
@@ -99,7 +99,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
99
99
  - !ruby/object:Gem::Version
100
100
  version: '0'
101
101
  requirements: []
102
- rubygems_version: 3.1.2
102
+ rubygems_version: 3.1.6
103
103
  signing_key:
104
104
  specification_version: 4
105
105
  summary: A handy wrapper for AWS Athena Ruby SDK.