egis 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/egis/table.rb ADDED
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Interface for Athena table manipulation.
  #
  # It is recommended to create table objects using {Egis::Database#table} method.
  #
  # @!attribute [r] database
  #   @return [Egis::Database]
  # @!attribute [r] name
  #   @return [String] Athena table name
  # @!attribute [r] schema
  #   @return [Egis::TableSchema] table's schema object
  #
  class Table
    DEFAULT_OPTIONS = {format: :tsv}.freeze

    def initialize(database, name, schema, location, options: {},
                   partitions_generator: Egis::PartitionsGenerator.new,
                   table_ddl_generator: Egis::TableDDLGenerator.new,
                   output_downloader: Egis::OutputDownloader.new,
                   output_parser: Egis::OutputParser.new,
                   table_data_wiper: Egis::TableDataWiper.new)
      @database = database
      @name = name
      @schema = schema
      @location = location
      @options = DEFAULT_OPTIONS.merge(options)
      @partitions_generator = partitions_generator
      @table_ddl_generator = table_ddl_generator
      @output_downloader = output_downloader
      @output_parser = output_parser
      @table_data_wiper = table_data_wiper
    end

    attr_reader :database, :name, :schema

    ##
    # Creates table in Athena.
    #
    # @return [void]

    def create
      create_table_sql = table_ddl_generator.create_table_sql(self, permissive: true)
      database.execute_query(create_table_sql, async: false)
    end

    ##
    # The same as {#create} but raising error when table with a given name already exists.
    #
    # @return [void]

    def create!
      create_table_sql = table_ddl_generator.create_table_sql(self, permissive: false)
      database.execute_query(create_table_sql, async: false)
    end

    ##
    # Creates partitions with all possible combinations of given partition values.
    #
    # @example
    #   table.add_partitions(year: [2000, 2001], type: ['user'])
    #
    # @param [Hash] partitions
    # @return [void]

    def add_partitions(partitions)
      load_partitions_query = partitions_generator.to_sql(name, partitions, permissive: true)
      database.execute_query(load_partitions_query, async: false)
    end

    ##
    # (see add_partitions)
    # It raises error when a partition already exists.

    def add_partitions!(partitions)
      load_partitions_query = partitions_generator.to_sql(name, partitions, permissive: false)
      database.execute_query(load_partitions_query, async: false)
    end

    ##
    # Tells Athena to automatically discover table's partitions by scanning table's S3 location.
    # This operation might take long time with big number of partitions. If that's the case, instead of this method use
    # {#add_partitions} to define partitions manually.
    #
    # @return [void]

    def discover_partitions
      database.execute_query("MSCK REPAIR TABLE #{name};", async: false)
    end

    ##
    # Insert data into the table. Mostly useful for testing purposes.
    #
    # Every row has to provide one value per schema column followed by one value per partition column.
    #
    # @param [Array] rows Array of arrays with row values
    # @raise [ArgumentError] when a row's number of values differs from the number of table columns
    # @return [void]

    def upload_data(rows)
      query = data_insert_query(rows)
      database.execute_query(query, async: false)
    end

    ##
    # Downloads table contents into memory. Mostly useful for testing purposes.
    #
    # @return [Array] Array of arrays with row values.

    def download_data
      result = database.execute_query("SELECT * FROM #{name};", async: false)
      content = output_downloader.download(result.output_location)
      output_parser.parse(content, column_types)
    end

    ##
    # Removes table's content on S3. Optionally, you can limit files removed to specific partitions.
    #
    # @param [Hash] partitions Partitions values to remove. Follows the same argument format as {#add_partitions}.
    # @return [void]

    def wipe_data(partitions: nil)
      table_data_wiper.wipe_table_data(self, partitions)
    end

    ##
    # @return Table data format

    def format
      options.fetch(:format)
    end

    ##
    # @return [String] table location URL

    def location
      Egis.mode.s3_path(@location)
    end

    private

    attr_reader :options, :partitions_generator, :table_ddl_generator, :output_downloader, :output_parser,
                :table_data_wiper

    # One serializer per column type, in the same order as {#column_types}.
    def column_serializers
      @column_serializers ||= column_types.map { |type| Egis::Types.serializer(type) }
    end

    # Types of the regular columns followed by the types of the partition columns.
    def column_types
      (schema.columns + schema.partitions).map(&:type)
    end

    def data_insert_query(rows)
      <<~SQL
        INSERT INTO #{name} VALUES
        #{rows.map { |row| row_values_statement(row) }.join(",\n")};
      SQL
    end

    # Renders a single row as a parenthesised list of SQL literals.
    def row_values_statement(row)
      serializers = column_serializers

      # Zipping rows of a different width would either call a missing serializer or silently emit a truncated tuple,
      # so reject such rows before any SQL is built.
      unless row.size == serializers.size
        raise ArgumentError,
              "Row has #{row.size} values but table #{name} has #{serializers.size} columns: #{row.inspect}"
      end

      "(#{row.zip(serializers).map { |value, serializer| serializer.literal(value) }.join(', ')})"
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Removes a table's files from S3, either all of them or only those under the given partitions.
  class TableDataWiper
    def initialize(s3_location_parser: Egis::S3LocationParser.new,
                   s3_cleaner: Egis::S3Cleaner.new,
                   cartesian_product_generator: Egis::CartesianProductGenerator.new)
      @s3_location_parser = s3_location_parser
      @s3_cleaner = s3_cleaner
      @cartesian_product_generator = cartesian_product_generator
    end

    # Deletes the table's data.
    #
    # @param table table whose +location+ and +schema.partitions+ are read
    # @param [Hash, nil] partitions partition name => list of values; +nil+ wipes the whole table location.
    #   The keys must form a leading prefix of the table's partition columns.
    # @raise [Egis::Errors::PartitionError] when the given partitions are empty, contain an empty value list,
    #   or contain keys that are not a leading prefix of the table's partitions
    def wipe_table_data(table, partitions)
      bucket, location = s3_location_parser.parse_url(table.location)

      return s3_cleaner.delete(bucket, location) unless partitions

      values_to_remove = partition_values_to_remove(table, partitions)

      validate_partition_values(values_to_remove, partitions)

      remove_partition_files(bucket, location, values_to_remove)
    end

    private

    attr_reader :s3_location_parser, :s3_cleaner, :cartesian_product_generator

    # Picks the given values for the longest leading run of table partitions present in +partitions+.
    def partition_values_to_remove(table, partitions)
      table.schema.partitions.map(&:name)
           .take_while { |partition_name| partitions.key?(partition_name) }
           .map { |partition_name| [partition_name, partitions.fetch(partition_name)] }
           .to_h
    end

    def validate_partition_values(removed_partition_values, partitions)
      # A size mismatch means some given key was dropped (unknown name or a skipped partition level). Proceeding
      # would delete a broader prefix than the caller asked for, so it is treated as invalid input.
      valid = !removed_partition_values.empty? &&
              removed_partition_values.size == partitions.size &&
              removed_partition_values.values.none?(&:empty?)
      return if valid

      raise Egis::Errors::PartitionError, "Incorrect partitions given: #{partitions}"
    end

    # Deletes one S3 prefix ("name=value/name=value") per combination of partition values.
    def remove_partition_files(bucket, location, partitions_with_values)
      cartesian_product_generator.cartesian_product(partitions_with_values).each do |partition_value_set|
        partition_prefix = partition_value_set.map { |name_value| name_value.join('=') }.join('/')
        s3_cleaner.delete(bucket, "#{location}/#{partition_prefix}")
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # @!visibility private
  #
  # Builds the CREATE EXTERNAL TABLE statement describing a table.
  class TableDDLGenerator
    # Storage clause emitted for each supported table format.
    FORMAT_STATEMENTS = {
      csv: "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','",
      tsv: "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'",
      orc: 'STORED AS ORC'
    }.freeze
    private_constant :FORMAT_STATEMENTS

    # @param table object responding to +name+, +schema+, +format+ and +location+
    # @param [Boolean] permissive when truthy the statement contains IF NOT EXISTS
    # @return [String] DDL statement
    def create_table_sql(table, permissive: false)
      <<~SQL
        CREATE EXTERNAL TABLE #{permissive_statement(permissive)}#{table.name} (
          #{column_definition_sql(table.schema.columns)}
        )
        #{partition_statement(table.schema)}
        #{format_statement(table.format)}
        LOCATION '#{table.location}';
      SQL
    end

    private

    # Returns nil (interpolated as an empty string) for the strict variant.
    def permissive_statement(permissive_flag)
      return unless permissive_flag

      'IF NOT EXISTS '
    end

    # PARTITIONED BY clause, or nil for tables without partitions.
    def partition_statement(table_schema)
      partition_columns = table_schema.partitions
      return if partition_columns.empty?

      <<~SQL
        PARTITIONED BY (
          #{column_definition_sql(partition_columns)}
        )
      SQL
    end

    # One "`name` type" definition per line, comma separated.
    def column_definition_sql(columns)
      definitions = columns.map do |column|
        "`#{column.name}` #{column.type}"
      end
      definitions.join(",\n")
    end

    def format_statement(format)
      FORMAT_STATEMENTS.fetch(format) { raise Errors::UnsupportedTableFormat, format.to_s }
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Provides DSL for defining table schemas.
  #
  # @example Table schema definition
  #   schema = Egis::TableSchema.define do
  #     column :id, :int
  #     column :message, :string
  #
  #     partition :country, :string
  #     partition :type, :int
  #   end
  #
  # @!attribute [r] columns
  #   @return [Array<Egis::TableSchema::Column>] regular columns in definition order
  # @!attribute [r] partitions
  #   @return [Array<Egis::TableSchema::Column>] partition columns in definition order
  #
  class TableSchema
    # Name and type of a single column or partition.
    Column = Struct.new(:name, :type)

    ##
    # Builds a schema by evaluating the given block against a new schema instance.
    #
    # @return [Egis::TableSchema]

    def self.define(&block)
      new(&block)
    end

    def initialize(&block)
      @columns = []
      @partitions = []
      # Without a block there is nothing to evaluate; the schema simply stays empty.
      instance_eval(&block) if block
    end

    attr_reader :columns, :partitions

    private

    # DSL method: appends a regular column.
    def column(name, type)
      @columns << Column.new(name, type)
    end

    # DSL method: appends a partition column.
    def partition(name, type)
      @partitions << Column.new(name, type)
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Testing
    # @!visibility private
    #
    # Mode object used while testing: it rewrites S3 paths and database names so that they are scoped by a test id,
    # forces synchronous execution, and remembers whether it was used so cleanup can be skipped otherwise.
    class TestingMode
      def initialize(test_id, s3_bucket,
                     client: Egis::Client.new,
                     output_downloader: Egis::OutputDownloader.new,
                     s3_location_parser: Egis::S3LocationParser.new)
        @test_id = test_id
        @s3_bucket = s3_bucket
        @client = client
        @output_downloader = output_downloader
        @s3_location_parser = s3_location_parser
        @dirty = false
      end

      # Maps an S3 URL to a location inside the testing bucket, namespaced by test id and the original bucket.
      def s3_path(s3_url)
        touch
        original_bucket, object_key = @s3_location_parser.parse_url(s3_url)

        format('s3://%s/%s/%s/%s', @s3_bucket, @test_id, original_bucket, object_key)
      end

      # Prefixes the database name with the test id.
      def database_name(name)
        touch

        "#{@test_id}_#{name}"
      end

      # Always answers false, whatever flag was requested.
      def async(_async_flag)
        touch

        false
      end

      # Drops the databases belonging to this test id, but only when the mode was actually used.
      def cleanup
        return unless @dirty

        drop_test_databases
      end

      private

      def drop_test_databases
        listing = @client.execute_query("SHOW DATABASES LIKE '#{@test_id}.*';", async: false)
        database_names = @output_downloader.download(listing.output_location).flatten
        database_names.each { |database_name| @client.database(database_name).drop }
      end

      # Records that the mode has been used.
      def touch
        @dirty = true
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'securerandom'
4
+
5
+ require 'egis/testing/testing_mode'
6
+
7
module Egis # rubocop:disable Style/Documentation
  # @!visibility private
  module Testing
  end

  ##
  # Egis testing mode.
  # Every table and database created within method's block is mapped to a "virtual" table space in your testing
  # S3 bucket. Using it, you can insert test data to your production tables and they will be simulated within the
  # testing bucket, not touching actual locations.
  #
  # @example RSpec configuration
  #   # spec_helper.rb
  #
  #   require 'egis/testing'
  #
  #   Egis.configure do |config|
  #     config.testing_s3_bucket = 'testing-bucket'
  #   end
  #
  #   RSpec.configure do |config|
  #     config.around(:each) do |example|
  #       Egis.testing do
  #         example.run
  #       end
  #     end
  #   end
  #
  # @yield block executed while the testing mode is active
  # @return [void]

  def self.testing
    test_id = SecureRandom.hex
    test_mode = Egis::Testing::TestingMode.new(test_id, Egis.configuration.testing_s3_bucket)
    previous_mode = Egis.mode

    # Only the part running with the swapped mode is guarded. A failure during the setup above propagates as-is
    # instead of being replaced by an error raised from cleanup on a mode that was never created.
    begin
      @mode = test_mode
      yield
    ensure
      @mode = previous_mode
      test_mode.cleanup
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # @!visibility private
    #
    # Converts booleans to SQL literals and to/from their textual representation.
    class BooleanSerializer
      TRUE_LITERAL = 'TRUE'
      FALSE_LITERAL = 'FALSE'

      TRUE_VALUE = 'true'
      FALSE_VALUE = 'false'

      # @param [Boolean] value
      # @return [String] SQL literal
      # @raise [Egis::Errors::TypeError] when value is neither true nor false
      def literal(value)
        case value
        when true
          TRUE_LITERAL
        when false
          FALSE_LITERAL
        else
          illegal_value_error(value)
        end
      end

      # @param [Boolean] value
      # @return [String] textual representation
      # @raise [Egis::Errors::TypeError] when value is neither true nor false
      def dump(value)
        case value
        when true
          TRUE_VALUE
        when false
          FALSE_VALUE
        else
          illegal_value_error(value)
        end
      end

      # @param [String] string textual representation
      # @return [Boolean]
      # @raise [Egis::Errors::TypeError] when string is not one of the two textual representations
      def load(string)
        case string
        when TRUE_VALUE
          true
        when FALSE_VALUE
          false
        else
          illegal_value_error(string)
        end
      end

      private

      # Uses the Egis::Errors namespace, the same one the other Egis errors are raised from.
      def illegal_value_error(value)
        raise Egis::Errors::TypeError, "Illegal value '#{value}' for type boolean"
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # @!visibility private
    #
    # Fallback serializer: values are passed through unchanged and rendered as quoted SQL literals.
    class DefaultSerializer
      # @param value any object; its +to_s+ representation is quoted
      # @return [String] single-quoted SQL literal
      def literal(value)
        # Double embedded single quotes, as StringSerializer does, so such values cannot terminate the literal.
        "'#{value.to_s.gsub("'", "''")}'"
      end

      # @return the value unchanged
      def dump(value)
        value
      end

      # @return the string unchanged
      def load(string)
        string
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # @!visibility private
    #
    # Serializer for integer columns.
    class IntegerSerializer
      # @return [String] SQL literal, identical to the dumped representation
      def literal(integer)
        dump(integer)
      end

      # @return [String] textual representation of the integer
      def dump(integer)
        integer.to_s
      end

      # @param [String] string textual representation
      # @return [Integer] parsed value (String#to_i semantics)
      def load(string)
        string.to_i(10)
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # @!visibility private
    #
    # Decorator adding nil handling to another serializer: nil never reaches the wrapped serializer.
    class NullSerializer
      NULL_LITERAL = 'NULL'

      def initialize(wrapped_serializer)
        @wrapped_serializer = wrapped_serializer
      end

      # @return [String] NULL for nil, otherwise the wrapped serializer's literal
      def literal(value)
        value.nil? ? NULL_LITERAL : @wrapped_serializer.literal(value)
      end

      # @return nil for nil, otherwise the wrapped serializer's dump
      def dump(value)
        @wrapped_serializer.dump(value) unless value.nil?
      end

      # @return nil for nil, otherwise the wrapped serializer's load result
      def load(string)
        @wrapped_serializer.load(string) unless string.nil?
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # @!visibility private
    #
    # Serializer for string columns.
    class StringSerializer
      # @param [String] string
      # @return [String] single-quoted SQL literal with embedded single quotes doubled
      def literal(string)
        escaped = string.gsub("'") { "''" }
        "'#{escaped}'"
      end

      # @return [String] the string unchanged
      def dump(string)
        string
      end

      # @return [String] the string unchanged
      def load(string)
        string
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
require 'time' # Time.strptime is provided by the stdlib "time" library, not by core Time

module Egis
  module Types
    # @!visibility private
    #
    # Serializer for timestamp columns, using second precision.
    class TimestampSerializer
      ATHENA_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

      # @param [Time] time
      # @return [String] SQL timestamp literal
      def literal(time)
        "timestamp '#{dump(time)}'"
      end

      # @param [Time] time
      # @return [String] time formatted with ATHENA_TIME_FORMAT
      def dump(time)
        time.strftime(ATHENA_TIME_FORMAT)
      end

      # @param [String] string time formatted with ATHENA_TIME_FORMAT
      # @return [Time]
      def load(string)
        Time.strptime(string, ATHENA_TIME_FORMAT)
      end
    end
  end
end
data/lib/egis/types.rb ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'egis/types/boolean_serializer'
4
+ require 'egis/types/default_serializer'
5
+ require 'egis/types/integer_serializer'
6
+ require 'egis/types/string_serializer'
7
+ require 'egis/types/timestamp_serializer'
8
+ require 'egis/types/null_serializer'
9
+
10
module Egis
  # @!visibility private
  module Types
    # Names of the serializer classes (within this module) handling each supported column type.
    SERIALIZER_CLASS_NAMES = {
      timestamp: :TimestampSerializer,
      string: :StringSerializer,
      int: :IntegerSerializer,
      bigint: :IntegerSerializer,
      boolean: :BooleanSerializer
    }.freeze
    private_constant :SERIALIZER_CLASS_NAMES

    # Builds a nil-aware serializer for the given column type.
    #
    # @param [Symbol] type column type
    # @return [NullSerializer] type serializer wrapped with nil handling
    # @raise [Errors::TypeError] when the type is not supported
    def self.serializer(type)
      class_name = SERIALIZER_CLASS_NAMES.fetch(type) { raise Errors::TypeError, "Unsupported type: #{type}" }

      NullSerializer.new(const_get(class_name).new)
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # Version of the gem.
  VERSION = '1.1.0'
end
data/lib/egis.rb ADDED
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'egis/version'
4
+ require 'egis/errors'
5
+ require 'egis/configuration'
6
+ require 'egis/types'
7
+ require 'egis/query_status'
8
+ require 'egis/aws_client_provider'
9
+ require 'egis/s3_cleaner'
10
+ require 'egis/output_downloader'
11
+ require 'egis/output_parser'
12
+ require 'egis/client'
13
+ require 'egis/cartesian_product_generator'
14
+ require 'egis/partitions_generator'
15
+ require 'egis/table_data_wiper'
16
+ require 'egis/table'
17
+ require 'egis/database'
18
+ require 'egis/query_output_location'
19
+ require 'egis/table_ddl_generator'
20
+ require 'egis/table_schema'
21
+ require 'egis/standard_mode'
22
+ require 'egis/s3_location_parser'
23
+
24
+ ##
25
+ # Egis is configured using Egis.configure block.
26
+ #
27
+ # @example Configuration using AWS access key ID and secret
28
+ # Egis.configure do |config|
29
+ # config.aws_region = 'AWS region'
30
+ # config.aws_access_key_id = 'AWS key ID'
31
+ # config.aws_secret_access_key = 'AWS secret key'
32
+ # config.work_group = 'egis-integration-testing'
33
+ # end
34
+ #
35
+ # If you don't specify credentials they will be looked up in the default locations. For more information see
36
+ # {https://docs.aws.amazon.com/sdk-for-ruby/v3/developer-guide/setup-config.html}
37
+ #
38
+ # @example Use specific credentials profile from `~/.aws/credentials`
39
+ # Egis.configure do |config|
40
+ # config.aws_profile = 'my-profile'
41
+ # end
42
+ #
43
+ # @yield [Egis::Configuration]
44
+ # @return [void]
45
+ #
46
module Egis
  # Yields the shared configuration object so that the caller can adjust it.
  #
  # @yield [Egis::Configuration]
  def self.configure
    yield(configuration)
  end

  # Configuration object, created on first access and reused afterwards.
  #
  # @!visibility private
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Currently active mode; a standard mode is created on first access when none has been set.
  #
  # @!visibility private
  def self.mode
    @mode ||= Egis::StandardMode.new
  end
end