egis 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/egis/table.rb ADDED
@@ -0,0 +1,163 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Interface for Athena table manipulation.
  #
  # It is recommended to create table objects using {Egis::Database#table} method.
  #
  # @!attribute [r] database
  #   @return [Egis::Database]
  # @!attribute [r] name
  #   @return [String] Athena table name
  # @!attribute [r] schema
  #   @return [Egis::TableSchema] table's schema object
  #
  class Table
    DEFAULT_OPTIONS = {format: :tsv}.freeze

    ##
    # @param [Egis::Database] database database object used to execute the generated queries
    # @param [String] name table name
    # @param [Egis::TableSchema] schema table's schema object
    # @param [String] location S3 URL of the table data
    # @param [Hash] options table options merged over {DEFAULT_OPTIONS}
    #
    # The remaining keyword arguments are collaborators that can be replaced, e.g. in tests.

    def initialize(database, name, schema, location, options: {},
                   partitions_generator: Egis::PartitionsGenerator.new,
                   table_ddl_generator: Egis::TableDDLGenerator.new,
                   output_downloader: Egis::OutputDownloader.new,
                   output_parser: Egis::OutputParser.new,
                   table_data_wiper: Egis::TableDataWiper.new)
      @database = database
      @name = name
      @schema = schema
      @location = location
      @options = DEFAULT_OPTIONS.merge(options)
      @partitions_generator = partitions_generator
      @table_ddl_generator = table_ddl_generator
      @output_downloader = output_downloader
      @output_parser = output_parser
      @table_data_wiper = table_data_wiper
    end

    attr_reader :database, :name, :schema

    ##
    # Creates table in Athena. The DDL is generated in permissive mode.
    #
    # @return [void]

    def create
      create_table_sql = table_ddl_generator.create_table_sql(self, permissive: true)
      database.execute_query(create_table_sql, async: false)
    end

    ##
    # The same as {#create} but raising error when table with a given name already exists.
    #
    # @return [void]

    def create!
      create_table_sql = table_ddl_generator.create_table_sql(self, permissive: false)
      database.execute_query(create_table_sql, async: false)
    end

    ##
    # Creates partitions with all possible combinations of given partition values.
    #
    # @example
    #   table.add_partitions(year: [2000, 2001], type: ['user'])
    #
    # @param [Hash] partitions
    # @return [void]

    def add_partitions(partitions)
      load_partitions_query = partitions_generator.to_sql(name, partitions, permissive: true)
      database.execute_query(load_partitions_query, async: false)
    end

    ##
    # (see add_partitions)
    # It raises error when a partition already exists.

    def add_partitions!(partitions)
      load_partitions_query = partitions_generator.to_sql(name, partitions, permissive: false)
      database.execute_query(load_partitions_query, async: false)
    end

    ##
    # Tells Athena to automatically discover table's partitions by scanning table's S3 location.
    # This operation might take long time with big number of partitions. If that's the case, instead of this method use
    # {#add_partitions} to define partitions manually.
    #
    # @return [void]

    def discover_partitions
      database.execute_query("MSCK REPAIR TABLE #{name};", async: false)
    end

    ##
    # Insert data into the table. Mostly useful for testing purposes.
    # No query is executed when +rows+ is empty.
    #
    # @param [Array] rows Array of arrays with row values
    # @return [void]

    def upload_data(rows)
      # An INSERT statement with an empty VALUES list is not valid SQL, so there is nothing to send.
      return if rows.empty?

      query = data_insert_query(rows)
      database.execute_query(query, async: false)
    end

    ##
    # Downloads table contents into memory. Mostly useful for testing purposes.
    #
    # @return [Array] Array of arrays with row values.

    def download_data
      result = database.execute_query("SELECT * FROM #{name};", async: false)
      content = output_downloader.download(result.output_location)
      output_parser.parse(content, column_types)
    end

    ##
    # Removes table's content on S3. Optionally, you can limit files removed to specific partitions.
    #
    # @param [Hash] partitions Partitions values to remove. Follows the same argument format as {#add_partitions}.
    # @return [void]

    def wipe_data(partitions: nil)
      table_data_wiper.wipe_table_data(self, partitions)
    end

    ##
    # @return Table data format

    def format
      options.fetch(:format)
    end

    ##
    # @return [String] table location URL, translated by the current Egis mode

    def location
      Egis.mode.s3_path(@location)
    end

    private

    attr_reader :options, :partitions_generator, :table_ddl_generator, :output_downloader, :output_parser,
                :table_data_wiper

    # Serializers for all columns followed by all partitions, in schema order (memoized).
    def column_serializers
      @column_serializers ||= column_types.map { |type| Egis::Types.serializer(type) }
    end

    # Types of regular columns followed by types of partition columns.
    def column_types
      (schema.columns + schema.partitions).map(&:type)
    end

    # Builds a single INSERT statement with one VALUES tuple per row.
    def data_insert_query(rows)
      <<~SQL
        INSERT INTO #{name} VALUES
        #{rows.map { |row| row_values_statement(row) }.join(",\n")};
      SQL
    end

    # Renders one row as a parenthesised list of SQL literals, pairing each value with its column serializer.
    def row_values_statement(row)
      "(#{row.zip(column_serializers).map { |value, serializer| serializer.literal(value) }.join(', ')})"
    end
  end
end
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # Removes table data files from S3, either for the whole table or for selected partitions.
  #
  # @!visibility private
  class TableDataWiper
    def initialize(s3_location_parser: Egis::S3LocationParser.new,
                   s3_cleaner: Egis::S3Cleaner.new,
                   cartesian_product_generator: Egis::CartesianProductGenerator.new)
      @s3_location_parser = s3_location_parser
      @s3_cleaner = s3_cleaner
      @cartesian_product_generator = cartesian_product_generator
    end

    # Deletes everything under the table location when +partitions+ is nil; otherwise deletes only the
    # prefixes built from the given partition values.
    #
    # @raise [Egis::Errors::PartitionError] when no usable partition values are given
    def wipe_table_data(table, partitions)
      bucket, location = s3_location_parser.parse_url(table.location)

      if partitions
        selected_values = partition_values_to_remove(table, partitions)
        validate_partition_values(selected_values, partitions)
        remove_partition_files(bucket, location, selected_values)
      else
        s3_cleaner.delete(bucket, location)
      end
    end

    private

    attr_reader :s3_location_parser, :s3_cleaner, :cartesian_product_generator

    # Keeps the leading run of table partitions (in schema order) for which values were given,
    # mapped to those values.
    def partition_values_to_remove(table, partitions)
      given_names = partitions.keys

      table.schema.partitions
           .map(&:name)
           .take_while { |partition_name| given_names.include?(partition_name) }
           .each_with_object({}) do |partition_name, selected|
        selected[partition_name] = partitions.fetch(partition_name)
      end
    end

    # Rejects a selection that is empty or that contains a partition without any values.
    def validate_partition_values(removed_partition_values, partitions)
      unusable = removed_partition_values.empty? || removed_partition_values.values.any?(&:empty?)

      raise Egis::Errors::PartitionError, "Incorrect partitions given: #{partitions}" if unusable
    end

    # Deletes one S3 prefix per combination of partition values.
    def remove_partition_files(bucket, location, partitions_with_values)
      combinations = cartesian_product_generator.cartesian_product(partitions_with_values)

      combinations.each do |combination|
        segments = combination.map { |name_value| name_value.join('=') }
        s3_cleaner.delete(bucket, "#{location}/#{segments.join('/')}")
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # Generates the CREATE EXTERNAL TABLE statement for a table object.
  #
  # @!visibility private
  class TableDDLGenerator
    # Builds the DDL from the table's name, schema, format and location.
    # With +permissive+ set, the statement contains IF NOT EXISTS.
    def create_table_sql(table, permissive: false)
      table_schema = table.schema

      <<~SQL
        CREATE EXTERNAL TABLE #{permissive_statement(permissive)}#{table.name} (
          #{column_definition_sql(table_schema.columns)}
        )
        #{partition_statement(table_schema)}
        #{format_statement(table.format)}
        LOCATION '#{table.location}';
      SQL
    end

    private

    # Returns the IF NOT EXISTS clause (with trailing space) or nil.
    def permissive_statement(permissive_flag)
      return unless permissive_flag

      'IF NOT EXISTS '
    end

    # Returns the PARTITIONED BY clause, or nil when the schema has no partitions.
    def partition_statement(table_schema)
      partition_columns = table_schema.partitions
      return if partition_columns.empty?

      <<~SQL
        PARTITIONED BY (
          #{column_definition_sql(partition_columns)}
        )
      SQL
    end

    # Renders columns as backtick-quoted name followed by type, one per line.
    def column_definition_sql(columns)
      definitions = columns.map do |column|
        "`#{column.name}` #{column.type}"
      end
      definitions.join(",\n")
    end

    # Maps the table format symbol to its storage clause.
    def format_statement(format)
      case format
      when :csv then "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"
      when :tsv then "ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t'"
      when :orc then 'STORED AS ORC'
      else raise Errors::UnsupportedTableFormat, format.to_s
      end
    end
  end
end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  ##
  # Provides DSL for defining table schemas.
  #
  # @example Table schema definition
  #   schema = Egis::TableSchema.define do
  #     column :id, :int
  #     column :message, :string
  #
  #     partition :country, :string
  #     partition :type, :int
  #   end
  #
  # @!attribute [r] columns
  #   @return [Array<Egis::TableSchema::Column>] columns in definition order
  # @!attribute [r] partitions
  #   @return [Array<Egis::TableSchema::Column>] partition columns in definition order
  #
  class TableSchema
    # Name/type pair describing a single column or partition.
    Column = Struct.new(:name, :type)

    ##
    # Builds a schema by evaluating the given block in the context of a new schema object.
    #
    # @return [Egis::TableSchema]

    def self.define(&block)
      new(&block)
    end

    ##
    # The block, when given, is evaluated with +instance_eval+ so that +column+ and +partition+
    # can be called without a receiver. Without a block the schema is empty.

    def initialize(&block)
      @columns = []
      @partitions = []
      # instance_eval raises an unrelated ArgumentError when called with neither a block nor a string.
      instance_eval(&block) if block
    end

    attr_reader :columns, :partitions

    private

    # DSL method: appends a regular column.
    def column(name, type)
      @columns << Column.new(name, type)
    end

    # DSL method: appends a partition column.
    def partition(name, type)
      @partitions << Column.new(name, type)
    end
  end
end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Testing
    # Mode object that redirects S3 paths and database names into a per-test namespace and
    # remembers whether it has been used, so that cleanup only runs when needed.
    #
    # @!visibility private
    class TestingMode
      def initialize(test_id, s3_bucket,
                     client: Egis::Client.new,
                     output_downloader: Egis::OutputDownloader.new,
                     s3_location_parser: Egis::S3LocationParser.new)
        @test_id = test_id
        @s3_bucket = s3_bucket
        @dirty = false
        @client = client
        @output_downloader = output_downloader
        @s3_location_parser = s3_location_parser
      end

      # Rewrites an S3 URL so that it points under the testing bucket and the test id.
      def s3_path(s3_url)
        mark_dirty

        original_bucket, original_key = s3_location_parser.parse_url(s3_url)
        "s3://#{s3_bucket}/#{test_id}/#{original_bucket}/#{original_key}"
      end

      # Prefixes the database name with the test id.
      def database_name(name)
        mark_dirty

        "#{test_id}_#{name}"
      end

      # Always answers false, whatever flag was requested.
      def async(_async_flag)
        mark_dirty

        false
      end

      # Drops the databases created for this test id; does nothing when the mode was never used.
      def cleanup
        return unless @dirty

        remove_test_databases
      end

      private

      attr_reader :test_id, :s3_bucket, :client, :output_downloader, :s3_location_parser

      # Lists databases matching the test id and drops each of them.
      def remove_test_databases
        listing = client.execute_query("SHOW DATABASES LIKE '#{test_id}.*';", async: false)
        found_databases = output_downloader.download(listing.output_location).flatten

        found_databases.each do |found_database|
          client.database(found_database).drop
        end
      end

      def mark_dirty
        @dirty = true
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'securerandom'
4
+
5
+ require 'egis/testing/testing_mode'
6
+
7
module Egis # rubocop:disable Style/Documentation
  # @!visibility private
  module Testing
  end

  ##
  # Egis testing mode.
  # Every table and database created within the method's block is mapped to a "virtual" table space in your testing
  # S3 bucket. Using it, you can insert test data to your production tables and they will be simulated within the
  # testing bucket, not touching actual locations.
  #
  # The previously active mode is restored when the block finishes, also when it raises, and the testing mode's
  # cleanup is run afterwards. The block's value is returned.
  #
  # @example RSpec configuration
  #   # spec_helper.rb
  #
  #   require 'egis/testing'
  #
  #   Egis.configure do |config|
  #     config.testing_s3_bucket = 'testing-bucket'
  #   end
  #
  #   RSpec.configure do |config|
  #     config.around(:each) do |example|
  #       Egis.testing do
  #         example.run
  #       end
  #     end
  #   end
  #
  # @return [void]

  def self.testing
    # Capture the current mode and build the testing mode BEFORE entering the guarded section: if the setup
    # itself fails, the active mode must stay untouched and the setup error must surface as-is.
    previous_mode = Egis.mode
    test_mode = Egis::Testing::TestingMode.new(SecureRandom.hex, Egis.configuration.testing_s3_bucket)

    begin
      @mode = test_mode
      yield
    ensure
      @mode = previous_mode
      test_mode.cleanup
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # Converts between Ruby booleans, SQL boolean literals and the textual values found in query output.
    #
    # @!visibility private
    class BooleanSerializer
      TRUE_LITERAL = 'TRUE'
      FALSE_LITERAL = 'FALSE'

      TRUE_VALUE = 'true'
      FALSE_VALUE = 'false'

      # @param value [Boolean]
      # @return [String] SQL literal, 'TRUE' or 'FALSE'
      # @raise [Egis::Errors::TypeError] when value is neither true nor false
      def literal(value)
        case value
        when true
          TRUE_LITERAL
        when false
          FALSE_LITERAL
        else
          illegal_value_error(value)
        end
      end

      # @param value [Boolean]
      # @return [String] textual value, 'true' or 'false'
      # @raise [Egis::Errors::TypeError] when value is neither true nor false
      def dump(value)
        case value
        when true
          TRUE_VALUE
        when false
          FALSE_VALUE
        else
          illegal_value_error(value)
        end
      end

      # @param string [String] textual value, 'true' or 'false'
      # @return [Boolean]
      # @raise [Egis::Errors::TypeError] when the string is not one of the two accepted values
      def load(string)
        case string
        when TRUE_VALUE
          true
        when FALSE_VALUE
          false
        else
          illegal_value_error(string)
        end
      end

      private

      # Raises the library's type error; the error classes live under Egis::Errors.
      def illegal_value_error(value)
        raise Egis::Errors::TypeError, "Illegal value '#{value}' for type boolean"
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # Pass-through serializer: values are dumped and loaded unchanged and rendered as quoted SQL literals.
    #
    # @!visibility private
    class DefaultSerializer
      # @param value [Object] rendered with +to_s+
      # @return [String] single-quoted SQL literal
      def literal(value)
        # Double embedded single quotes, as StringSerializer does, so that the value cannot terminate the literal.
        "'#{value.to_s.gsub("'", "''")}'"
      end

      # @return [Object] the value unchanged
      def dump(value)
        value
      end

      # @return [String] the string unchanged
      def load(string)
        string
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # Converts integers to their decimal text form and back.
    #
    # @!visibility private
    class IntegerSerializer
      # @return [String] the integer as an unquoted SQL literal
      def literal(integer)
        integer.to_s
      end

      # @return [String] the integer's text form
      def dump(integer)
        integer.to_s
      end

      # @return [Integer] the result of String#to_i on the given string
      def load(string)
        string.to_i
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # Decorator that handles nil itself and delegates every other value to the wrapped serializer.
    #
    # @!visibility private
    class NullSerializer
      NULL_LITERAL = 'NULL'

      def initialize(wrapped_serializer)
        @wrapped_serializer = wrapped_serializer
      end

      # @return [String] 'NULL' for nil, otherwise the wrapped serializer's literal
      def literal(value)
        value.nil? ? NULL_LITERAL : wrapped_serializer.literal(value)
      end

      # @return [Object, nil] nil for nil, otherwise the wrapped serializer's dump
      def dump(value)
        wrapped_serializer.dump(value) unless value.nil?
      end

      # @return [Object, nil] nil for nil, otherwise the wrapped serializer's load
      def load(string)
        wrapped_serializer.load(string) unless string.nil?
      end

      private

      attr_reader :wrapped_serializer
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  module Types
    # Renders strings as single-quoted SQL literals; dump and load pass the string through.
    #
    # @!visibility private
    class StringSerializer
      # @return [String] the string wrapped in single quotes, with embedded single quotes doubled
      def literal(string)
        escaped = string.gsub("'", "''")
        "'#{escaped}'"
      end

      # @return [String] the string unchanged
      def dump(string)
        string
      end

      # @return [String] the string unchanged
      def load(string)
        string
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # frozen_string_literal: true
2
+
3
require 'time' # Time.strptime is provided by the 'time' standard library extension, not by core Time

module Egis
  module Types
    # Converts Time values to and from the '%Y-%m-%d %H:%M:%S' text format.
    #
    # @!visibility private
    class TimestampSerializer
      ATHENA_TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

      # @param time [Time]
      # @return [String] SQL timestamp literal, e.g. timestamp '2020-01-02 03:04:05'
      def literal(time)
        "timestamp '#{dump(time)}'"
      end

      # @param time [Time]
      # @return [String] the time formatted with ATHENA_TIME_FORMAT (no zone information is included)
      def dump(time)
        time.strftime(ATHENA_TIME_FORMAT)
      end

      # @param string [String] text in ATHENA_TIME_FORMAT
      # @return [Time]
      def load(string)
        Time.strptime(string, ATHENA_TIME_FORMAT)
      end
    end
  end
end
data/lib/egis/types.rb ADDED
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'egis/types/boolean_serializer'
4
+ require 'egis/types/default_serializer'
5
+ require 'egis/types/integer_serializer'
6
+ require 'egis/types/string_serializer'
7
+ require 'egis/types/timestamp_serializer'
8
+ require 'egis/types/null_serializer'
9
+
10
module Egis
  # @!visibility private
  module Types
    # Returns the serializer for the given column type, wrapped in a NullSerializer.
    #
    # @raise [Egis::Errors::TypeError] when the type is not one of the supported ones
    def self.serializer(type)
      NullSerializer.new(type_serializer_for(type))
    end

    # Picks the type-specific serializer for a column type symbol.
    def self.type_serializer_for(type)
      case type
      when :timestamp then TimestampSerializer.new
      when :string then StringSerializer.new
      when :int, :bigint then IntegerSerializer.new
      when :boolean then BooleanSerializer.new
      else raise Errors::TypeError, "Unsupported type: #{type}"
      end
    end
    private_class_method :type_serializer_for
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Egis
  # Version string of the gem.
  VERSION = '1.1.0'
end
data/lib/egis.rb ADDED
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'egis/version'
4
+ require 'egis/errors'
5
+ require 'egis/configuration'
6
+ require 'egis/types'
7
+ require 'egis/query_status'
8
+ require 'egis/aws_client_provider'
9
+ require 'egis/s3_cleaner'
10
+ require 'egis/output_downloader'
11
+ require 'egis/output_parser'
12
+ require 'egis/client'
13
+ require 'egis/cartesian_product_generator'
14
+ require 'egis/partitions_generator'
15
+ require 'egis/table_data_wiper'
16
+ require 'egis/table'
17
+ require 'egis/database'
18
+ require 'egis/query_output_location'
19
+ require 'egis/table_ddl_generator'
20
+ require 'egis/table_schema'
21
+ require 'egis/standard_mode'
22
+ require 'egis/s3_location_parser'
23
+
24
+ ##
25
+ # Egis is configured using Egis.configure block.
26
+ #
27
+ # @example Configuration using AWS access key ID and secret
28
+ # Egis.configure do |config|
29
+ # config.aws_region = 'AWS region'
30
+ # config.aws_access_key_id = 'AWS key ID'
31
+ # config.aws_secret_access_key = 'AWS secret key'
32
+ # config.work_group = 'egis-integration-testing'
33
+ # end
34
+ #
35
+ # If you don't specify credentials they will be looked up in the default locations. For more information see
36
+ # {https://docs.aws.amazon.com/sdk-for-ruby/v3/developer-guide/setup-config.html}
37
+ #
38
+ # @example Use specific credentials profile from `~/.aws/credentials`
39
+ # Egis.configure do |config|
40
+ # config.aws_profile = 'my-profile'
41
+ # end
42
+ #
43
+ # @yield [Egis::Configuration]
44
+ # @return [void]
45
+ #
46
module Egis
  # Yields the memoized configuration object to the given block.
  def self.configure
    yield(configuration)
  end

  # Lazily created, module-wide configuration object.
  #
  # @!visibility private
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Currently active mode; a StandardMode is created on first use when none is set.
  #
  # @!visibility private
  def self.mode
    @mode ||= Egis::StandardMode.new
  end
end