redshift-connector 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +7 -0
  2. data/README.md +10 -0
  3. data/lib/redshift-connector.rb +31 -0
  4. data/lib/redshift-connector/connector.rb +146 -0
  5. data/lib/redshift-connector/exporter.rb +116 -0
  6. data/lib/redshift-connector/importer.rb +89 -0
  7. data/lib/redshift-connector/importer/activerecord-import.rb +2 -0
  8. data/lib/redshift-connector/importer/insert_delta.rb +32 -0
  9. data/lib/redshift-connector/importer/rebuild_rename.rb +41 -0
  10. data/lib/redshift-connector/importer/rebuild_truncate.rb +31 -0
  11. data/lib/redshift-connector/importer/upsert.rb +25 -0
  12. data/lib/redshift-connector/logger.rb +20 -0
  13. data/lib/redshift-connector/query.rb +93 -0
  14. data/lib/redshift-connector/reader.rb +18 -0
  15. data/lib/redshift-connector/reader/abstract.rb +18 -0
  16. data/lib/redshift-connector/reader/csv.rb +24 -0
  17. data/lib/redshift-connector/reader/exception.rb +3 -0
  18. data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
  19. data/lib/redshift-connector/reader/tsv.rb +24 -0
  20. data/lib/redshift-connector/s3_bucket.rb +72 -0
  21. data/lib/redshift-connector/s3_data_file.rb +34 -0
  22. data/lib/redshift-connector/s3_data_file_bundle.rb +101 -0
  23. data/lib/redshift-connector/version.rb +3 -0
  24. data/test/all.rb +3 -0
  25. data/test/config.rb +13 -0
  26. data/test/config.rb.example +18 -0
  27. data/test/database.yml +15 -0
  28. data/test/database.yml.example +15 -0
  29. data/test/foreach.rb +5 -0
  30. data/test/helper.rb +25 -0
  31. data/test/item_pvs.ct.mysql +11 -0
  32. data/test/item_pvs.ct.redshift +9 -0
  33. data/test/reader/test_redshift_csv.rb +30 -0
  34. data/test/test_connector.rb +148 -0
  35. data/test/test_reader.rb +10 -0
  36. data/test/test_s3_import.rb +32 -0
  37. metadata +190 -0

data/lib/redshift-connector/importer/rebuild_truncate.rb
@@ -0,0 +1,31 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::RebuildTruncate
+     def initialize(dao:, bundle:, columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @logger = logger
+     end
+
+     def execute
+       truncate_table(@dao.table_name)
+       import
+     end
+
+     def truncate_table(table_name)
+       @logger.info "TRUNCATE #{table_name}"
+       @dao.connection.execute("truncate #{table_name}")
+       @logger.info "truncated."
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows)
+       end
+     end
+   end
+ end

data/lib/redshift-connector/importer/upsert.rb
@@ -0,0 +1,25 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::Upsert
+     def initialize(dao:, bundle:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @upsert_columns = upsert_columns
+       @logger = logger
+     end
+
+     def execute
+       import
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+       end
+     end
+   end
+ end
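
Both importer strategies above are driven the same way: construct with a dao (an ActiveRecord model extended by activerecord-import), an S3 data file bundle, and column lists, then call #execute. A rough sketch only; Item and bundle are hypothetical stand-ins, not part of this package:

# Sketch only: Item is a hypothetical ActiveRecord model (activerecord-import
# provides Item.import), and bundle is an S3DataFileBundle built elsewhere.
importer = RedshiftConnector::Importer::Upsert.new(
  dao: Item,
  bundle: bundle,
  columns: %w[id name price],
  upsert_columns: %w[name price]
)
importer.execute  # each batch is imported with ON DUPLICATE KEY UPDATE on name/price

# Importer::RebuildTruncate is driven the same way, minus upsert_columns;
# its #execute truncates the target table before importing.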

data/lib/redshift-connector/logger.rb
@@ -0,0 +1,20 @@
+ module RedshiftConnector
+   @logger = nil
+
+   def RedshiftConnector.logger
+     # Defer to access Rails
+     @logger || Rails.logger
+   end
+
+   def RedshiftConnector.logger=(logger)
+     @logger = logger
+   end
+
+   class NullLogger
+     def noop(*args) end
+     alias error noop
+     alias warn noop
+     alias info noop
+     alias debug noop
+   end
+ end
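
By default the connector falls back to Rails.logger, resolved lazily at call time. Outside Rails (or in tests) a logger can be assigned explicitly; a minimal sketch:

require 'logger'
require 'redshift-connector/logger'

# Any object responding to #error/#warn/#info/#debug works here.
RedshiftConnector.logger = Logger.new($stderr)

# Or silence connector output entirely with the bundled null logger:
RedshiftConnector.logger = RedshiftConnector::NullLogger.new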

data/lib/redshift-connector/query.rb
@@ -0,0 +1,93 @@
+ module RedshiftConnector
+   class DeltaQuery
+     def initialize(schema:, table:, columns:, condition: nil)
+       @schema = schema
+       @table = table
+       @columns = columns
+       @condition = condition
+     end
+
+     def table_spec
+       "#{@schema}.#{@table}"
+     end
+
+     def description
+       "#{table_spec} (#{@columns.join(', ')}) where (#{@condition})"
+     end
+
+     def to_sql
+       "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+         + " from #{table_spec}" \
+         + (@condition ? " where #{@condition}" : '')
+     end
+   end
+
+   class SelectAllQuery
+     def initialize(schema:, table:, columns:)
+       @schema = schema
+       @table = table
+       @columns = columns
+     end
+
+     def table_spec
+       "#{@schema}.#{@table}"
+     end
+
+     def description
+       "#{table_spec} (#{@columns.join(', ')})"
+     end
+
+     def to_sql
+       "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+         + " from #{table_spec}"
+     end
+   end
+
+   class UnloadQuery
+     def UnloadQuery.wrap(query:, bundle:)
+       new(query: ArbitraryQuery.new(query), bundle: bundle)
+     end
+
+     def initialize(query:, bundle:)
+       @query = query
+       @bundle = bundle
+     end
+
+     def table_spec
+       @query.table_spec
+     end
+
+     def description
+       @query.description
+     end
+
+     def to_sql
+       <<-EndSQL.gsub(/^\s+/, '')
+         unload ('#{escape_query(@query.to_sql)}')
+         to '#{@bundle.url}'
+         credentials '#{@bundle.credential_string}'
+         gzip
+         allowoverwrite
+         delimiter ',' escape addquotes
+       EndSQL
+     end
+
+     def escape_query(query)
+       query.gsub("'", "\\\\'")
+     end
+   end
+
+   class ArbitraryQuery
+     def initialize(query)
+       @query = query
+     end
+
+     def description
+       @query
+     end
+
+     def to_sql
+       @query
+     end
+   end
+ end
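
Taken together, these classes build the UNLOAD statement sent to Redshift: a DeltaQuery or SelectAllQuery renders the inner SELECT, and UnloadQuery wraps it with the S3 destination and credentials supplied by a bundle. An illustrative sketch (the struct stands in for an S3DataFileBundle, and the bucket URL and IAM role are placeholders):

require 'redshift-connector/query'

# Stand-in for S3DataFileBundle: UnloadQuery only needs #url and #credential_string.
FakeBundle = Struct.new(:url, :credential_string)

query = RedshiftConnector::DeltaQuery.new(
  schema: 'public',
  table: 'item_pvs',
  columns: %w[id item_id updated_at],
  condition: "updated_at > '2017-01-01'"
)
bundle = FakeBundle.new(
  's3://my-bucket/exports/item_pvs.csv.',
  'aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload'
)
puts RedshiftConnector::UnloadQuery.new(query: query, bundle: bundle).to_sql
# unload ('select "id", "item_id", "updated_at" from public.item_pvs where updated_at > \'2017-01-01\'')
# to 's3://my-bucket/exports/item_pvs.csv.'
# credentials 'aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload'
# ...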

data/lib/redshift-connector/reader.rb
@@ -0,0 +1,18 @@
+ # create module
+ module RedshiftConnector
+   module Reader
+   end
+ end
+
+ require 'redshift-connector/reader/redshift_csv'
+ require 'redshift-connector/reader/csv'
+ require 'redshift-connector/reader/tsv'
+ require 'redshift-connector/reader/exception'
+
+ module RedshiftConnector
+   module Reader
+     def Reader.get(id)
+       Abstract.get_reader_class(id)
+     end
+   end
+ end

data/lib/redshift-connector/reader/abstract.rb
@@ -0,0 +1,18 @@
+ module RedshiftConnector
+   class Reader::Abstract
+     READER_CLASSES = {} # {Symbol => Class}
+
+     def self.declare_reader(id)
+       READER_CLASSES[id.to_sym] = self
+     end
+
+     def self.get_reader_class(id)
+       READER_CLASSES[id.to_sym] or
+         raise ArgumentError, "unknown data file reader type: #{id.inspect}"
+     end
+   end
+
+   def self.get_reader_class(id)
+     Reader::Abstract.get_reader_class(id)
+   end
+ end
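
Reader::Abstract keeps a class-level registry: each concrete reader calls declare_reader with a format symbol, and Reader.get (or RedshiftConnector.get_reader_class) resolves that symbol back to a class, raising ArgumentError for unknown formats. For example:

require 'redshift-connector/reader'

RedshiftConnector::Reader.get(:csv)          # => RedshiftConnector::Reader::CSV
RedshiftConnector::Reader.get(:redshift_csv) # => RedshiftConnector::Reader::RedshiftCSV
RedshiftConnector::Reader.get(:json)         # => ArgumentError: unknown data file reader type: :json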

data/lib/redshift-connector/reader/csv.rb
@@ -0,0 +1,24 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+ require 'csv'
+
+ module RedshiftConnector
+   # Parses (standard) CSV files.
+   # For UNLOAD-generated CSV, use RedshiftCSV class.
+   class Reader::CSV < Reader::Abstract
+     declare_reader :csv
+
+     def self.data_object?(obj)
+       /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     def initialize(f)
+       @f = f
+     end
+
+     def each(&block)
+       csv = CSV.new(@f)
+       csv.each(&block)
+     end
+   end
+ end

data/lib/redshift-connector/reader/exception.rb
@@ -0,0 +1,3 @@
+ module RedshiftConnector
+   class Reader::MalformedCSVException < StandardError; end
+ end

data/lib/redshift-connector/reader/redshift_csv.rb
@@ -0,0 +1,54 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+
+ module RedshiftConnector
+   # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
+   # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
+   class Reader::RedshiftCSV < Reader::Abstract
+     declare_reader :redshift_csv
+
+     def self.data_object?(obj)
+       /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     # f :: IO
+     def initialize(f)
+       @f = f
+     end
+
+     def each
+       # We can use simple #each_line to read single row
+       # because line terminators are always escaped by UNLOAD.
+       @f.each_line do |line|
+         yield parse_row(line, @f.lineno)
+       end
+     end
+
+     def parse_row(line, lineno = nil)
+       row = []
+       s = StringScanner.new(line)
+       s.skip(/\s+/)
+       until s.eos?
+         col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
+         row.push unescape_column(col)
+         s.skip(/\s*/) # skip line terminator on line ends
+         s.skip(/,\s*/)
+       end
+       row
+     end
+
+     UNESCAPE_MAP = {
+       '\\"' => '"',
+       "\\'" => "'",
+       '\\,' => ',',
+       '\\r' => "\r",
+       '\\n' => "\n",
+       '\\\\' => '\\'
+     }
+
+     def unescape_column(col)
+       charmap = UNESCAPE_MAP
+       col[1...-1].gsub(/\\./) {|s| charmap[s] }
+     end
+   end
+ end
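
Because UNLOAD with ADDQUOTES ESCAPE backslash-escapes quotes, commas, and newlines inside values, each physical line is exactly one record; parse_row scans the quoted columns and unescape_column reverses the escaping. A small sketch of that round trip, with a hand-written line in UNLOAD's escaped form (note the reader relies on StringScanner from strscan):

require 'strscan'
require 'stringio'
require 'redshift-connector/reader/redshift_csv'

# One UNLOAD-style record: an escaped apostrophe, comma, and newline in the second value.
line = %q{"1","It\'s,\na pen"} + "\n"

RedshiftConnector::Reader::RedshiftCSV.new(StringIO.new(line)).each do |row|
  p row   # => ["1", "It's,\na pen"]
end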

data/lib/redshift-connector/reader/tsv.rb
@@ -0,0 +1,24 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+ require 'csv'
+
+ module RedshiftConnector
+   # Parses TSV (Tab Separated Format) files.
+   class Reader::TSV < Reader::Abstract
+     declare_reader :tsv
+
+     def self.data_object?(obj)
+       /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     def initialize(f)
+       @f = f
+     end
+
+     def each(&block)
+       @f.each_line do |line|
+         yield line.chomp.split("\t", -1)
+       end
+     end
+   end
+ end

data/lib/redshift-connector/s3_bucket.rb
@@ -0,0 +1,72 @@
+ require 'aws-sdk'
+
+ module RedshiftConnector
+   class S3Bucket
+     @buckets = {}
+     @default = nil
+
+     def S3Bucket.add(name, default: false, **params)
+       instance = new(**params)
+       @buckets[name.to_s] = instance
+       if !@default or default
+         @default = instance
+       end
+     end
+
+     def S3Bucket.default
+       @default or raise ArgumentError, "no default S3 bucket configured"
+     end
+
+     def S3Bucket.get(name)
+       @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
+     end
+
+     def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
+       @name = bucket
+       @prefix = prefix
+       @access_key_id = access_key_id
+       @secret_access_key = secret_access_key
+       @iam_role = iam_role
+     end
+
+     attr_reader :name
+     attr_reader :prefix
+
+     def url
+       "s3://#{@bucket.name}/#{@prefix}/"
+     end
+
+     def client
+       @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
+     end
+
+     def bucket
+       @bucket ||= begin
+         resource = Aws::S3::Resource.new(client: client)
+         resource.bucket(@name)
+       end
+     end
+
+     def object(key)
+       bucket.object(key)
+     end
+
+     def objects(prefix:)
+       bucket.objects(prefix: prefix)
+     end
+
+     def delete_objects(keys)
+       bucket.delete_objects(delete: {objects: keys.map {|k| {key: k} }})
+     end
+
+     def credential_string
+       if @iam_role
+         "aws_iam_role=#{@iam_role}"
+       elsif @access_key_id
+         "aws_access_key_id=#{@access_key_id};aws_secret_access_key=#{@secret_access_key}"
+       else
+         raise ArgumentError, "no credential given for Redshift S3 access"
+       end
+     end
+   end
+ end
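
S3Bucket keeps a class-level registry of named bucket configurations; the first entry added (or the one passed default: true) becomes the default used elsewhere in the gem. A hypothetical setup, e.g. in a Rails initializer, with placeholder bucket name and IAM role:

require 'redshift-connector/s3_bucket'

RedshiftConnector::S3Bucket.add(
  'primary',
  default: true,
  bucket: 'my-redshift-exports',
  prefix: 'connector',
  iam_role: 'arn:aws:iam::123456789012:role/redshift-unload'
)

RedshiftConnector::S3Bucket.default.credential_string
# => "aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload"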

data/lib/redshift-connector/s3_data_file.rb
@@ -0,0 +1,34 @@
+ require 'zlib'
+
+ module RedshiftConnector
+   class S3DataFile
+     def initialize(object, reader_class:)
+       @object = object
+       @reader_class = reader_class
+     end
+
+     def key
+       @object.key
+     end
+
+     def each_row(&block)
+       response = @object.get
+       f = if gzipped_object?
+             Zlib::GzipReader.new(response.body)
+           else
+             response.body
+           end
+       @reader_class.new(f).each(&block)
+     ensure
+       response.body.close if response
+     end
+
+     def data_object?
+       @reader_class.data_object?(@object)
+     end
+
+     def gzipped_object?
+       File.extname(@object.key) == '.gz'
+     end
+   end
+ end

data/lib/redshift-connector/s3_data_file_bundle.rb
@@ -0,0 +1,101 @@
+ require 'redshift-connector/s3_bucket'
+ require 'redshift-connector/s3_data_file'
+ require 'redshift-connector/reader'
+ require 'redshift-connector/logger'
+ require 'aws-sdk'
+
+ module RedshiftConnector
+   class S3DataFileBundle
+     def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       real_prefix = "#{bucket.prefix}/#{prefix}"
+       new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
+     end
+
+     def self.for_table(bucket: S3Bucket.default, schema:, table:, txn_id:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       prefix = "#{bucket.prefix}/#{schema}_export/#{table}/#{txn_id}/#{table}.csv."
+       new(bucket, prefix, format: :redshift_csv, filter: filter, batch_size: batch_size, logger: logger)
+     end
+
+     def initialize(bucket, prefix, format: :csv, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       @bucket = bucket
+       @prefix = prefix
+       @format = format
+       @filter = filter || lambda {|*row| row }
+       @batch_size = batch_size
+       @logger = logger
+       @reader_class = Reader.get(format)
+     end
+
+     attr_reader :bucket
+     attr_reader :prefix
+
+     def url
+       "s3://#{@bucket.name}/#{@prefix}"
+     end
+
+     def credential_string
+       @bucket.credential_string
+     end
+
+     REPORT_SIZE = 10_0000
+
+     def each_batch(report: true)
+       @logger.info "reader: #{@reader_class}"
+       n = 0
+       reported = 0
+       do_each_batch(@batch_size) do |rows|
+         yield rows
+         n += rows.size
+         if n / REPORT_SIZE > reported
+           @logger.info "#{n} rows processed" if report
+           reported = n / REPORT_SIZE
+         end
+       end
+       @logger.info "total #{n} rows processed" if report
+     end
+
+     def do_each_batch(batch_size)
+       filter = @filter
+       buf = []
+       each_row do |row|
+         buf.push filter.(*row)
+         if buf.size == batch_size
+           yield buf
+           buf = []
+         end
+       end
+       yield buf unless buf.empty?
+     end
+     private :do_each_batch
+
+     def each_row(&block)
+       each_object do |obj|
+         obj.each_row(&block)
+       end
+     end
+
+     alias each each_row
+
+     def each_object(&block)
+       all_data_objects.each do |obj|
+         @logger.info "processing s3 object: #{obj.key}"
+         yield obj
+       end
+     end
+
+     def all_data_objects
+       @bucket.objects(prefix: @prefix)
+         .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
+         .select {|obj| obj.data_object? }
+     end
+
+     def clear
+       pref = File.dirname(@prefix) + '/'
+       keys = @bucket.objects(prefix: pref).map(&:key)
+       unless keys.empty?
+         @logger.info "DELETE #{pref}*"
+         @bucket.delete_objects(keys)
+       end
+     end
+   end
+ end
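
With a default bucket registered (see the S3Bucket sketch above), a bundle streams every matching data file under a prefix and hands parsed rows to the caller in batches. A rough usage sketch with a made-up prefix:

require 'logger'
require 'redshift-connector/s3_data_file_bundle'

RedshiftConnector.logger = Logger.new($stdout)   # outside Rails, set a logger explicitly

bundle = RedshiftConnector::S3DataFileBundle.for_prefix(
  prefix: 'tmp/item_pvs_export',   # placeholder; appended to the bucket's own prefix
  format: :redshift_csv,
  batch_size: 1000
)

bundle.each_batch do |rows|
  # rows is an Array of row Arrays (String values), at most batch_size long
  puts "got #{rows.size} rows"
end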