redshift-connector 4.3.0

Files changed (37)
  1. checksums.yaml +7 -0
  2. data/README.md +10 -0
  3. data/lib/redshift-connector.rb +31 -0
  4. data/lib/redshift-connector/connector.rb +146 -0
  5. data/lib/redshift-connector/exporter.rb +116 -0
  6. data/lib/redshift-connector/importer.rb +89 -0
  7. data/lib/redshift-connector/importer/activerecord-import.rb +2 -0
  8. data/lib/redshift-connector/importer/insert_delta.rb +32 -0
  9. data/lib/redshift-connector/importer/rebuild_rename.rb +41 -0
  10. data/lib/redshift-connector/importer/rebuild_truncate.rb +31 -0
  11. data/lib/redshift-connector/importer/upsert.rb +25 -0
  12. data/lib/redshift-connector/logger.rb +20 -0
  13. data/lib/redshift-connector/query.rb +93 -0
  14. data/lib/redshift-connector/reader.rb +18 -0
  15. data/lib/redshift-connector/reader/abstract.rb +18 -0
  16. data/lib/redshift-connector/reader/csv.rb +24 -0
  17. data/lib/redshift-connector/reader/exception.rb +3 -0
  18. data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
  19. data/lib/redshift-connector/reader/tsv.rb +24 -0
  20. data/lib/redshift-connector/s3_bucket.rb +72 -0
  21. data/lib/redshift-connector/s3_data_file.rb +34 -0
  22. data/lib/redshift-connector/s3_data_file_bundle.rb +101 -0
  23. data/lib/redshift-connector/version.rb +3 -0
  24. data/test/all.rb +3 -0
  25. data/test/config.rb +13 -0
  26. data/test/config.rb.example +18 -0
  27. data/test/database.yml +15 -0
  28. data/test/database.yml.example +15 -0
  29. data/test/foreach.rb +5 -0
  30. data/test/helper.rb +25 -0
  31. data/test/item_pvs.ct.mysql +11 -0
  32. data/test/item_pvs.ct.redshift +9 -0
  33. data/test/reader/test_redshift_csv.rb +30 -0
  34. data/test/test_connector.rb +148 -0
  35. data/test/test_reader.rb +10 -0
  36. data/test/test_s3_import.rb +32 -0
  37. metadata +190 -0
data/lib/redshift-connector/importer/rebuild_truncate.rb
@@ -0,0 +1,31 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::RebuildTruncate
+     def initialize(dao:, bundle:, columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @logger = logger
+     end
+
+     def execute
+       truncate_table(@dao.table_name)
+       import
+     end
+
+     def truncate_table(table_name)
+       @logger.info "TRUNCATE #{table_name}"
+       @dao.connection.execute("truncate #{table_name}")
+       @logger.info "truncated."
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows)
+       end
+     end
+   end
+ end
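A minimal usage sketch for this truncate-and-rebuild importer. The model name and columns are hypothetical; `bundle` stands for an S3DataFileBundle (defined later in this changeset) pointing at files already exported to S3.

importer = RedshiftConnector::Importer::RebuildTruncate.new(
  dao: Item,                     # hypothetical ActiveRecord model with activerecord-import
  bundle: bundle,                # S3DataFileBundle for the exported data files
  columns: %w[id name pv_count]  # column order must match the exported rows
)
importer.execute                 # truncates the table, then bulk-inserts all rows in batches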
data/lib/redshift-connector/importer/upsert.rb
@@ -0,0 +1,25 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::Upsert
+     def initialize(dao:, bundle:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @upsert_columns = upsert_columns
+       @logger = logger
+     end
+
+     def execute
+       import
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+       end
+     end
+   end
+ end
data/lib/redshift-connector/logger.rb
@@ -0,0 +1,20 @@
+ module RedshiftConnector
+   @logger = nil
+
+   def RedshiftConnector.logger
+     # Defer to access Rails
+     @logger || Rails.logger
+   end
+
+   def RedshiftConnector.logger=(logger)
+     @logger = logger
+   end
+
+   class NullLogger
+     def noop(*args) end
+     alias error noop
+     alias warn noop
+     alias info noop
+     alias debug noop
+   end
+ end
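Because the reader falls back to Rails.logger, scripts running outside Rails should set an explicit logger before using any connector. A short sketch:

require 'logger'
RedshiftConnector.logger = Logger.new($stderr)                 # verbose logging
# RedshiftConnector.logger = RedshiftConnector::NullLogger.new # or discard all log output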
data/lib/redshift-connector/query.rb
@@ -0,0 +1,93 @@
+ module RedshiftConnector
+   class DeltaQuery
+     def initialize(schema:, table:, columns:, condition: nil)
+       @schema = schema
+       @table = table
+       @columns = columns
+       @condition = condition
+     end
+
+     def table_spec
+       "#{@schema}.#{@table}"
+     end
+
+     def description
+       "#{table_spec} (#{@columns.join(', ')}) where (#{@condition})"
+     end
+
+     def to_sql
+       "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+         + " from #{table_spec}" \
+         + (@condition ? " where #{@condition}" : '')
+     end
+   end
+
+   class SelectAllQuery
+     def initialize(schema:, table:, columns:)
+       @schema = schema
+       @table = table
+       @columns = columns
+     end
+
+     def table_spec
+       "#{@schema}.#{@table}"
+     end
+
+     def description
+       "#{table_spec} (#{@columns.join(', ')})"
+     end
+
+     def to_sql
+       "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+         + " from #{table_spec}"
+     end
+   end
+
+   class UnloadQuery
+     def UnloadQuery.wrap(query:, bundle:)
+       new(query: ArbitraryQuery.new(query), bundle: bundle)
+     end
+
+     def initialize(query:, bundle:)
+       @query = query
+       @bundle = bundle
+     end
+
+     def table_spec
+       @query.table_spec
+     end
+
+     def description
+       @query.description
+     end
+
+     def to_sql
+       <<-EndSQL.gsub(/^\s+/, '')
+         unload ('#{escape_query(@query.to_sql)}')
+         to '#{@bundle.url}'
+         credentials '#{@bundle.credential_string}'
+         gzip
+         allowoverwrite
+         delimiter ',' escape addquotes
+       EndSQL
+     end
+
+     def escape_query(query)
+       query.gsub("'", "\\\\'")
+     end
+   end
+
+   class ArbitraryQuery
+     def initialize(query)
+       @query = query
+     end
+
+     def description
+       @query
+     end
+
+     def to_sql
+       @query
+     end
+   end
+ end
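A sketch of what these query builders emit; schema, table, columns and the condition are made-up, and the bundle here is a stand-in that only provides the url and credential_string that UnloadQuery#to_sql interpolates.

delta = RedshiftConnector::DeltaQuery.new(
  schema: 'public', table: 'item_pvs',
  columns: %w[id item_id pv_count],
  condition: "updated_at > '2016-01-01'"
)
delta.to_sql
# => select "id", "item_id", "pv_count" from public.item_pvs where updated_at > '2016-01-01'

fake_bundle = Struct.new(:url, :credential_string)
  .new('s3://my-bucket/tmp/item_pvs.csv.', 'aws_iam_role=arn:aws:iam::123456789012:role/unload')
puts RedshiftConnector::UnloadQuery.new(query: delta, bundle: fake_bundle).to_sql
# emits the UNLOAD statement, one clause per line, with single quotes in the inner SQL escaped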
data/lib/redshift-connector/reader.rb
@@ -0,0 +1,18 @@
+ # create module
+ module RedshiftConnector
+   module Reader
+   end
+ end
+
+ require 'redshift-connector/reader/redshift_csv'
+ require 'redshift-connector/reader/csv'
+ require 'redshift-connector/reader/tsv'
+ require 'redshift-connector/reader/exception'
+
+ module RedshiftConnector
+   module Reader
+     def Reader.get(id)
+       Abstract.get_reader_class(id)
+     end
+   end
+ end
data/lib/redshift-connector/reader/abstract.rb
@@ -0,0 +1,18 @@
+ module RedshiftConnector
+   class Reader::Abstract
+     READER_CLASSES = {}   # {Symbol => Class}
+
+     def self.declare_reader(id)
+       READER_CLASSES[id.to_sym] = self
+     end
+
+     def self.get_reader_class(id)
+       READER_CLASSES[id.to_sym] or
+         raise ArgumentError, "unknown data file reader type: #{id.inspect}"
+     end
+   end
+
+   def self.get_reader_class(id)
+     Reader::Abstract.get_reader_class(id)
+   end
+ end
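Each concrete reader registers itself under a symbolic id via declare_reader, and Reader.get resolves that id back to the class. A quick illustration, assuming require 'redshift-connector/reader' has loaded the concrete readers shown below:

RedshiftConnector::Reader.get(:redshift_csv)  # => RedshiftConnector::Reader::RedshiftCSV
RedshiftConnector::Reader.get(:tsv)           # => RedshiftConnector::Reader::TSV
RedshiftConnector::Reader.get(:json)          # raises ArgumentError: unknown data file reader type: :json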
data/lib/redshift-connector/reader/csv.rb
@@ -0,0 +1,24 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+ require 'csv'
+
+ module RedshiftConnector
+   # Parses (standard) CSV files.
+   # For UNLOAD-generated CSV, use RedshiftCSV class.
+   class Reader::CSV < Reader::Abstract
+     declare_reader :csv
+
+     def self.data_object?(obj)
+       /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     def initialize(f)
+       @f = f
+     end
+
+     def each(&block)
+       csv = CSV.new(@f)
+       csv.each(&block)
+     end
+   end
+ end
data/lib/redshift-connector/reader/exception.rb
@@ -0,0 +1,3 @@
+ module RedshiftConnector
+   class Reader::MalformedCSVException < StandardError; end
+ end
data/lib/redshift-connector/reader/redshift_csv.rb
@@ -0,0 +1,54 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+
+ module RedshiftConnector
+   # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
+   # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
+   class Reader::RedshiftCSV < Reader::Abstract
+     declare_reader :redshift_csv
+
+     def self.data_object?(obj)
+       /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     # f :: IO
+     def initialize(f)
+       @f = f
+     end
+
+     def each
+       # We can use simple #each_line to read single row
+       # because line terminators are always escaped by UNLOAD.
+       @f.each_line do |line|
+         yield parse_row(line, @f.lineno)
+       end
+     end
+
+     def parse_row(line, lineno = nil)
+       row = []
+       s = StringScanner.new(line)
+       s.skip(/\s+/)
+       until s.eos?
+         col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
+         row.push unescape_column(col)
+         s.skip(/\s*/)   # skip line terminator on line ends
+         s.skip(/,\s*/)
+       end
+       row
+     end
+
+     UNESCAPE_MAP = {
+       '\\"' => '"',
+       "\\'" => "'",
+       '\\,' => ',',
+       '\\r' => "\r",
+       '\\n' => "\n",
+       '\\\\' => '\\'
+     }
+
+     def unescape_column(col)
+       charmap = UNESCAPE_MAP
+       col[1...-1].gsub(/\\./) {|s| charmap[s] }
+     end
+   end
+ end
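A small sketch of feeding one UNLOAD-style line through this parser. The sample line is made up, and StringScanner comes from the strscan stdlib, which is assumed to be loaded by the caller or the surrounding gem:

require 'strscan'    # StringScanner, used by parse_row
require 'stringio'
require 'redshift-connector/reader/redshift_csv'

line = %q("1","foo\,bar","2016-01-01 00:00:00") + "\n"
reader = RedshiftConnector::Reader::RedshiftCSV.new(StringIO.new(line))
reader.each {|row| p row }
# => ["1", "foo,bar", "2016-01-01 00:00:00"]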
data/lib/redshift-connector/reader/tsv.rb
@@ -0,0 +1,24 @@
+ require 'redshift-connector/reader/abstract'
+ require 'redshift-connector/reader/exception'
+ require 'csv'
+
+ module RedshiftConnector
+   # Parses TSV (Tab Separated Format) files.
+   class Reader::TSV < Reader::Abstract
+     declare_reader :tsv
+
+     def self.data_object?(obj)
+       /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
+     end
+
+     def initialize(f)
+       @f = f
+     end
+
+     def each(&block)
+       @f.each_line do |line|
+         yield line.chomp.split("\t", -1)
+       end
+     end
+   end
+ end
data/lib/redshift-connector/s3_bucket.rb
@@ -0,0 +1,72 @@
+ require 'aws-sdk'
+
+ module RedshiftConnector
+   class S3Bucket
+     @buckets = {}
+     @default = nil
+
+     def S3Bucket.add(name, default: false, **params)
+       instance = new(**params)
+       @buckets[name.to_s] = instance
+       if !@default or default
+         @default = instance
+       end
+     end
+
+     def S3Bucket.default
+       @default or raise ArgumentError, "no default S3 bucket configured"
+     end
+
+     def S3Bucket.get(name)
+       @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
+     end
+
+     def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
+       @name = bucket
+       @prefix = prefix
+       @access_key_id = access_key_id
+       @secret_access_key = secret_access_key
+       @iam_role = iam_role
+     end
+
+     attr_reader :name
+     attr_reader :prefix
+
+     def url
37
+ end
38
+
39
+ def client
40
+ @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
41
+ end
42
+
43
+ def bucket
44
+ @bucket ||= begin
45
+ resource = Aws::S3::Resource.new(client: client)
46
+ resource.bucket(@name)
47
+ end
48
+ end
49
+
50
+ def object(key)
51
+ bucket.object(key)
52
+ end
53
+
54
+ def objects(prefix:)
55
+ bucket.objects(prefix: prefix)
56
+ end
57
+
58
+ def delete_objects(keys)
59
+ bucket.delete_objects(delete: {objects: keys.map {|k| {key: k} }})
60
+ end
61
+
62
+ def credential_string
63
+ if @iam_role
64
+ "aws_iam_role=#{@iam_role}"
65
+ elsif @access_key_id
66
+ "aws_access_key_id=#{@access_key_id};aws_secret_access_key=#{@secret_access_key}"
67
+ else
68
+ raise ArgumentError, "no credential given for Redshift S3 access"
69
+ end
70
+ end
71
+ end
72
+ end
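A hedged configuration sketch using S3Bucket.add and the constructor keywords above; the bucket name, prefix and IAM role ARN are placeholders:

RedshiftConnector::S3Bucket.add(
  'primary',                                    # lookup key for S3Bucket.get('primary')
  default: true,                                # also becomes S3Bucket.default
  bucket: 'my-redshift-unload-bucket',          # hypothetical S3 bucket name
  prefix: 'tmp/redshift-connector',
  iam_role: 'arn:aws:iam::123456789012:role/redshift-unload'  # or access_key_id / secret_access_key
)
RedshiftConnector::S3Bucket.default.credential_string
# => "aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload"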
data/lib/redshift-connector/s3_data_file.rb
@@ -0,0 +1,34 @@
+ require 'zlib'
+
+ module RedshiftConnector
+   class S3DataFile
+     def initialize(object, reader_class:)
+       @object = object
+       @reader_class = reader_class
+     end
+
+     def key
+       @object.key
+     end
+
+     def each_row(&block)
+       response = @object.get
+       f = if gzipped_object?
+             Zlib::GzipReader.new(response.body)
+           else
+             response.body
+           end
+       @reader_class.new(f).each(&block)
+     ensure
+       response.body.close if response
+     end
+
+     def data_object?
+       @reader_class.data_object?(@object)
+     end
+
+     def gzipped_object?
+       File.extname(@object.key) == '.gz'
+     end
+   end
+ end
data/lib/redshift-connector/s3_data_file_bundle.rb
@@ -0,0 +1,101 @@
+ require 'redshift-connector/s3_bucket'
+ require 'redshift-connector/s3_data_file'
+ require 'redshift-connector/reader'
+ require 'redshift-connector/logger'
+ require 'aws-sdk'
+
+ module RedshiftConnector
+   class S3DataFileBundle
+     def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       real_prefix = "#{bucket.prefix}/#{prefix}"
+       new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
+     end
+
+     def self.for_table(bucket: S3Bucket.default, schema:, table:, txn_id:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       prefix = "#{bucket.prefix}/#{schema}_export/#{table}/#{txn_id}/#{table}.csv."
+       new(bucket, prefix, format: :redshift_csv, filter: filter, batch_size: batch_size, logger: logger)
+     end
+
+     def initialize(bucket, prefix, format: :csv, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+       @bucket = bucket
+       @prefix = prefix
+       @format = format
+       @filter = filter || lambda {|*row| row }
+       @batch_size = batch_size
+       @logger = logger
+       @reader_class = Reader.get(format)
+     end
+
+     attr_reader :bucket
+     attr_reader :prefix
+
+     def url
+       "s3://#{@bucket.name}/#{@prefix}"
+     end
+
+     def credential_string
+       @bucket.credential_string
+     end
+
+     REPORT_SIZE = 10_0000
+
+     def each_batch(report: true)
+       @logger.info "reader: #{@reader_class}"
+       n = 0
+       reported = 0
+       do_each_batch(@batch_size) do |rows|
+         yield rows
+         n += rows.size
+         if n / REPORT_SIZE > reported
+           @logger.info "#{n} rows processed" if report
+           reported = n / REPORT_SIZE
+         end
+       end
+       @logger.info "total #{n} rows processed" if report
+     end
+
+     def do_each_batch(batch_size)
+       filter = @filter
+       buf = []
+       each_row do |row|
+         buf.push filter.(*row)
+         if buf.size == batch_size
+           yield buf
+           buf = []
+         end
+       end
+       yield buf unless buf.empty?
+     end
+     private :do_each_batch
+
+     def each_row(&block)
+       each_object do |obj|
+         obj.each_row(&block)
+       end
+     end
+
+     alias each each_row
+
+     def each_object(&block)
+       all_data_objects.each do |obj|
+         @logger.info "processing s3 object: #{obj.key}"
+         yield obj
+       end
+     end
+
+     def all_data_objects
+       @bucket.objects(prefix: @prefix)
+         .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
+         .select {|obj| obj.data_object? }
+     end
+
+     def clear
+       pref = File.dirname(@prefix) + '/'
+       keys = @bucket.objects(prefix: pref).map(&:key)
+       unless keys.empty?
+         @logger.info "DELETE #{pref}*"
+         @bucket.delete_objects(keys)
+       end
+     end
+   end
+ end
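Putting the pieces together, a hedged end-to-end sketch: read previously unloaded files from S3 and upsert them into MySQL via activerecord-import. The prefix and model are hypothetical, S3Bucket.add is assumed to have been called as in the earlier sketch, and the UNLOAD step (see exporter.rb in the file list) is assumed to have already written gzipped, quoted CSV under the prefix.

bundle = RedshiftConnector::S3DataFileBundle.for_prefix(
  prefix: 'item_pvs/0001',     # hypothetical export location under the bucket prefix
  format: :redshift_csv,
  batch_size: 5000
)
RedshiftConnector::Importer::Upsert.new(
  dao: ItemPv,                          # hypothetical ActiveRecord model
  bundle: bundle,
  columns: %w[item_id date pv_count],
  upsert_columns: %w[pv_count]
).execute
bundle.clear                            # delete the staged S3 objects afterwards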