redshift-connector 4.3.0
- checksums.yaml +7 -0
- data/README.md +10 -0
- data/lib/redshift-connector.rb +31 -0
- data/lib/redshift-connector/connector.rb +146 -0
- data/lib/redshift-connector/exporter.rb +116 -0
- data/lib/redshift-connector/importer.rb +89 -0
- data/lib/redshift-connector/importer/activerecord-import.rb +2 -0
- data/lib/redshift-connector/importer/insert_delta.rb +32 -0
- data/lib/redshift-connector/importer/rebuild_rename.rb +41 -0
- data/lib/redshift-connector/importer/rebuild_truncate.rb +31 -0
- data/lib/redshift-connector/importer/upsert.rb +25 -0
- data/lib/redshift-connector/logger.rb +20 -0
- data/lib/redshift-connector/query.rb +93 -0
- data/lib/redshift-connector/reader.rb +18 -0
- data/lib/redshift-connector/reader/abstract.rb +18 -0
- data/lib/redshift-connector/reader/csv.rb +24 -0
- data/lib/redshift-connector/reader/exception.rb +3 -0
- data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
- data/lib/redshift-connector/reader/tsv.rb +24 -0
- data/lib/redshift-connector/s3_bucket.rb +72 -0
- data/lib/redshift-connector/s3_data_file.rb +34 -0
- data/lib/redshift-connector/s3_data_file_bundle.rb +101 -0
- data/lib/redshift-connector/version.rb +3 -0
- data/test/all.rb +3 -0
- data/test/config.rb +13 -0
- data/test/config.rb.example +18 -0
- data/test/database.yml +15 -0
- data/test/database.yml.example +15 -0
- data/test/foreach.rb +5 -0
- data/test/helper.rb +25 -0
- data/test/item_pvs.ct.mysql +11 -0
- data/test/item_pvs.ct.redshift +9 -0
- data/test/reader/test_redshift_csv.rb +30 -0
- data/test/test_connector.rb +148 -0
- data/test/test_reader.rb +10 -0
- data/test/test_s3_import.rb +32 -0
- metadata +190 -0

data/lib/redshift-connector/importer/rebuild_truncate.rb
@@ -0,0 +1,31 @@
+require 'redshift-connector/importer/activerecord-import'
+require 'redshift-connector/logger'
+
+module RedshiftConnector
+  class Importer::RebuildTruncate
+    def initialize(dao:, bundle:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @bundle = bundle
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute
+      truncate_table(@dao.table_name)
+      import
+    end
+
+    def truncate_table(table_name)
+      @logger.info "TRUNCATE #{table_name}"
+      @dao.connection.execute("truncate #{table_name}")
+      @logger.info "truncated."
+    end
+
+    def import
+      @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      @bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end

data/lib/redshift-connector/importer/upsert.rb
@@ -0,0 +1,25 @@
+require 'redshift-connector/importer/activerecord-import'
+require 'redshift-connector/logger'
+
+module RedshiftConnector
+  class Importer::Upsert
+    def initialize(dao:, bundle:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @bundle = bundle
+      @columns = columns
+      @upsert_columns = upsert_columns
+      @logger = logger
+    end
+
+    def execute
+      import
+    end
+
+    def import
+      @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+      @bundle.each_batch do |rows|
+        @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+      end
+    end
+  end
+end
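
For orientation, here is a rough usage sketch of the upsert importer; it is not part of the gem's files. The `Item` model, the column names, and the export prefix are placeholders, and it assumes the gem's top-level require loads the importer and S3 classes. The code above only requires that `dao` responds to `import` (activerecord-import) and `table_name`, and that `bundle` responds to `each_batch` and `url`.

    require 'redshift-connector'   # assumed to pull in importers, readers, and S3 helpers

    bundle = RedshiftConnector::S3DataFileBundle.for_prefix(
      prefix: 'item_pvs/20170301',        # placeholder export prefix
      format: :redshift_csv
    )
    RedshiftConnector::Importer::Upsert.new(
      dao: Item,                          # hypothetical ActiveRecord model with activerecord-import
      bundle: bundle,
      columns: %w[id name pv_count],
      upsert_columns: %w[name pv_count]
    ).execute                             # passes on_duplicate_key_update to Item.import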

data/lib/redshift-connector/logger.rb
@@ -0,0 +1,20 @@
+module RedshiftConnector
+  @logger = nil
+
+  def RedshiftConnector.logger
+    # Defer to access Rails
+    @logger || Rails.logger
+  end
+
+  def RedshiftConnector.logger=(logger)
+    @logger = logger
+  end
+
+  class NullLogger
+    def noop(*args) end
+    alias error noop
+    alias warn noop
+    alias info noop
+    alias debug noop
+  end
+end
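
Outside of Rails, `Rails.logger` is not defined, so a logger should be assigned explicitly (or logging silenced with the NullLogger above) before using the connector. A minimal sketch:

    require 'logger'
    require 'redshift-connector/logger'

    # Send connector progress messages to stderr...
    RedshiftConnector.logger = Logger.new($stderr)

    # ...or drop them entirely.
    RedshiftConnector.logger = RedshiftConnector::NullLogger.new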

data/lib/redshift-connector/query.rb
@@ -0,0 +1,93 @@
+module RedshiftConnector
+  class DeltaQuery
+    def initialize(schema:, table:, columns:, condition: nil)
+      @schema = schema
+      @table = table
+      @columns = columns
+      @condition = condition
+    end
+
+    def table_spec
+      "#{@schema}.#{@table}"
+    end
+
+    def description
+      "#{table_spec} (#{@columns.join(', ')}) where (#{@condition})"
+    end
+
+    def to_sql
+      "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+        + " from #{table_spec}" \
+        + (@condition ? " where #{@condition}" : '')
+    end
+  end
+
+  class SelectAllQuery
+    def initialize(schema:, table:, columns:)
+      @schema = schema
+      @table = table
+      @columns = columns
+    end
+
+    def table_spec
+      "#{@schema}.#{@table}"
+    end
+
+    def description
+      "#{table_spec} (#{@columns.join(', ')})"
+    end
+
+    def to_sql
+      "select #{@columns.map {|c| %Q("#{c}") }.join(', ')}" \
+        + " from #{table_spec}"
+    end
+  end
+
+  class UnloadQuery
+    def UnloadQuery.wrap(query:, bundle:)
+      new(query: ArbitraryQuery.new(query), bundle: bundle)
+    end
+
+    def initialize(query:, bundle:)
+      @query = query
+      @bundle = bundle
+    end
+
+    def table_spec
+      @query.table_spec
+    end
+
+    def description
+      @query.description
+    end
+
+    def to_sql
+      <<-EndSQL.gsub(/^\s+/, '')
+        unload ('#{escape_query(@query.to_sql)}')
+        to '#{@bundle.url}'
+        credentials '#{@bundle.credential_string}'
+        gzip
+        allowoverwrite
+        delimiter ',' escape addquotes
+      EndSQL
+    end
+
+    def escape_query(query)
+      query.gsub("'", "\\\\'")
+    end
+  end
+
+  class ArbitraryQuery
+    def initialize(query)
+      @query = query
+    end
+
+    def description
+      @query
+    end
+
+    def to_sql
+      @query
+    end
+  end
+end
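
To see how these classes compose, the sketch below builds an UNLOAD statement from a DeltaQuery. The schema, table, condition, S3 URL, and IAM role are made-up values, and the bundle stand-in only needs to respond to `url` and `credential_string`, as S3DataFileBundle does.

    require 'redshift-connector/query'

    # Stand-in for an S3DataFileBundle (placeholder URL and role ARN).
    bundle = Struct.new(:url, :credential_string).new(
      's3://example-bucket/redshift-export/item_pvs/20170301/item_pvs.csv.',
      'aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload'
    )

    query = RedshiftConnector::DeltaQuery.new(
      schema: 'public', table: 'item_pvs',
      columns: %w[id name pv_count],
      condition: "updated_at > '2017-03-01'"
    )
    puts RedshiftConnector::UnloadQuery.new(query: query, bundle: bundle).to_sql
    # unload ('select "id", "name", "pv_count" from public.item_pvs where updated_at > \'2017-03-01\'')
    # to 's3://example-bucket/redshift-export/item_pvs/20170301/item_pvs.csv.'
    # credentials 'aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload'
    # gzip
    # allowoverwrite
    # delimiter ',' escape addquotes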

data/lib/redshift-connector/reader.rb
@@ -0,0 +1,18 @@
+# create module
+module RedshiftConnector
+  module Reader
+  end
+end
+
+require 'redshift-connector/reader/redshift_csv'
+require 'redshift-connector/reader/csv'
+require 'redshift-connector/reader/tsv'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  module Reader
+    def Reader.get(id)
+      Abstract.get_reader_class(id)
+    end
+  end
+end

data/lib/redshift-connector/reader/abstract.rb
@@ -0,0 +1,18 @@
+module RedshiftConnector
+  class Reader::Abstract
+    READER_CLASSES = {} # {Symbol => Class}
+
+    def self.declare_reader(id)
+      READER_CLASSES[id.to_sym] = self
+    end
+
+    def self.get_reader_class(id)
+      READER_CLASSES[id.to_sym] or
+        raise ArgumentError, "unknown data file reader type: #{id.inspect}"
+    end
+  end
+
+  def self.get_reader_class(id)
+    Reader::Abstract.get_reader_class(id)
+  end
+end
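
In short, `declare_reader` and `get_reader_class` form a small registry keyed by format symbol: each concrete reader registers itself when its file is loaded, and `Reader.get` (defined in reader.rb above) resolves the symbol back to a class. For example:

    require 'redshift-connector/reader'

    RedshiftConnector::Reader.get(:redshift_csv)   # => RedshiftConnector::Reader::RedshiftCSV
    RedshiftConnector::Reader.get(:tsv)            # => RedshiftConnector::Reader::TSV
    RedshiftConnector::Reader.get(:json)           # raises ArgumentError (no reader declared :json)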

data/lib/redshift-connector/reader/csv.rb
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses (standard) CSV files.
+  # For UNLOAD-generated CSV, use RedshiftCSV class.
+  class Reader::CSV < Reader::Abstract
+    declare_reader :csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      csv = CSV.new(@f)
+      csv.each(&block)
+    end
+  end
+end

data/lib/redshift-connector/reader/redshift_csv.rb
@@ -0,0 +1,54 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
+  # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
+  class Reader::RedshiftCSV < Reader::Abstract
+    declare_reader :redshift_csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    # f :: IO
+    def initialize(f)
+      @f = f
+    end
+
+    def each
+      # We can use simple #each_line to read single row
+      # because line terminators are always escaped by UNLOAD.
+      @f.each_line do |line|
+        yield parse_row(line, @f.lineno)
+      end
+    end
+
+    def parse_row(line, lineno = nil)
+      row = []
+      s = StringScanner.new(line)
+      s.skip(/\s+/)
+      until s.eos?
+        col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
+        row.push unescape_column(col)
+        s.skip(/\s*/) # skip line terminator on line ends
+        s.skip(/,\s*/)
+      end
+      row
+    end
+
+    UNESCAPE_MAP = {
+      '\\"' => '"',
+      "\\'" => "'",
+      '\\,' => ',',
+      '\\r' => "\r",
+      '\\n' => "\n",
+      '\\\\' => '\\'
+    }
+
+    def unescape_column(col)
+      charmap = UNESCAPE_MAP
+      col[1...-1].gsub(/\\./) {|s| charmap[s] }
+    end
+  end
+end
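
A small round-trip illustration of the escaping rules handled above. StringScanner comes from the strscan stdlib, which this file assumes is already loaded (a standalone script should require it itself). The sample data is made up but mimics two UNLOAD-generated lines containing an escaped newline, an escaped quote, and a literal comma inside quotes:

    require 'strscan'    # StringScanner, assumed to be available to redshift_csv.rb
    require 'stringio'
    require 'redshift-connector/reader'

    data = StringIO.new(<<~DATA)
      "1","Alice","note with a \\n newline"
      "2","Bob","a, quoted \\"comma\\""
    DATA

    RedshiftConnector::Reader::RedshiftCSV.new(data).each {|row| p row }
    # ["1", "Alice", "note with a \n newline"]
    # ["2", "Bob", "a, quoted \"comma\""]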

data/lib/redshift-connector/reader/tsv.rb
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses TSV (Tab Separated Format) files.
+  class Reader::TSV < Reader::Abstract
+    declare_reader :tsv
+
+    def self.data_object?(obj)
+      /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      @f.each_line do |line|
+        yield line.chomp.split("\t", -1)
+      end
+    end
+  end
+end

data/lib/redshift-connector/s3_bucket.rb
@@ -0,0 +1,72 @@
+require 'aws-sdk'
+
+module RedshiftConnector
+  class S3Bucket
+    @buckets = {}
+    @default = nil
+
+    def S3Bucket.add(name, default: false, **params)
+      instance = new(**params)
+      @buckets[name.to_s] = instance
+      if !@default or default
+        @default = instance
+      end
+    end
+
+    def S3Bucket.default
+      @default or raise ArgumentError, "no default S3 bucket configured"
+    end
+
+    def S3Bucket.get(name)
+      @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
+    end
+
+    def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
+      @name = bucket
+      @prefix = prefix
+      @access_key_id = access_key_id
+      @secret_access_key = secret_access_key
+      @iam_role = iam_role
+    end
+
+    attr_reader :name
+    attr_reader :prefix
+
+    def url
+      "s3://#{@bucket.name}/#{@prefix}/"
+    end
+
+    def client
+      @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
+    end
+
+    def bucket
+      @bucket ||= begin
+        resource = Aws::S3::Resource.new(client: client)
+        resource.bucket(@name)
+      end
+    end
+
+    def object(key)
+      bucket.object(key)
+    end
+
+    def objects(prefix:)
+      bucket.objects(prefix: prefix)
+    end
+
+    def delete_objects(keys)
+      bucket.delete_objects(delete: {objects: keys.map {|k| {key: k} }})
+    end
+
+    def credential_string
+      if @iam_role
+        "aws_iam_role=#{@iam_role}"
+      elsif @access_key_id
+        "aws_access_key_id=#{@access_key_id};aws_secret_access_key=#{@secret_access_key}"
+      else
+        raise ArgumentError, "no credential given for Redshift S3 access"
+      end
+    end
+  end
+end
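
Bucket registration happens once at application setup, and `credential_string` is what ends up in the UNLOAD statement's credentials clause. A configuration sketch with placeholder bucket name, prefix, and role ARN (the aws-sdk gem must be installed, since s3_bucket.rb requires it):

    require 'redshift-connector/s3_bucket'

    RedshiftConnector::S3Bucket.add(
      'primary',
      default: true,
      bucket: 'example-bucket',
      prefix: 'redshift-export',
      iam_role: 'arn:aws:iam::123456789012:role/redshift-unload'
    )

    RedshiftConnector::S3Bucket.default.credential_string
    # => "aws_iam_role=arn:aws:iam::123456789012:role/redshift-unload"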

data/lib/redshift-connector/s3_data_file.rb
@@ -0,0 +1,34 @@
+require 'zlib'
+
+module RedshiftConnector
+  class S3DataFile
+    def initialize(object, reader_class:)
+      @object = object
+      @reader_class = reader_class
+    end
+
+    def key
+      @object.key
+    end
+
+    def each_row(&block)
+      response = @object.get
+      f = if gzipped_object?
+            Zlib::GzipReader.new(response.body)
+          else
+            response.body
+          end
+      @reader_class.new(f).each(&block)
+    ensure
+      response.body.close if response
+    end
+
+    def data_object?
+      @reader_class.data_object?(@object)
+    end
+
+    def gzipped_object?
+      File.extname(@object.key) == '.gz'
+    end
+  end
+end
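
Both checks here are driven purely by the object key: the reader class decides whether a key names a data file, and a trailing ".gz" switches decompression on. A quick illustration with a stand-in for the S3 object (only `#key` is needed) and a made-up UNLOAD part-file name:

    require 'redshift-connector/reader'
    require 'redshift-connector/s3_data_file'

    FakeS3Object = Struct.new(:key)   # stand-in; real code passes objects from the S3 bucket listing

    file = RedshiftConnector::S3DataFile.new(
      FakeS3Object.new('redshift-export/item_pvs/0001/item_pvs.csv.0000_part_00.gz'),
      reader_class: RedshiftConnector::Reader::RedshiftCSV
    )
    file.data_object?      # truthy -- the basename matches /\.csv(?:\.|\z)/
    file.gzipped_object?   # => true -- File.extname is ".gz"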

data/lib/redshift-connector/s3_data_file_bundle.rb
@@ -0,0 +1,101 @@
+require 'redshift-connector/s3_bucket'
+require 'redshift-connector/s3_data_file'
+require 'redshift-connector/reader'
+require 'redshift-connector/logger'
+require 'aws-sdk'
+
+module RedshiftConnector
+  class S3DataFileBundle
+    def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+      real_prefix = "#{bucket.prefix}/#{prefix}"
+      new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
+    end
+
+    def self.for_table(bucket: S3Bucket.default, schema:, table:, txn_id:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+      prefix = "#{bucket.prefix}/#{schema}_export/#{table}/#{txn_id}/#{table}.csv."
+      new(bucket, prefix, format: :redshift_csv, filter: filter, batch_size: batch_size, logger: logger)
+    end
+
+    def initialize(bucket, prefix, format: :csv, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
+      @bucket = bucket
+      @prefix = prefix
+      @format = format
+      @filter = filter || lambda {|*row| row }
+      @batch_size = batch_size
+      @logger = logger
+      @reader_class = Reader.get(format)
+    end
+
+    attr_reader :bucket
+    attr_reader :prefix
+
+    def url
+      "s3://#{@bucket.name}/#{@prefix}"
+    end
+
+    def credential_string
+      @bucket.credential_string
+    end
+
+    REPORT_SIZE = 10_0000
+
+    def each_batch(report: true)
+      @logger.info "reader: #{@reader_class}"
+      n = 0
+      reported = 0
+      do_each_batch(@batch_size) do |rows|
+        yield rows
+        n += rows.size
+        if n / REPORT_SIZE > reported
+          @logger.info "#{n} rows processed" if report
+          reported = n / REPORT_SIZE
+        end
+      end
+      @logger.info "total #{n} rows processed" if report
+    end
+
+    def do_each_batch(batch_size)
+      filter = @filter
+      buf = []
+      each_row do |row|
+        buf.push filter.(*row)
+        if buf.size == batch_size
+          yield buf
+          buf = []
+        end
+      end
+      yield buf unless buf.empty?
+    end
+    private :do_each_batch
+
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
+      @bucket.objects(prefix: @prefix)
+        .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
+        .select {|obj| obj.data_object? }
+    end
+
+    def clear
+      pref = File.dirname(@prefix) + '/'
+      keys = @bucket.objects(prefix: pref).map(&:key)
+      unless keys.empty?
+        @logger.info "DELETE #{pref}*"
+        @bucket.delete_objects(keys)
+      end
+    end
+  end
+end
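
End to end, a bundle enumerates the data objects under its prefix, streams each one through the configured reader (gunzipping when needed), applies the optional filter, and yields rows in batches of batch_size. A usage sketch, assuming an S3Bucket has been registered as shown earlier; the schema, table, txn_id, and the `ItemPv` model are placeholders:

    require 'redshift-connector'

    bundle = RedshiftConnector::S3DataFileBundle.for_table(
      schema: 'public',
      table: 'item_pvs',
      txn_id: '0001',
      batch_size: 1000
    )

    bundle.each_batch do |rows|
      # rows is an Array of up to 1000 parsed rows (Arrays of Strings)
      ItemPv.import(%w[id name pv_count], rows)   # hypothetical activerecord-import model
    end

    bundle.clear   # delete the exported part files once the import has succeeded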