redshift_connector 8.0.0
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/Gemfile +7 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/RELEASE.md +89 -0
- data/Rakefile +3 -0
- data/lib/redshift_connector.rb +35 -0
- data/lib/redshift_connector/active_record_data_source.rb +23 -0
- data/lib/redshift_connector/active_record_exporter.rb +47 -0
- data/lib/redshift_connector/connector.rb +189 -0
- data/lib/redshift_connector/data_file.rb +32 -0
- data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
- data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
- data/lib/redshift_connector/exception.rb +5 -0
- data/lib/redshift_connector/exporter.rb +40 -0
- data/lib/redshift_connector/exporter_builder.rb +49 -0
- data/lib/redshift_connector/immediate_exporter.rb +19 -0
- data/lib/redshift_connector/importer.rb +58 -0
- data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
- data/lib/redshift_connector/importer/insert_delta.rb +31 -0
- data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
- data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
- data/lib/redshift_connector/importer/upsert.rb +24 -0
- data/lib/redshift_connector/logger.rb +20 -0
- data/lib/redshift_connector/query.rb +95 -0
- data/lib/redshift_connector/reader.rb +18 -0
- data/lib/redshift_connector/reader/abstract.rb +18 -0
- data/lib/redshift_connector/reader/csv.rb +24 -0
- data/lib/redshift_connector/reader/exception.rb +3 -0
- data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
- data/lib/redshift_connector/reader/tsv.rb +24 -0
- data/lib/redshift_connector/s3_bucket.rb +76 -0
- data/lib/redshift_connector/s3_data_file.rb +20 -0
- data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
- data/lib/redshift_connector/version.rb +3 -0
- data/redshift_connector.gemspec +27 -0
- metadata +190 -0
data/lib/redshift_connector/data_file.rb
@@ -0,0 +1,32 @@
+require 'redshift_connector/logger'
+require 'zlib'
+
+module RedshiftConnector
+  class DataFile
+    def initialize(reader_class:)
+      @reader_class = reader_class
+    end
+
+    def each_row(&block)
+      f = open
+      begin
+        if gzipped_object?
+          f = Zlib::GzipReader.new(f)
+        end
+        @reader_class.new(f).each(&block)
+      ensure
+        f.close
+      end
+    end
+
+    # abstract open
+
+    def data_object?
+      @reader_class.data_object?(key)
+    end
+
+    def gzipped_object?
+      File.extname(key) == '.gz'
+    end
+  end
+end
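To make the abstract open/key contract concrete, here is a minimal illustrative subclass. This is not the gem's S3DataFile (see s3_data_file.rb in the listing above); it is only a local-file stand-in showing what a subclass must provide.

# Hypothetical sketch: a DataFile backed by a local file. Subclasses supply
# #open and #key; #each_row and the gzip handling come from DataFile itself.
class LocalDataFile < RedshiftConnector::DataFile
  def initialize(path, reader_class:)
    super(reader_class: reader_class)
    @path = path
  end

  def key
    @path
  end

  def open
    File.open(@path, 'rb')
  end
end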
data/lib/redshift_connector/data_file_bundle_params.rb
@@ -0,0 +1,25 @@
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class DataFileBundleParams
+    def initialize(
+      bucket: nil,
+      schema:,
+      table:,
+      txn_id: nil,
+      logger: RedshiftConnector.logger
+    )
+      @bucket = bucket
+      @schema = schema
+      @table = table
+      @txn_id = txn_id
+      @logger = logger
+    end
+
+    attr_reader :bucket
+    attr_reader :schema
+    attr_reader :table
+    attr_reader :txn_id
+    attr_reader :logger
+  end
+end
data/lib/redshift_connector/data_file_bundle_reader.rb
@@ -0,0 +1,72 @@
+require 'redshift_connector/logger'
+require 'forwardable'
+
+module RedshiftConnector
+  class DataFileBundleReader
+    extend Forwardable
+
+    DEFAULT_BATCH_SIZE = 1000
+
+    def initialize(bundle, filter: nil, batch_size: DEFAULT_BATCH_SIZE, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @filter = filter || lambda {|*row| row }
+      @batch_size = batch_size || 1000
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :batch_size
+    attr_reader :logger
+
+    def_delegators '@bundle', :url, :bucket, :key
+
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
+      @bundle.data_files.select {|obj| obj.data_object? }
+    end
+
+    REPORT_SIZE = 10_0000
+
+    def each_batch(report: true)
+      n = 0
+      reported = 0
+      do_each_batch(@batch_size) do |rows|
+        yield rows
+        n += rows.size
+        if n / REPORT_SIZE > reported
+          @logger.info "#{n} rows processed" if report
+          reported = n / REPORT_SIZE
+        end
+      end
+      @logger.info "total #{n} rows processed" if report
+    end
+
+    def do_each_batch(batch_size)
+      filter = @filter
+      buf = []
+      each_row do |row|
+        buf.push filter.(*row)
+        if buf.size == batch_size
+          yield buf
+          buf = []
+        end
+      end
+      yield buf unless buf.empty?
+    end
+    private :do_each_batch
+  end
+end
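For orientation, a minimal sketch of how a bundle might be consumed in batches. The bundle object and the column layout are assumptions for illustration, not part of this diff.

# Hypothetical usage sketch: read an exported bundle in batches of 5000 rows,
# keeping only the first two columns of each row. The filter lambda receives
# each row splatted (see do_each_batch above).
reader = RedshiftConnector::DataFileBundleReader.new(
  bundle,                                      # an exported data file bundle
  filter: lambda {|id, name, *| [id, name] },
  batch_size: 5000
)
reader.each_batch do |rows|
  rows.each {|id, name| puts "#{id}\t#{name}" }
end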
data/lib/redshift_connector/exporter.rb
@@ -0,0 +1,40 @@
+module RedshiftConnector
+  module Exporter
+    @default_data_source = nil
+
+    def Exporter.default_data_source=(ds)
+      @default_data_source = ds
+    end
+
+    def Exporter.default_data_source
+      @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+    end
+
+    def Exporter.builder
+      default_data_source.exporter_builder
+    end
+
+    def Exporter.for_table_delta(**params)
+      builder.build_for_table_delta(**params)
+    end
+
+    def Exporter.for_table(**params)
+      builder.build_for_table(**params)
+    end
+
+    def Exporter.for_query(**params)
+      builder.build_for_query(**params)
+    end
+
+    def Exporter.foreach(**params, &block)
+      exporter = for_query(**params)
+      bundle = exporter.execute
+      r = DataFileBundleReader.new(bundle, logger: bundle.logger)
+      begin
+        r.each_row(&block)
+      ensure
+        bundle.clear if bundle.respond_to?(:clear)
+      end
+    end
+  end
+end
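As a hedged illustration of the module-level API above: the data source setup and the query text are assumptions; the concrete exporter class is supplied by the configured data source.

# Hypothetical sketch: configure the default data source once, then stream
# the rows of an arbitrary query. Exporter.foreach builds an exporter via
# the data source's exporter_builder, executes it, and reads the bundle.
RedshiftConnector::Exporter.default_data_source = redshift_data_source  # assumed object

RedshiftConnector::Exporter.foreach(
  schema: 'public',
  table: 'orders_export',
  query: 'select id, amount from public.orders where updated_at > getdate() - 1'
) do |id, amount|
  puts "#{id}: #{amount}"
end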
data/lib/redshift_connector/exporter_builder.rb
@@ -0,0 +1,49 @@
+require 'redshift_connector/query'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ExporterBuilder
+    def initialize(ds:, exporter_class:)
+      @ds = ds
+      @exporter_class = exporter_class
+    end
+
+    def build_for_table_delta(schema:, table:, condition:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_table(schema:, table:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_query(
+      schema:,
+      table:,
+      bucket: nil,
+      query:,
+      txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+      enable_sort: false,
+      logger: RedshiftConnector.logger,
+      quiet: false
+    )
+      logger = NullLogger.new if quiet
+      bundle_params = DataFileBundleParams.new(
+        bucket: bucket,
+        schema: schema,
+        table: table,
+        txn_id: txn_id,
+        logger: logger
+      )
+      @exporter_class.new(
+        ds: @ds,
+        query: ArbitraryQuery.new(query),
+        bundle_params: bundle_params,
+        enable_sort: enable_sort,
+        logger: logger
+      )
+    end
+
+  end
+end
data/lib/redshift_connector/immediate_exporter.rb
@@ -0,0 +1,19 @@
+require 'redshift_connector/s3_data_file_bundle'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ImmediateExporter
+    def initialize(bundle:, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :logger
+
+    def execute
+      @logger.info "USE #{@bundle.url}*"
+      @bundle
+    end
+  end
+end
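ImmediateExporter simply hands back a pre-built bundle. A brief hedged example of where that is useful; the bundle construction itself is assumed, not shown in this diff.

# Hypothetical sketch: skip a fresh export and reuse files already on S3
# by wrapping an existing bundle in an ImmediateExporter.
exporter = RedshiftConnector::ImmediateExporter.new(bundle: existing_bundle)
bundle = exporter.execute   # logs "USE s3://..." and returns the bundle as-is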
data/lib/redshift_connector/importer.rb
@@ -0,0 +1,58 @@
+# create module
+module RedshiftConnector
+  module Importer
+  end
+end
+
+require 'redshift_connector/importer/upsert'
+require 'redshift_connector/importer/insert_delta'
+require 'redshift_connector/importer/rebuild_rename'
+require 'redshift_connector/importer/rebuild_truncate'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  module Importer
+    def Importer.for_delta_upsert(table:, columns:, delete_cond: nil, upsert_columns: nil, logger: RedshiftConnector.logger)
+      if delete_cond and upsert_columns
+        raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+      end
+      importer =
+        if delete_cond
+          Importer::InsertDelta.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            delete_cond: delete_cond,
+            logger: logger
+          )
+        elsif upsert_columns
+          Importer::Upsert.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            upsert_columns: upsert_columns,
+            logger: logger
+          )
+        else
+          raise ArgumentError, "either of delete_cond or upsert_columns is required for delta import"
+        end
+      importer
+    end
+
+    def Importer.for_rebuild(strategy: 'rename', table:, columns:, logger: RedshiftConnector.logger)
+      c = get_rebuild_class(strategy)
+      c.new(
+        dao: table.classify.constantize,
+        columns: columns,
+        logger: logger
+      )
+    end
+
+    def Importer.get_rebuild_class(strategy)
+      case strategy.to_s
+      when 'rename' then RebuildRename
+      when 'truncate' then RebuildTruncate
+      else
+        raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+      end
+    end
+  end
+end
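A brief sketch of how the factory above might be called; the table and column names are illustrative only.

# Hypothetical sketch: build a delta importer for an `orders` table.
# `table.classify.constantize` resolves 'orders' to an Order ActiveRecord
# class, which must already be defined in the host application.
importer = RedshiftConnector::Importer.for_delta_upsert(
  table: 'orders',
  columns: %i[id amount updated_at],
  upsert_columns: %i[amount updated_at]
)
# importer.execute(bundle_reader) then streams batches into the local table
# via activerecord-import.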
data/lib/redshift_connector/importer/insert_delta.rb
@@ -0,0 +1,31 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::InsertDelta
+    def initialize(dao:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @delete_cond = delete_cond
+      @logger = logger
+    end
+
+    def execute(bundle)
+      delete_rows(@delete_cond)
+      import(bundle)
+    end
+
+    def delete_rows(cond_expr)
+      @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+      @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+      @logger.info "deleted."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_rename.rb
@@ -0,0 +1,58 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+require 'thread'
+
+module RedshiftConnector
+  class Importer::RebuildRename
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      dest_table = @dao.table_name
+      tmp_table = "#{dest_table}_new"
+      old_table = "#{dest_table}_old"
+
+      tmp_dao = self.class.make_temporary_dao(@dao)
+      tmp_dao.table_name = tmp_table
+
+      exec_update "drop table if exists #{tmp_table}"
+      exec_update "create table #{tmp_table} like #{dest_table}"
+      import(tmp_dao, bundle)
+      exec_update "drop table if exists #{old_table}"
+      # Atomic table exchange
+      exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+    end
+
+    # Duplicates the DAO (ActiveRecord class) and names it.
+    # Newer activerecord-import requires a class name (not a table name),
+    # so we must prepare a name for the temporary DAO class.
+    def self.make_temporary_dao(orig)
+      tmp = orig.dup
+      const_set("TemporaryDAO_#{get_unique_sequence}", tmp)
+      tmp.name # fix class name
+      tmp
+    end
+
+    @dao_seq = 0
+    @dao_seq_lock = Mutex.new
+
+    def self.get_unique_sequence
+      @dao_seq_lock.synchronize { @dao_seq += 1 }
+    end
+
+    def exec_update(query)
+      @logger.info query
+      @dao.connection.execute(query)
+    end
+
+    def import(dao, bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_truncate.rb
@@ -0,0 +1,30 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::RebuildTruncate
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      truncate_table(@dao.table_name)
+      import(bundle)
+    end
+
+    def truncate_table(table_name)
+      @logger.info "TRUNCATE #{table_name}"
+      @dao.connection.execute("truncate #{table_name}")
+      @logger.info "truncated."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/upsert.rb
@@ -0,0 +1,24 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::Upsert
+    def initialize(dao:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @upsert_columns = upsert_columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      import(bundle)
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+      end
+    end
+  end
+end
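Putting the pieces in this diff together, a rough end-to-end sketch of a full-table rebuild. The exporter setup and the bundle_params object are assumptions mirroring the earlier examples, not code from the gem.

# Hypothetical sketch: export the whole Redshift table, then truncate and
# re-import the local copy using the truncate rebuild strategy.
exporter = RedshiftConnector::Exporter.for_table(
  schema: 'public',
  table: 'categories',
  columns: %i[id name],
  bundle_params: bundle_params   # a DataFileBundleParams, assumed to be prepared
)
bundle = exporter.execute
reader = RedshiftConnector::DataFileBundleReader.new(bundle)

importer = RedshiftConnector::Importer.for_rebuild(
  strategy: 'truncate',
  table: 'categories',
  columns: %i[id name]
)
importer.execute(reader)   # RebuildTruncate#execute consumes reader.each_batch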