redshift_connector 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/Gemfile +7 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/RELEASE.md +89 -0
- data/Rakefile +3 -0
- data/lib/redshift_connector.rb +35 -0
- data/lib/redshift_connector/active_record_data_source.rb +23 -0
- data/lib/redshift_connector/active_record_exporter.rb +47 -0
- data/lib/redshift_connector/connector.rb +189 -0
- data/lib/redshift_connector/data_file.rb +32 -0
- data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
- data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
- data/lib/redshift_connector/exception.rb +5 -0
- data/lib/redshift_connector/exporter.rb +40 -0
- data/lib/redshift_connector/exporter_builder.rb +49 -0
- data/lib/redshift_connector/immediate_exporter.rb +19 -0
- data/lib/redshift_connector/importer.rb +58 -0
- data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
- data/lib/redshift_connector/importer/insert_delta.rb +31 -0
- data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
- data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
- data/lib/redshift_connector/importer/upsert.rb +24 -0
- data/lib/redshift_connector/logger.rb +20 -0
- data/lib/redshift_connector/query.rb +95 -0
- data/lib/redshift_connector/reader.rb +18 -0
- data/lib/redshift_connector/reader/abstract.rb +18 -0
- data/lib/redshift_connector/reader/csv.rb +24 -0
- data/lib/redshift_connector/reader/exception.rb +3 -0
- data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
- data/lib/redshift_connector/reader/tsv.rb +24 -0
- data/lib/redshift_connector/s3_bucket.rb +76 -0
- data/lib/redshift_connector/s3_data_file.rb +20 -0
- data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
- data/lib/redshift_connector/version.rb +3 -0
- data/redshift_connector.gemspec +27 -0
- metadata +190 -0
data/lib/redshift_connector/data_file.rb
@@ -0,0 +1,32 @@
+require 'redshift_connector/logger'
+require 'zlib'
+
+module RedshiftConnector
+  class DataFile
+    def initialize(reader_class:)
+      @reader_class = reader_class
+    end
+
+    def each_row(&block)
+      f = open
+      begin
+        if gzipped_object?
+          f = Zlib::GzipReader.new(f)
+        end
+        @reader_class.new(f).each(&block)
+      ensure
+        f.close
+      end
+    end
+
+    # abstract open
+
+    def data_object?
+      @reader_class.data_object?(key)
+    end
+
+    def gzipped_object?
+      File.extname(key) == '.gz'
+    end
+  end
+end
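DataFile leaves `open` (and the `key` it inspects) to subclasses; the gem ships S3-backed implementations elsewhere in this release. A minimal sketch of a conforming subclass, purely illustrative and not part of the gem, assuming a local file path and one of the bundled reader classes:

# Hypothetical subclass for illustration only (not in this gem).
# DataFile#each_row calls open, wraps the IO in Zlib::GzipReader when
# key ends in ".gz", and hands it to the reader class.
class LocalDataFile < RedshiftConnector::DataFile
  def initialize(path, reader_class:)
    super(reader_class: reader_class)
    @path = path
  end

  # key is only used for extension checks (data_object?, gzipped_object?).
  def key
    File.basename(@path)
  end

  # open must return a readable IO object.
  def open
    File.open(@path, 'rb')
  end
end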
data/lib/redshift_connector/data_file_bundle_params.rb
@@ -0,0 +1,25 @@
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class DataFileBundleParams
+    def initialize(
+      bucket: nil,
+      schema:,
+      table:,
+      txn_id: nil,
+      logger: RedshiftConnector.logger
+    )
+      @bucket = bucket
+      @schema = schema
+      @table = table
+      @txn_id = txn_id
+      @logger = logger
+    end
+
+    attr_reader :bucket
+    attr_reader :schema
+    attr_reader :table
+    attr_reader :txn_id
+    attr_reader :logger
+  end
+end
data/lib/redshift_connector/data_file_bundle_reader.rb
@@ -0,0 +1,72 @@
+require 'redshift_connector/logger'
+require 'forwardable'
+
+module RedshiftConnector
+  class DataFileBundleReader
+    extend Forwardable
+
+    DEFAULT_BATCH_SIZE = 1000
+
+    def initialize(bundle, filter: nil, batch_size: DEFAULT_BATCH_SIZE, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @filter = filter || lambda {|*row| row }
+      @batch_size = batch_size || 1000
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :batch_size
+    attr_reader :logger
+
+    def_delegators '@bundle', :url, :bucket, :key
+
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
+      @bundle.data_files.select {|obj| obj.data_object? }
+    end
+
+    REPORT_SIZE = 10_0000
+
+    def each_batch(report: true)
+      n = 0
+      reported = 0
+      do_each_batch(@batch_size) do |rows|
+        yield rows
+        n += rows.size
+        if n / REPORT_SIZE > reported
+          @logger.info "#{n} rows processed" if report
+          reported = n / REPORT_SIZE
+        end
+      end
+      @logger.info "total #{n} rows processed" if report
+    end
+
+    def do_each_batch(batch_size)
+      filter = @filter
+      buf = []
+      each_row do |row|
+        buf.push filter.(*row)
+        if buf.size == batch_size
+          yield buf
+          buf = []
+        end
+      end
+      yield buf unless buf.empty?
+    end
+    private :do_each_batch
+  end
+end
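A sketch of how this reader is driven, assuming `bundle` is any object whose data_files return DataFile-like objects (the Order model and column layout below are illustrative, not part of the diff):

# Illustrative only; bundle and Order are assumptions.
reader = RedshiftConnector::DataFileBundleReader.new(
  bundle,
  filter: lambda {|id, amount| [id.to_i, amount.to_f] },  # optional per-row transform
  batch_size: 500
)
reader.each_batch do |rows|
  # rows holds at most batch_size filtered rows
  Order.import(%w[id amount], rows)
end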
data/lib/redshift_connector/exporter.rb
@@ -0,0 +1,40 @@
+module RedshiftConnector
+  module Exporter
+    @default_data_source = nil
+
+    def Exporter.default_data_source=(ds)
+      @default_data_source = ds
+    end
+
+    def Exporter.default_data_source
+      @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+    end
+
+    def Exporter.builder
+      default_data_source.exporter_builder
+    end
+
+    def Exporter.for_table_delta(**params)
+      builder.build_for_table_delta(**params)
+    end
+
+    def Exporter.for_table(**params)
+      builder.build_for_table(**params)
+    end
+
+    def Exporter.for_query(**params)
+      builder.build_for_query(**params)
+    end
+
+    def Exporter.foreach(**params, &block)
+      exporter = for_query(**params)
+      bundle = exporter.execute
+      r = DataFileBundleReader.new(bundle, logger: bundle.logger)
+      begin
+        r.each_row(&block)
+      ensure
+        bundle.clear if bundle.respond_to?(:clear)
+      end
+    end
+  end
+end
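Exporter is a thin facade over whatever exporter builder the configured data source provides. A usage sketch, assuming a data source object responding to #exporter_builder has been set up (the names and query below are placeholders):

# Illustrative only.
RedshiftConnector::Exporter.default_data_source = redshift_data_source

RedshiftConnector::Exporter.foreach(
  schema: 'public',
  table: 'orders',
  query: 'select id, amount from public.orders'
) do |id, amount|
  puts "#{id}: #{amount}"
end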
data/lib/redshift_connector/exporter_builder.rb
@@ -0,0 +1,49 @@
+require 'redshift_connector/query'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ExporterBuilder
+    def initialize(ds:, exporter_class:)
+      @ds = ds
+      @exporter_class = exporter_class
+    end
+
+    def build_for_table_delta(schema:, table:, condition:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_table(schema:, table:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_query(
+      schema:,
+      table:,
+      bucket: nil,
+      query:,
+      txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+      enable_sort: false,
+      logger: RedshiftConnector.logger,
+      quiet: false
+    )
+      logger = NullLogger.new if quiet
+      bundle_params = DataFileBundleParams.new(
+        bucket: bucket,
+        schema: schema,
+        table: table,
+        txn_id: txn_id,
+        logger: logger
+      )
+      @exporter_class.new(
+        ds: @ds,
+        query: ArbitraryQuery.new(query),
+        bundle_params: bundle_params,
+        enable_sort: enable_sort,
+        logger: logger
+      )
+    end
+
+  end
+end
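build_for_table and build_for_table_delta expect the caller to supply bundle params explicitly, unlike build_for_query which constructs them itself. A sketch, with an assumed bucket name and condition:

# Illustrative only; the bucket name, condition, and columns are placeholders.
params = RedshiftConnector::DataFileBundleParams.new(
  bucket: 'my-unload-bucket',
  schema: 'public',
  table: 'orders'
)
exporter = RedshiftConnector::Exporter.for_table_delta(
  schema: 'public',
  table: 'orders',
  condition: "updated_at > '2024-01-01'",
  columns: %w[id amount updated_at],
  bundle_params: params
)
bundle = exporter.execute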
data/lib/redshift_connector/immediate_exporter.rb
@@ -0,0 +1,19 @@
+require 'redshift_connector/s3_data_file_bundle'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ImmediateExporter
+    def initialize(bundle:, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :logger
+
+    def execute
+      @logger.info "USE #{@bundle.url}*"
+      @bundle
+    end
+  end
+end
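ImmediateExporter skips the export step and simply hands back a bundle that already exists, e.g. to re-import a previous unload. A sketch; `existing_bundle` is an assumption:

# Illustrative only.
exporter = RedshiftConnector::ImmediateExporter.new(bundle: existing_bundle)
bundle = exporter.execute  # logs "USE <url>*" and returns the bundle unchanged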
data/lib/redshift_connector/importer.rb
@@ -0,0 +1,58 @@
+# create module
+module RedshiftConnector
+  module Importer
+  end
+end
+
+require 'redshift_connector/importer/upsert'
+require 'redshift_connector/importer/insert_delta'
+require 'redshift_connector/importer/rebuild_rename'
+require 'redshift_connector/importer/rebuild_truncate'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  module Importer
+    def Importer.for_delta_upsert(table:, columns:, delete_cond: nil, upsert_columns: nil, logger: RedshiftConnector.logger)
+      if delete_cond and upsert_columns
+        raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+      end
+      importer =
+        if delete_cond
+          Importer::InsertDelta.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            delete_cond: delete_cond,
+            logger: logger
+          )
+        elsif upsert_columns
+          Importer::Upsert.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            upsert_columns: upsert_columns,
+            logger: logger
+          )
+        else
+          raise ArgumentError, "either of delete_cond or upsert_columns is required for delta import"
+        end
+      importer
+    end
+
+    def Importer.for_rebuild(strategy: 'rename', table:, columns:, logger: RedshiftConnector.logger)
+      c = get_rebuild_class(strategy)
+      c.new(
+        dao: table.classify.constantize,
+        columns: columns,
+        logger: logger
+      )
+    end
+
+    def Importer.get_rebuild_class(strategy)
+      case strategy.to_s
+      when 'rename' then RebuildRename
+      when 'truncate' then RebuildTruncate
+      else
+        raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+      end
+    end
+  end
+end
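Both factory methods derive the ActiveRecord class from the table name via classify.constantize, so an `orders` table needs an Order model. A sketch, assuming such a model and a `bundle` that supports #each_batch:

# Illustrative only; Order and bundle are assumptions.
importer = RedshiftConnector::Importer.for_delta_upsert(
  table: 'orders',
  columns: %w[id amount updated_at],
  upsert_columns: %w[amount updated_at]   # or delete_cond: "...", never both
)
importer.execute(bundle)

rebuilder = RedshiftConnector::Importer.for_rebuild(
  strategy: 'truncate',
  table: 'orders',
  columns: %w[id amount updated_at]
)
rebuilder.execute(bundle)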
data/lib/redshift_connector/importer/insert_delta.rb
@@ -0,0 +1,31 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::InsertDelta
+    def initialize(dao:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @delete_cond = delete_cond
+      @logger = logger
+    end
+
+    def execute(bundle)
+      delete_rows(@delete_cond)
+      import(bundle)
+    end
+
+    def delete_rows(cond_expr)
+      @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+      @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+      @logger.info "deleted."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_rename.rb
@@ -0,0 +1,58 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+require 'thread'
+
+module RedshiftConnector
+  class Importer::RebuildRename
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      dest_table = @dao.table_name
+      tmp_table = "#{dest_table}_new"
+      old_table = "#{dest_table}_old"
+
+      tmp_dao = self.class.make_temporary_dao(@dao)
+      tmp_dao.table_name = tmp_table
+
+      exec_update "drop table if exists #{tmp_table}"
+      exec_update "create table #{tmp_table} like #{dest_table}"
+      import(tmp_dao, bundle)
+      exec_update "drop table if exists #{old_table}"
+      # Atomic table exchange
+      exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+    end
+
+    # Duplicates DAO (ActiveRecord class) and names it.
+    # Newer activerecord-import requires a class name (not a table name),
+    # we must prepare some name for temporary DAO class.
+    def self.make_temporary_dao(orig)
+      tmp = orig.dup
+      const_set("TemporaryDAO_#{get_unique_sequence}", tmp)
+      tmp.name # fix class name
+      tmp
+    end
+
+    @dao_seq = 0
+    @dao_seq_lock = Mutex.new
+
+    def self.get_unique_sequence
+      @dao_seq_lock.synchronize { @dao_seq += 1 }
+    end
+
+    def exec_update(query)
+      @logger.info query
+      @dao.connection.execute(query)
+    end
+
+    def import(dao, bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_truncate.rb
@@ -0,0 +1,30 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::RebuildTruncate
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      truncate_table(@dao.table_name)
+      import(bundle)
+    end
+
+    def truncate_table(table_name)
+      @logger.info "TRUNCATE #{table_name}"
+      @dao.connection.execute("truncate #{table_name}")
+      @logger.info "truncated."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/upsert.rb
@@ -0,0 +1,24 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::Upsert
+    def initialize(dao:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @upsert_columns = upsert_columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      import(bundle)
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+      end
+    end
+  end
+end
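The upsert path hands conflict handling to activerecord-import's on_duplicate_key_update option, so each batch becomes roughly the following call (the Order model and row values are assumed for illustration):

# Illustrative only; equivalent to one @dao.import call made above.
Order.import(
  %w[id amount updated_at],
  [[1, 100, Time.now], [2, 250, Time.now]],
  on_duplicate_key_update: %w[amount updated_at]
)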