redshift_connector 8.0.0

Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE +21 -0
  5. data/README.md +42 -0
  6. data/RELEASE.md +89 -0
  7. data/Rakefile +3 -0
  8. data/lib/redshift_connector.rb +35 -0
  9. data/lib/redshift_connector/active_record_data_source.rb +23 -0
  10. data/lib/redshift_connector/active_record_exporter.rb +47 -0
  11. data/lib/redshift_connector/connector.rb +189 -0
  12. data/lib/redshift_connector/data_file.rb +32 -0
  13. data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
  14. data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
  15. data/lib/redshift_connector/exception.rb +5 -0
  16. data/lib/redshift_connector/exporter.rb +40 -0
  17. data/lib/redshift_connector/exporter_builder.rb +49 -0
  18. data/lib/redshift_connector/immediate_exporter.rb +19 -0
  19. data/lib/redshift_connector/importer.rb +58 -0
  20. data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
  21. data/lib/redshift_connector/importer/insert_delta.rb +31 -0
  22. data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
  23. data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
  24. data/lib/redshift_connector/importer/upsert.rb +24 -0
  25. data/lib/redshift_connector/logger.rb +20 -0
  26. data/lib/redshift_connector/query.rb +95 -0
  27. data/lib/redshift_connector/reader.rb +18 -0
  28. data/lib/redshift_connector/reader/abstract.rb +18 -0
  29. data/lib/redshift_connector/reader/csv.rb +24 -0
  30. data/lib/redshift_connector/reader/exception.rb +3 -0
  31. data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
  32. data/lib/redshift_connector/reader/tsv.rb +24 -0
  33. data/lib/redshift_connector/s3_bucket.rb +76 -0
  34. data/lib/redshift_connector/s3_data_file.rb +20 -0
  35. data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
  36. data/lib/redshift_connector/version.rb +3 -0
  37. data/redshift_connector.gemspec +27 -0
  38. metadata +190 -0
data/lib/redshift_connector/data_file.rb
@@ -0,0 +1,32 @@
+require 'redshift_connector/logger'
+require 'zlib'
+
+module RedshiftConnector
+  class DataFile
+    def initialize(reader_class:)
+      @reader_class = reader_class
+    end
+
+    def each_row(&block)
+      f = open
+      begin
+        if gzipped_object?
+          f = Zlib::GzipReader.new(f)
+        end
+        @reader_class.new(f).each(&block)
+      ensure
+        f.close
+      end
+    end
+
+    # abstract open
+
+    def data_object?
+      @reader_class.data_object?(key)
+    end
+
+    def gzipped_object?
+      File.extname(key) == '.gz'
+    end
+  end
+end
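DataFile is an abstract base class: a concrete subclass supplies `open` (returning an IO for the object) and `key` (the object name used to detect `.gz` and data-file suffixes). A minimal sketch of such a subclass for local files; `LocalDataFile` is hypothetical, and the reader class is assumed to be the `Reader::CSV` defined in this gem's reader/csv.rb:

    # Illustration only -- the gem ships S3DataFile for real use.
    require 'redshift_connector/data_file'
    require 'redshift_connector/reader/csv'

    class LocalDataFile < RedshiftConnector::DataFile
      def initialize(path)
        super(reader_class: RedshiftConnector::Reader::CSV)
        @path = path
      end

      # Used by data_object? and gzipped_object? in the base class.
      def key
        @path
      end

      # Returns the IO that each_row will read (and gunzip if needed).
      def open
        File.open(@path, 'rb')
      end
    end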
data/lib/redshift_connector/data_file_bundle_params.rb
@@ -0,0 +1,25 @@
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class DataFileBundleParams
+    def initialize(
+      bucket: nil,
+      schema:,
+      table:,
+      txn_id: nil,
+      logger: RedshiftConnector.logger
+    )
+      @bucket = bucket
+      @schema = schema
+      @table = table
+      @txn_id = txn_id
+      @logger = logger
+    end
+
+    attr_reader :bucket
+    attr_reader :schema
+    attr_reader :table
+    attr_reader :txn_id
+    attr_reader :logger
+  end
+end
data/lib/redshift_connector/data_file_bundle_reader.rb
@@ -0,0 +1,72 @@
+require 'redshift_connector/logger'
+require 'forwardable'
+
+module RedshiftConnector
+  class DataFileBundleReader
+    extend Forwardable
+
+    DEFAULT_BATCH_SIZE = 1000
+
+    def initialize(bundle, filter: nil, batch_size: DEFAULT_BATCH_SIZE, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @filter = filter || lambda {|*row| row }
+      @batch_size = batch_size || 1000
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :batch_size
+    attr_reader :logger
+
+    def_delegators '@bundle', :url, :bucket, :key
+
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
+      @bundle.data_files.select {|obj| obj.data_object? }
+    end
+
+    REPORT_SIZE = 10_0000
+
+    def each_batch(report: true)
+      n = 0
+      reported = 0
+      do_each_batch(@batch_size) do |rows|
+        yield rows
+        n += rows.size
+        if n / REPORT_SIZE > reported
+          @logger.info "#{n} rows processed" if report
+          reported = n / REPORT_SIZE
+        end
+      end
+      @logger.info "total #{n} rows processed" if report
+    end
+
+    def do_each_batch(batch_size)
+      filter = @filter
+      buf = []
+      each_row do |row|
+        buf.push filter.(*row)
+        if buf.size == batch_size
+          yield buf
+          buf = []
+        end
+      end
+      yield buf unless buf.empty?
+    end
+    private :do_each_batch
+  end
+end
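A minimal usage sketch of DataFileBundleReader, assuming `bundle` is an already-built data file bundle (e.g. an S3DataFileBundle) and `Order` is a hypothetical ActiveRecord model:

    reader = RedshiftConnector::DataFileBundleReader.new(bundle, batch_size: 500)

    # Stream rows one at a time (each row is an array of column values).
    reader.each_row do |row|
      # process row
    end

    # Or consume rows in batches of up to 500, e.g. for bulk insert
    # via activerecord-import.
    reader.each_batch do |rows|
      Order.import(%w[id amount], rows)
    end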
data/lib/redshift_connector/exception.rb
@@ -0,0 +1,5 @@
+module RedshiftConnector
+  class Error < ::StandardError; end
+  class ExportError < Error; end
+  class ImportError < Error; end
+end
data/lib/redshift_connector/exporter.rb
@@ -0,0 +1,40 @@
+module RedshiftConnector
+  module Exporter
+    @default_data_source = nil
+
+    def Exporter.default_data_source=(ds)
+      @default_data_source = ds
+    end
+
+    def Exporter.default_data_source
+      @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+    end
+
+    def Exporter.builder
+      default_data_source.exporter_builder
+    end
+
+    def Exporter.for_table_delta(**params)
+      builder.build_for_table_delta(**params)
+    end
+
+    def Exporter.for_table(**params)
+      builder.build_for_table(**params)
+    end
+
+    def Exporter.for_query(**params)
+      builder.build_for_query(**params)
+    end
+
+    def Exporter.foreach(**params, &block)
+      exporter = for_query(**params)
+      bundle = exporter.execute
+      r = DataFileBundleReader.new(bundle, logger: bundle.logger)
+      begin
+        r.each_row(&block)
+      ensure
+        bundle.clear if bundle.respond_to?(:clear)
+      end
+    end
+  end
+end
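Exporter.foreach is the one-shot convenience path: it builds a query exporter, executes it, streams every row through the block, and clears the bundle afterwards. A usage sketch, assuming a data source (e.g. one built from this gem's ActiveRecordDataSource) has already been assigned to Exporter.default_data_source and the S3 staging bucket is configured elsewhere; the schema, table, and query values are placeholders:

    RedshiftConnector::Exporter.foreach(
      schema: 'public',
      table: 'orders_export',
      query: 'select id, amount from public.orders where shipped_at is not null'
    ) do |id, amount|
      puts "#{id}\t#{amount}"
    end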
data/lib/redshift_connector/exporter_builder.rb
@@ -0,0 +1,49 @@
+require 'redshift_connector/query'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ExporterBuilder
+    def initialize(ds:, exporter_class:)
+      @ds = ds
+      @exporter_class = exporter_class
+    end
+
+    def build_for_table_delta(schema:, table:, condition:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_table(schema:, table:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_query(
+      schema:,
+      table:,
+      bucket: nil,
+      query:,
+      txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+      enable_sort: false,
+      logger: RedshiftConnector.logger,
+      quiet: false
+    )
+      logger = NullLogger.new if quiet
+      bundle_params = DataFileBundleParams.new(
+        bucket: bucket,
+        schema: schema,
+        table: table,
+        txn_id: txn_id,
+        logger: logger
+      )
+      @exporter_class.new(
+        ds: @ds,
+        query: ArbitraryQuery.new(query),
+        bundle_params: bundle_params,
+        enable_sort: enable_sort,
+        logger: logger
+      )
+    end
+
+  end
+end
data/lib/redshift_connector/immediate_exporter.rb
@@ -0,0 +1,19 @@
+require 'redshift_connector/s3_data_file_bundle'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ImmediateExporter
+    def initialize(bundle:, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :logger
+
+    def execute
+      @logger.info "USE #{@bundle.url}*"
+      @bundle
+    end
+  end
+end
data/lib/redshift_connector/importer.rb
@@ -0,0 +1,58 @@
+# create module
+module RedshiftConnector
+  module Importer
+  end
+end
+
+require 'redshift_connector/importer/upsert'
+require 'redshift_connector/importer/insert_delta'
+require 'redshift_connector/importer/rebuild_rename'
+require 'redshift_connector/importer/rebuild_truncate'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  module Importer
+    def Importer.for_delta_upsert(table:, columns:, delete_cond: nil, upsert_columns: nil, logger: RedshiftConnector.logger)
+      if delete_cond and upsert_columns
+        raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+      end
+      importer =
+        if delete_cond
+          Importer::InsertDelta.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            delete_cond: delete_cond,
+            logger: logger
+          )
+        elsif upsert_columns
+          Importer::Upsert.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            upsert_columns: upsert_columns,
+            logger: logger
+          )
+        else
+          raise ArgumentError, "either of delete_cond or upsert_columns is required for delta import"
+        end
+      importer
+    end
+
+    def Importer.for_rebuild(strategy: 'rename', table:, columns:, logger: RedshiftConnector.logger)
+      c = get_rebuild_class(strategy)
+      c.new(
+        dao: table.classify.constantize,
+        columns: columns,
+        logger: logger
+      )
+    end
+
+    def Importer.get_rebuild_class(strategy)
+      case strategy.to_s
+      when 'rename' then RebuildRename
+      when 'truncate' then RebuildTruncate
+      else
+        raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+      end
+    end
+  end
+end
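Both factory methods constantize the table name into an ActiveRecord class, so `table: 'orders'` expects an `Order` model to exist. A hedged usage sketch; the table, columns, and condition are placeholders, and `bundle_reader` stands for a DataFileBundleReader over the exported files:

    # Delta import: delete matching rows, then bulk-insert the exported rows.
    importer = RedshiftConnector::Importer.for_delta_upsert(
      table: 'orders',
      columns: %w[id amount updated_at],
      delete_cond: "updated_at > '2024-01-01'"
    )
    importer.execute(bundle_reader)

    # Full rebuild using the truncate strategy instead of the default rename.
    rebuild = RedshiftConnector::Importer.for_rebuild(
      strategy: 'truncate',
      table: 'orders',
      columns: %w[id amount updated_at]
    )
    rebuild.execute(bundle_reader)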
data/lib/redshift_connector/importer/activerecord-import.rb
@@ -0,0 +1,2 @@
+require 'activerecord-import'
+require 'activerecord-import/base'
data/lib/redshift_connector/importer/insert_delta.rb
@@ -0,0 +1,31 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::InsertDelta
+    def initialize(dao:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @delete_cond = delete_cond
+      @logger = logger
+    end
+
+    def execute(bundle)
+      delete_rows(@delete_cond)
+      import(bundle)
+    end
+
+    def delete_rows(cond_expr)
+      @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+      @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+      @logger.info "deleted."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_rename.rb
@@ -0,0 +1,58 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+require 'thread'
+
+module RedshiftConnector
+  class Importer::RebuildRename
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      dest_table = @dao.table_name
+      tmp_table = "#{dest_table}_new"
+      old_table = "#{dest_table}_old"
+
+      tmp_dao = self.class.make_temporary_dao(@dao)
+      tmp_dao.table_name = tmp_table
+
+      exec_update "drop table if exists #{tmp_table}"
+      exec_update "create table #{tmp_table} like #{dest_table}"
+      import(tmp_dao, bundle)
+      exec_update "drop table if exists #{old_table}"
+      # Atomic table exchange
+      exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+    end
+
+    # Duplicates the DAO (ActiveRecord class) and gives it a name.
+    # Newer activerecord-import requires a class name (not a table name),
+    # so we must prepare some name for the temporary DAO class.
+    def self.make_temporary_dao(orig)
+      tmp = orig.dup
+      const_set("TemporaryDAO_#{get_unique_sequence}", tmp)
+      tmp.name # fix class name
+      tmp
+    end
+
+    @dao_seq = 0
+    @dao_seq_lock = Mutex.new
+
+    def self.get_unique_sequence
+      @dao_seq_lock.synchronize { @dao_seq += 1 }
+    end
+
+    def exec_update(query)
+      @logger.info query
+      @dao.connection.execute(query)
+    end
+
+    def import(dao, bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_truncate.rb
@@ -0,0 +1,30 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::RebuildTruncate
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      truncate_table(@dao.table_name)
+      import(bundle)
+    end
+
+    def truncate_table(table_name)
+      @logger.info "TRUNCATE #{table_name}"
+      @dao.connection.execute("truncate #{table_name}")
+      @logger.info "truncated."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/upsert.rb
@@ -0,0 +1,24 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::Upsert
+    def initialize(dao:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @upsert_columns = upsert_columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      import(bundle)
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+      end
+    end
+  end
+end
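Upsert delegates the conflict handling to activerecord-import's on_duplicate_key_update option, so each batch is roughly equivalent to a call like the following (Order, the column list, and rows are placeholders):

    Order.import(
      %w[id amount updated_at],                  # columns
      rows,                                      # array of row arrays from the bundle
      on_duplicate_key_update: %w[amount updated_at]
    )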