redshift_connector 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE +21 -0
  5. data/README.md +42 -0
  6. data/RELEASE.md +89 -0
  7. data/Rakefile +3 -0
  8. data/lib/redshift_connector.rb +35 -0
  9. data/lib/redshift_connector/active_record_data_source.rb +23 -0
  10. data/lib/redshift_connector/active_record_exporter.rb +47 -0
  11. data/lib/redshift_connector/connector.rb +189 -0
  12. data/lib/redshift_connector/data_file.rb +32 -0
  13. data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
  14. data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
  15. data/lib/redshift_connector/exception.rb +5 -0
  16. data/lib/redshift_connector/exporter.rb +40 -0
  17. data/lib/redshift_connector/exporter_builder.rb +49 -0
  18. data/lib/redshift_connector/immediate_exporter.rb +19 -0
  19. data/lib/redshift_connector/importer.rb +58 -0
  20. data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
  21. data/lib/redshift_connector/importer/insert_delta.rb +31 -0
  22. data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
  23. data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
  24. data/lib/redshift_connector/importer/upsert.rb +24 -0
  25. data/lib/redshift_connector/logger.rb +20 -0
  26. data/lib/redshift_connector/query.rb +95 -0
  27. data/lib/redshift_connector/reader.rb +18 -0
  28. data/lib/redshift_connector/reader/abstract.rb +18 -0
  29. data/lib/redshift_connector/reader/csv.rb +24 -0
  30. data/lib/redshift_connector/reader/exception.rb +3 -0
  31. data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
  32. data/lib/redshift_connector/reader/tsv.rb +24 -0
  33. data/lib/redshift_connector/s3_bucket.rb +76 -0
  34. data/lib/redshift_connector/s3_data_file.rb +20 -0
  35. data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
  36. data/lib/redshift_connector/version.rb +3 -0
  37. data/redshift_connector.gemspec +27 -0
  38. metadata +190 -0
data/lib/redshift_connector/data_file.rb
@@ -0,0 +1,32 @@
+require 'redshift_connector/logger'
+require 'zlib'
+
+module RedshiftConnector
+  class DataFile
+    def initialize(reader_class:)
+      @reader_class = reader_class
+    end
+
+    def each_row(&block)
+      f = open
+      begin
+        if gzipped_object?
+          f = Zlib::GzipReader.new(f)
+        end
+        @reader_class.new(f).each(&block)
+      ensure
+        f.close
+      end
+    end
+
+    # abstract open
+
+    def data_object?
+      @reader_class.data_object?(key)
+    end
+
+    def gzipped_object?
+      File.extname(key) == '.gz'
+    end
+  end
+end
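
Editor's sketch (not part of the diff): DataFile is an abstract base class; each_row relies on an open method returning an IO and a key method that subclasses must provide (the gem's own subclass lives in s3_data_file.rb, listed above). A minimal hypothetical local-file subclass could look like this; LocalDataFile and SimpleCSVReader are illustrative names only:

    require 'csv'
    require 'redshift_connector/data_file'

    # Illustrative reader: the contract DataFile uses is
    # reader_class.data_object?(key) and reader_class.new(io).each {|row| ... }
    class SimpleCSVReader
      def self.data_object?(key)
        key.end_with?('.csv', '.csv.gz')
      end

      def initialize(io)
        @io = io
      end

      def each(&block)
        CSV.new(@io).each(&block)
      end
    end

    # Hypothetical subclass: supplies #key and #open as DataFile expects.
    class LocalDataFile < RedshiftConnector::DataFile
      def initialize(path, reader_class: SimpleCSVReader)
        super(reader_class: reader_class)
        @path = path
      end

      def key
        @path
      end

      def open
        File.open(@path, 'rb')
      end
    end

    LocalDataFile.new('export/part-0000.csv').each_row {|row| p row }
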
data/lib/redshift_connector/data_file_bundle_params.rb
@@ -0,0 +1,25 @@
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class DataFileBundleParams
+    def initialize(
+      bucket: nil,
+      schema:,
+      table:,
+      txn_id: nil,
+      logger: RedshiftConnector.logger
+    )
+      @bucket = bucket
+      @schema = schema
+      @table = table
+      @txn_id = txn_id
+      @logger = logger
+    end
+
+    attr_reader :bucket
+    attr_reader :schema
+    attr_reader :table
+    attr_reader :txn_id
+    attr_reader :logger
+  end
+end
data/lib/redshift_connector/data_file_bundle_reader.rb
@@ -0,0 +1,72 @@
+require 'redshift_connector/logger'
+require 'forwardable'
+
+module RedshiftConnector
+  class DataFileBundleReader
+    extend Forwardable
+
+    DEFAULT_BATCH_SIZE = 1000
+
+    def initialize(bundle, filter: nil, batch_size: DEFAULT_BATCH_SIZE, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @filter = filter || lambda {|*row| row }
+      @batch_size = batch_size || 1000
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :batch_size
+    attr_reader :logger
+
+    def_delegators '@bundle', :url, :bucket, :key
+
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
+      @bundle.data_files.select {|obj| obj.data_object? }
+    end
+
+    REPORT_SIZE = 10_0000
+
+    def each_batch(report: true)
+      n = 0
+      reported = 0
+      do_each_batch(@batch_size) do |rows|
+        yield rows
+        n += rows.size
+        if n / REPORT_SIZE > reported
+          @logger.info "#{n} rows processed" if report
+          reported = n / REPORT_SIZE
+        end
+      end
+      @logger.info "total #{n} rows processed" if report
+    end
+
+    def do_each_batch(batch_size)
+      filter = @filter
+      buf = []
+      each_row do |row|
+        buf.push filter.(*row)
+        if buf.size == batch_size
+          yield buf
+          buf = []
+        end
+      end
+      yield buf unless buf.empty?
+    end
+    private :do_each_batch
+  end
+end
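
Editor's sketch (not part of the diff): DataFileBundleReader only needs a bundle responding to data_files (returning objects with key, data_object?, and each_row) and url. The optional filter lambda transforms every row before batching, and each_batch yields arrays of at most batch_size rows. A self-contained illustration with hypothetical in-memory stand-ins:

    require 'redshift_connector/data_file_bundle_reader'

    # Hypothetical stand-ins for an S3 bundle, for illustration only.
    FakeFile = Struct.new(:key, :rows) do
      def data_object?; true; end
      def each_row(&block); rows.each(&block); end
    end
    FakeBundle = Struct.new(:data_files) do
      def url; 'memory://example/'; end
    end

    bundle = FakeBundle.new([FakeFile.new('part-0000', [['1', 'alice'], ['2', 'bob']])])

    reader = RedshiftConnector::DataFileBundleReader.new(
      bundle,
      filter: lambda {|id, name| [id.to_i, name.upcase] },  # per-row transform
      batch_size: 1000
    )

    reader.each_batch do |rows|
      p rows   #=> [[1, "ALICE"], [2, "BOB"]]
    end
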
data/lib/redshift_connector/exception.rb
@@ -0,0 +1,5 @@
+module RedshiftConnector
+  class Error < ::StandardError; end
+  class ExportError < Error; end
+  class ImportError < Error; end
+end
data/lib/redshift_connector/exporter.rb
@@ -0,0 +1,40 @@
+module RedshiftConnector
+  module Exporter
+    @default_data_source = nil
+
+    def Exporter.default_data_source=(ds)
+      @default_data_source = ds
+    end
+
+    def Exporter.default_data_source
+      @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+    end
+
+    def Exporter.builder
+      default_data_source.exporter_builder
+    end
+
+    def Exporter.for_table_delta(**params)
+      builder.build_for_table_delta(**params)
+    end
+
+    def Exporter.for_table(**params)
+      builder.build_for_table(**params)
+    end
+
+    def Exporter.for_query(**params)
+      builder.build_for_query(**params)
+    end
+
+    def Exporter.foreach(**params, &block)
+      exporter = for_query(**params)
+      bundle = exporter.execute
+      r = DataFileBundleReader.new(bundle, logger: bundle.logger)
+      begin
+        r.each_row(&block)
+      ensure
+        bundle.clear if bundle.respond_to?(:clear)
+      end
+    end
+  end
+end
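
Editor's sketch (not part of the diff): Exporter is a facade over a data source that must be assigned first via Exporter.default_data_source= (otherwise for_query and friends raise ArgumentError). Exporter.foreach unloads a query to S3 and streams the resulting rows back, clearing the bundle afterwards. The data-source object below is a placeholder; the real wiring lives in active_record_data_source.rb and connector.rb from the file list:

    RedshiftConnector::Exporter.default_data_source = some_data_source  # placeholder

    RedshiftConnector::Exporter.foreach(
      schema: 'public',
      table:  'orders',
      query:  'select id, amount from public.orders where updated_at >= dateadd(day, -1, getdate())'
    ) do |id, amount|
      puts "#{id}: #{amount}"
    end
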
data/lib/redshift_connector/exporter_builder.rb
@@ -0,0 +1,49 @@
+require 'redshift_connector/query'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ExporterBuilder
+    def initialize(ds:, exporter_class:)
+      @ds = ds
+      @exporter_class = exporter_class
+    end
+
+    def build_for_table_delta(schema:, table:, condition:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_table(schema:, table:, columns:, bundle_params:, logger: RedshiftConnector.logger)
+      query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+      @exporter_class.new(ds: @ds, query: query, bundle_params: bundle_params, logger: logger)
+    end
+
+    def build_for_query(
+      schema:,
+      table:,
+      bucket: nil,
+      query:,
+      txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+      enable_sort: false,
+      logger: RedshiftConnector.logger,
+      quiet: false
+    )
+      logger = NullLogger.new if quiet
+      bundle_params = DataFileBundleParams.new(
+        bucket: bucket,
+        schema: schema,
+        table: table,
+        txn_id: txn_id,
+        logger: logger
+      )
+      @exporter_class.new(
+        ds: @ds,
+        query: ArbitraryQuery.new(query),
+        bundle_params: bundle_params,
+        enable_sort: enable_sort,
+        logger: logger
+      )
+    end
+
+  end
+end
data/lib/redshift_connector/immediate_exporter.rb
@@ -0,0 +1,19 @@
+require 'redshift_connector/s3_data_file_bundle'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class ImmediateExporter
+    def initialize(bundle:, logger: RedshiftConnector.logger)
+      @bundle = bundle
+      @logger = logger
+    end
+
+    attr_reader :bundle
+    attr_reader :logger
+
+    def execute
+      @logger.info "USE #{@bundle.url}*"
+      @bundle
+    end
+  end
+end
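
Editor's note (not part of the diff): ImmediateExporter is the no-op counterpart of a real exporter: execute does not run an UNLOAD, it just logs and returns the bundle it was given, so an already-exported S3 bundle can be fed straight into an importer. Sketch, with existing_bundle as a placeholder:

    exporter = RedshiftConnector::ImmediateExporter.new(bundle: existing_bundle)
    bundle   = exporter.execute   # logs "USE <url>*" and returns existing_bundle unchanged
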
data/lib/redshift_connector/importer.rb
@@ -0,0 +1,58 @@
+# create module
+module RedshiftConnector
+  module Importer
+  end
+end
+
+require 'redshift_connector/importer/upsert'
+require 'redshift_connector/importer/insert_delta'
+require 'redshift_connector/importer/rebuild_rename'
+require 'redshift_connector/importer/rebuild_truncate'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  module Importer
+    def Importer.for_delta_upsert(table:, columns:, delete_cond: nil, upsert_columns: nil, logger: RedshiftConnector.logger)
+      if delete_cond and upsert_columns
+        raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+      end
+      importer =
+        if delete_cond
+          Importer::InsertDelta.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            delete_cond: delete_cond,
+            logger: logger
+          )
+        elsif upsert_columns
+          Importer::Upsert.new(
+            dao: table.classify.constantize,
+            columns: columns,
+            upsert_columns: upsert_columns,
+            logger: logger
+          )
+        else
+          raise ArgumentError, "either of delete_cond or upsert_columns is required for delta import"
+        end
+      importer
+    end
+
+    def Importer.for_rebuild(strategy: 'rename', table:, columns:, logger: RedshiftConnector.logger)
+      c = get_rebuild_class(strategy)
+      c.new(
+        dao: table.classify.constantize,
+        columns: columns,
+        logger: logger
+      )
+    end
+
+    def Importer.get_rebuild_class(strategy)
+      case strategy.to_s
+      when 'rename' then RebuildRename
+      when 'truncate' then RebuildTruncate
+      else
+        raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+      end
+    end
+  end
+end
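
Editor's sketch (not part of the diff): Importer.for_delta_upsert picks InsertDelta or Upsert depending on which of delete_cond / upsert_columns is given (passing both raises), and for_rebuild picks RebuildRename or RebuildTruncate. The table name is turned into an ActiveRecord model via classify.constantize, so 'orders' must map to an Order model. Example values below are illustrative:

    # Delta import via upsert (requires an Order model and activerecord-import)
    upserter = RedshiftConnector::Importer.for_delta_upsert(
      table: 'orders',
      columns: %i[id amount updated_at],
      upsert_columns: %i[amount updated_at]
    )

    # Full rebuild by truncate-and-reload
    rebuilder = RedshiftConnector::Importer.for_rebuild(
      strategy: 'truncate',
      table: 'orders',
      columns: %i[id amount updated_at]
    )

    # Either importer is then driven by a bundle reader:
    #   upserter.execute(bundle_reader)
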
data/lib/redshift_connector/importer/activerecord-import.rb
@@ -0,0 +1,2 @@
+require 'activerecord-import'
+require 'activerecord-import/base'
data/lib/redshift_connector/importer/insert_delta.rb
@@ -0,0 +1,31 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::InsertDelta
+    def initialize(dao:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @delete_cond = delete_cond
+      @logger = logger
+    end
+
+    def execute(bundle)
+      delete_rows(@delete_cond)
+      import(bundle)
+    end
+
+    def delete_rows(cond_expr)
+      @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+      @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+      @logger.info "deleted."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_rename.rb
@@ -0,0 +1,58 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+require 'thread'
+
+module RedshiftConnector
+  class Importer::RebuildRename
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      dest_table = @dao.table_name
+      tmp_table = "#{dest_table}_new"
+      old_table = "#{dest_table}_old"
+
+      tmp_dao = self.class.make_temporary_dao(@dao)
+      tmp_dao.table_name = tmp_table
+
+      exec_update "drop table if exists #{tmp_table}"
+      exec_update "create table #{tmp_table} like #{dest_table}"
+      import(tmp_dao, bundle)
+      exec_update "drop table if exists #{old_table}"
+      # Atomic table exchange
+      exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+    end
+
+    # Duplicates DAO (ActiveRecord class) and names it.
+    # Newer activerecord-import requires a class name (not a table name),
+    # we must prepare some name for temporary DAO class.
+    def self.make_temporary_dao(orig)
+      tmp = orig.dup
+      const_set("TemporaryDAO_#{get_unique_sequence}", tmp)
+      tmp.name # fix class name
+      tmp
+    end
+
+    @dao_seq = 0
+    @dao_seq_lock = Mutex.new
+
+    def self.get_unique_sequence
+      @dao_seq_lock.synchronize { @dao_seq += 1 }
+    end
+
+    def exec_update(query)
+      @logger.info query
+      @dao.connection.execute(query)
+    end
+
+    def import(dao, bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/rebuild_truncate.rb
@@ -0,0 +1,30 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::RebuildTruncate
+    def initialize(dao:, columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      truncate_table(@dao.table_name)
+      import(bundle)
+    end
+
+    def truncate_table(table_name)
+      @logger.info "TRUNCATE #{table_name}"
+      @dao.connection.execute("truncate #{table_name}")
+      @logger.info "truncated."
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows)
+      end
+    end
+  end
+end
data/lib/redshift_connector/importer/upsert.rb
@@ -0,0 +1,24 @@
+require 'redshift_connector/importer/activerecord-import'
+require 'redshift_connector/logger'
+
+module RedshiftConnector
+  class Importer::Upsert
+    def initialize(dao:, columns:, upsert_columns:, logger: RedshiftConnector.logger)
+      @dao = dao
+      @columns = columns
+      @upsert_columns = upsert_columns
+      @logger = logger
+    end
+
+    def execute(bundle)
+      import(bundle)
+    end
+
+    def import(bundle)
+      @logger.info "IMPORT #{bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')}) upsert (#{@upsert_columns.join(', ')})"
+      bundle.each_batch do |rows|
+        @dao.import(@columns, rows, on_duplicate_key_update: @upsert_columns)
+      end
+    end
+  end
+end
+ end