redshift-connector 4.3.0

Files changed (37)
  1. checksums.yaml +7 -0
  2. data/README.md +10 -0
  3. data/lib/redshift-connector.rb +31 -0
  4. data/lib/redshift-connector/connector.rb +146 -0
  5. data/lib/redshift-connector/exporter.rb +116 -0
  6. data/lib/redshift-connector/importer.rb +89 -0
  7. data/lib/redshift-connector/importer/activerecord-import.rb +2 -0
  8. data/lib/redshift-connector/importer/insert_delta.rb +32 -0
  9. data/lib/redshift-connector/importer/rebuild_rename.rb +41 -0
  10. data/lib/redshift-connector/importer/rebuild_truncate.rb +31 -0
  11. data/lib/redshift-connector/importer/upsert.rb +25 -0
  12. data/lib/redshift-connector/logger.rb +20 -0
  13. data/lib/redshift-connector/query.rb +93 -0
  14. data/lib/redshift-connector/reader.rb +18 -0
  15. data/lib/redshift-connector/reader/abstract.rb +18 -0
  16. data/lib/redshift-connector/reader/csv.rb +24 -0
  17. data/lib/redshift-connector/reader/exception.rb +3 -0
  18. data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
  19. data/lib/redshift-connector/reader/tsv.rb +24 -0
  20. data/lib/redshift-connector/s3_bucket.rb +72 -0
  21. data/lib/redshift-connector/s3_data_file.rb +34 -0
  22. data/lib/redshift-connector/s3_data_file_bundle.rb +101 -0
  23. data/lib/redshift-connector/version.rb +3 -0
  24. data/test/all.rb +3 -0
  25. data/test/config.rb +13 -0
  26. data/test/config.rb.example +18 -0
  27. data/test/database.yml +15 -0
  28. data/test/database.yml.example +15 -0
  29. data/test/foreach.rb +5 -0
  30. data/test/helper.rb +25 -0
  31. data/test/item_pvs.ct.mysql +11 -0
  32. data/test/item_pvs.ct.redshift +9 -0
  33. data/test/reader/test_redshift_csv.rb +30 -0
  34. data/test/test_connector.rb +148 -0
  35. data/test/test_reader.rb +10 -0
  36. data/test/test_s3_import.rb +32 -0
  37. metadata +190 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: c5d5bf307943f19b5187a9244ecd89717241d72f
+   data.tar.gz: 1651cd723205ee50247c5e407f39aaa1eafd287d
+ SHA512:
+   metadata.gz: ff3ef2d4b7e647617529313df5b1d24eb9bf537672ba4aeba3de6542d2fa60589576854b308bc7db5b34c853e1cf30a1a517f6cfd1c58b876018dd6c6fcab7c8
+   data.tar.gz: 51de6b97fd7099a2dfaf401b1f230e49f991c623cd43949bbf9e8a4e5f822180040b024638938613aaa89fdbaced93f5c9e27bf6e5e57c8e3d6c24a486f8c06b
data/README.md ADDED
@@ -0,0 +1,10 @@
+ # Redshift Connector for Rails
+
+ redshift-connector is a Redshift bulk data connector for Rails (ActiveRecord).
+
+ ## Settings
+
+ Add the following line to your Gemfile and run bundle install:
+ ```
+ gem 'redshift-connector'
+ ```
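The README stops at installation, but the exporter needs a data source before anything can run. A rough setup sketch follows, assuming a hypothetical abstract ActiveRecord class (`RedshiftDataSource` here) whose connection points at the Redshift cluster; it is wired in via `Exporter.default_data_source` (defined in exporter.rb below).

```ruby
# config/initializers/redshift_connector.rb -- a sketch, not the gem's documented setup.
# RedshiftDataSource and the :redshift database.yml key are assumptions.
require 'redshift-connector'

class RedshiftDataSource < ActiveRecord::Base
  self.abstract_class = true
  establish_connection :redshift   # assumed database.yml entry for the Redshift cluster
end

RedshiftConnector::Exporter.default_data_source = RedshiftDataSource
```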
data/lib/redshift-connector.rb ADDED
@@ -0,0 +1,31 @@
+ module RedshiftConnector
+ end
+
+ require 'redshift-connector/connector'
+ require 'redshift-connector/exporter'
+ require 'redshift-connector/importer'
+ require 'redshift-connector/s3_bucket'
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/version'
+
+ module RedshiftConnector
+   def RedshiftConnector.transport_delta(**params)
+     Connector.transport_delta(**params)
+   end
+
+   def RedshiftConnector.transport_all(**params)
+     Connector.transport_all(**params)
+   end
+
+   def RedshiftConnector.transport_delta_from_s3(**params)
+     Importer.transport_delta_from_s3(**params)
+   end
+
+   def RedshiftConnector.transport_all_from_s3(**params)
+     Importer.transport_all_from_s3(**params)
+   end
+
+   def RedshiftConnector.foreach(**params, &block)
+     Exporter.foreach(**params, &block)
+   end
+ end
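The facade above just forwards keyword arguments to `Connector`, `Importer`, and `Exporter`. A hedged call sketch, with placeholder schema, table, column, and condition values (the keywords mirror `Connector.transport_delta` in connector.rb below):

```ruby
# Sketch only: 'item_pvs' and its columns are placeholders taken from the test
# fixtures; an ItemPv ActiveRecord model is assumed to exist, since the importer
# resolves the DAO via table.classify.constantize.
connector = RedshiftConnector.transport_delta(
  schema: 'public',
  table: 'item_pvs',
  txn_id: '20170301_0001',                                   # placeholder transaction id
  filter: nil,                                               # row filter left nil here
  condition: "updated_at > getdate() - interval '1 day'",    # delta condition on Redshift
  columns: %w[id item_id pv updated_at],
  delete_cond: "updated_at > current_date - interval 1 day"  # rows to delete on MySQL before insert
)
connector.execute   # runs the export (UNLOAD to S3) and then the import
```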
data/lib/redshift-connector/connector.rb ADDED
@@ -0,0 +1,146 @@
+ require 'redshift-connector/exporter'
+ require 'redshift-connector/importer'
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Connector
+     def Connector.transport_delta(
+       schema:,
+       table: nil,
+       src_table: table,
+       dest_table: table,
+       condition:,
+       columns:,
+       delete_cond: nil,
+       upsert_columns: nil,
+       bucket: nil,
+       txn_id:, filter:,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       unless src_table and dest_table
+         raise ArgumentError, "missing :table, :src_table or :dest_table"
+       end
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: src_table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.for_table_delta(
+         bundle: bundle,
+         schema: schema,
+         table: src_table,
+         columns: columns,
+         condition: condition,
+         logger: logger
+       )
+       if delete_cond and upsert_columns
+         raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+       end
+       dao = dest_table.classify.constantize
+       importer =
+         if delete_cond
+           Importer::InsertDelta.new(
+             dao: dao,
+             bundle: bundle,
+             columns: columns,
+             delete_cond: delete_cond,
+             logger: logger
+           )
+         elsif upsert_columns
+           Importer::Upsert.new(
+             dao: dao,
+             bundle: bundle,
+             columns: columns,
+             upsert_columns: upsert_columns,
+             logger: logger
+           )
+         else
+           raise ArgumentError, "either of delete_cond or upsert_columns is required for transport_delta"
+         end
+       new(exporter: exporter, importer: importer, logger: logger)
+     end
+
+     def Connector.transport_all(
+       strategy: 'rename',
+       schema:,
+       table:,
+       src_table: table,
+       dest_table: table,
+       columns:,
+       bucket: nil,
+       txn_id:,
+       filter:,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.for_table(
+         bundle: bundle,
+         schema: schema,
+         table: table,
+         columns: columns,
+         logger: logger
+       )
+       importer = Importer.get_rebuild_class(strategy).new(
+         dao: table.classify.constantize,
+         bundle: bundle,
+         columns: columns,
+         logger: logger
+       )
+       new(exporter: exporter, importer: importer, logger: logger)
+     end
+
+     def initialize(exporter:, importer:, logger:)
+       @exporter = exporter
+       @importer = importer
+       @logger = logger
+     end
+
+     def export_enabled?
+       not ENV['IMPORT_ONLY']
+     end
+
+     def export_forced?
+       !!(ENV['EXPORT_ONLY'] or ENV['FORCE'])
+     end
+
+     def import_enabled?
+       not ENV['EXPORT_ONLY']
+     end
+
+     def execute
+       export(forced: export_forced?) if export_enabled?
+       import if import_enabled?
+     end
+
+     def export(forced: false)
+       @logger.info "==== export task =================================================="
+       if not forced and @exporter.completed?
+         @logger.info "export task is already executed; skip"
+       else
+         @exporter.execute
+       end
+     end
+
+     def import
+       @logger.info "==== import task =================================================="
+       @importer.execute
+     end
+   end
+ end
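`Connector#execute` is gated by environment variables read in `export_enabled?`, `export_forced?`, and `import_enabled?` above, which is how one half of a failed run can be retried without redoing the other. A sketch with placeholder arguments:

```ruby
# IMPORT_ONLY=1  -> skip the export step
# EXPORT_ONLY=1  -> skip the import step and force re-export
# FORCE=1        -> re-export even if the 00completed flag object already exists
ENV['FORCE'] = '1'

connector = RedshiftConnector.transport_all(
  strategy: 'rename',                    # or 'truncate'
  schema: 'public',
  table: 'item_pvs',                     # placeholder table name
  txn_id: '20170301_0001',               # placeholder transaction id
  filter: nil,
  columns: %w[id item_id pv updated_at]  # placeholder columns
)
connector.execute
```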
data/lib/redshift-connector/exporter.rb ADDED
@@ -0,0 +1,116 @@
+ require 'redshift-connector/query'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Exporter
+     def Exporter.default_data_source=(ds)
+       @default_data_source = ds
+     end
+
+     def Exporter.default_data_source
+       @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+     end
+
+     def Exporter.for_table_delta(ds: default_data_source, schema:, table:, condition:, columns:, bundle:, logger: RedshiftConnector.logger)
+       delta_query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+       unload_query = UnloadQuery.new(query: delta_query, bundle: bundle)
+       new(ds: ds, query: unload_query, bundle: bundle, logger: logger)
+     end
+
+     def Exporter.for_table(ds: default_data_source, schema:, table:, columns:, bundle:, logger: RedshiftConnector.logger)
+       query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+       unload_query = UnloadQuery.new(query: query, bundle: bundle)
+       new(ds: ds, query: unload_query, bundle: bundle, logger: logger)
+     end
+
+     def Exporter.foreach(**params, &block)
+       exporter = Exporter.for_query(**params)
+       begin
+         exporter.execute
+         exporter.bundle.each_row(&block)
+       ensure
+         exporter.bundle.clear
+       end
+     end
+
+     def Exporter.for_query(
+       ds: default_data_source,
+       schema:,
+       table:,
+       bucket: nil,
+       query:,
+       txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+       filter: nil,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.new(
+         ds: ds,
+         query: UnloadQuery.wrap(query: query, bundle: bundle),
+         bundle: bundle,
+         logger: logger
+       )
+       exporter
+     end
+
+     def initialize(ds: self.class.default_data_source, query:, bundle:, logger: RedshiftConnector.logger)
+       @ds = ds
+       @query = query
+       @bundle = bundle
+       @logger = logger
+     end
+
+     attr_reader :query
+     attr_reader :bundle
+     attr_reader :logger
+
+     def completed?
+       @bundle.bucket.object(flag_object_key).exists?
+     end
+
+     def create_flag_object
+       @logger.info "TOUCH #{flag_object_key}"
+       @bundle.bucket.object(flag_object_key).put(body: "OK")
+     end
+
+     def flag_object_key
+       "#{File.dirname(@bundle.prefix)}/00completed"
+     end
+
+     def execute
+       @bundle.clear
+       @logger.info "EXPORT #{@query.description} -> #{@bundle.url}*"
+       @ds.connection_pool.with_connection do |conn|
+         stmt = @query.to_sql
+         @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
+         conn.execute(batch_job_label + stmt)
+       end
+       create_flag_object
+     end
+
+     def batch_job_label
+       @batch_job_label ||= begin
+         components = Dir.getwd.split('/')
+         app = if components.last == 'current'
+           # is Capistrano environment
+           components[-2]
+         else
+           components[-1]
+         end
+         batch_file = caller.detect {|c| /redshift-connector|active_record/ !~ c }
+         path = batch_file ? batch_file.split(':').first : '?'
+         "/* Job: #{app}:#{path} */ "
+       end
+     end
+   end
+ end
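`Exporter.foreach` (exposed as `RedshiftConnector.foreach`) unloads an ad-hoc query to S3, streams each row to the block, and clears the files afterwards. A sketch with a placeholder query; the exact shape of each yielded row (assumed here to be an array of column values) depends on the configured reader:

```ruby
# Sketch only: schema/table/query text are placeholders.
RedshiftConnector.foreach(
  schema: 'public',
  table: 'item_pvs',
  query: 'select id, item_id, pv from public.item_pvs'
) do |row|
  # row is one record parsed from the UNLOAD output
  puts row.inspect
end
```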
data/lib/redshift-connector/importer.rb ADDED
@@ -0,0 +1,89 @@
+ # create module
+ module RedshiftConnector
+   module Importer
+   end
+ end
+
+ require 'redshift-connector/importer/upsert'
+ require 'redshift-connector/importer/insert_delta'
+ require 'redshift-connector/importer/rebuild_rename'
+ require 'redshift-connector/importer/rebuild_truncate'
+
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   module Importer
+     def Importer.transport_delta_from_s3(
+         bucket: nil, prefix:, format:, filter: nil,
+         table:, columns:,
+         delete_cond: nil, upsert_columns: nil,
+         logger: RedshiftConnector.logger, quiet: false)
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_prefix(
+         bucket: bucket,
+         prefix: prefix,
+         format: format,
+         filter: filter,
+         logger: logger
+       )
+       if delete_cond and upsert_columns
+         raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+       end
+       importer =
+         if delete_cond
+           Importer::InsertDelta.new(
+             dao: table.classify.constantize,
+             bundle: bundle,
+             columns: columns,
+             delete_cond: delete_cond,
+             logger: logger
+           )
+         elsif upsert_columns
+           Importer::Upsert.new(
+             dao: table.classify.constantize,
+             bundle: bundle,
+             columns: columns,
+             upsert_columns: upsert_columns,
+             logger: logger
+           )
+         else
+           raise ArgumentError, "either of delete_cond or upsert_columns is required for transport_delta"
+         end
+       importer
+     end
+
+     def Importer.transport_all_from_s3(
+         strategy: 'rename',
+         bucket: nil, prefix:, format:, filter: nil,
+         table:, columns:,
+         logger: RedshiftConnector.logger, quiet: false)
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_prefix(
+         bucket: bucket,
+         prefix: prefix,
+         format: format,
+         filter: filter,
+         logger: logger
+       )
+       importer = get_rebuild_class(strategy).new(
+         dao: table.classify.constantize,
+         bundle: bundle,
+         columns: columns,
+         logger: logger
+       )
+       importer
+     end
+
+     def Importer.get_rebuild_class(strategy)
+       case strategy.to_s
+       when 'rename' then RebuildRename
+       when 'truncate' then RebuildTruncate
+       else
+         raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+       end
+     end
+   end
+ end
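The `*_from_s3` entry points skip the UNLOAD step entirely and import whatever already sits under an S3 prefix. A sketch with placeholder prefix and names; the `format:` value is assumed to select one of the bundled readers (csv, tsv, redshift_csv):

```ruby
# Import-only path: consume existing S3 files into the local table.
importer = RedshiftConnector.transport_all_from_s3(
  strategy: 'rename',
  prefix: 'some/export/prefix/',         # placeholder S3 prefix
  format: :csv,                          # assumed reader key
  table: 'item_pvs',                     # placeholder; assumes an ItemPv model
  columns: %w[id item_id pv updated_at]  # placeholder columns
)
importer.execute
```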
data/lib/redshift-connector/importer/activerecord-import.rb ADDED
@@ -0,0 +1,2 @@
+ require 'activerecord-import'
+ require 'activerecord-import/base'
data/lib/redshift-connector/importer/insert_delta.rb ADDED
@@ -0,0 +1,32 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::InsertDelta
+     def initialize(dao:, bundle:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @delete_cond = delete_cond
+       @logger = logger
+     end
+
+     def execute
+       delete_rows(@delete_cond)
+       import
+     end
+
+     def delete_rows(cond_expr)
+       @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+       @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+       @logger.info "deleted."
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows)
+       end
+     end
+   end
+ end
data/lib/redshift-connector/importer/rebuild_rename.rb ADDED
@@ -0,0 +1,41 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::RebuildRename
+     def initialize(dao:, bundle:, columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @logger = logger
+     end
+
+     def execute
+       dest_table = @dao.table_name
+       tmp_table = "#{dest_table}_new"
+       old_table = "#{dest_table}_old"
+
+       tmp_dao = @dao.dup
+       tmp_dao.table_name = tmp_table
+
+       exec_update "drop table if exists #{tmp_table}"
+       exec_update "create table #{tmp_table} like #{dest_table}"
+       import(tmp_dao)
+       exec_update "drop table if exists #{old_table}"
+       # Atomic table exchange
+       exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+     end
+
+     def exec_update(query)
+       @logger.info query
+       @dao.connection.execute(query)
+     end
+
+     def import(dao)
+       @logger.info "IMPORT #{@bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         dao.import(@columns, rows)
+       end
+     end
+   end
+ end