redshift-connector 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. checksums.yaml +7 -0
  2. data/README.md +10 -0
  3. data/lib/redshift-connector.rb +31 -0
  4. data/lib/redshift-connector/connector.rb +146 -0
  5. data/lib/redshift-connector/exporter.rb +116 -0
  6. data/lib/redshift-connector/importer.rb +89 -0
  7. data/lib/redshift-connector/importer/activerecord-import.rb +2 -0
  8. data/lib/redshift-connector/importer/insert_delta.rb +32 -0
  9. data/lib/redshift-connector/importer/rebuild_rename.rb +41 -0
  10. data/lib/redshift-connector/importer/rebuild_truncate.rb +31 -0
  11. data/lib/redshift-connector/importer/upsert.rb +25 -0
  12. data/lib/redshift-connector/logger.rb +20 -0
  13. data/lib/redshift-connector/query.rb +93 -0
  14. data/lib/redshift-connector/reader.rb +18 -0
  15. data/lib/redshift-connector/reader/abstract.rb +18 -0
  16. data/lib/redshift-connector/reader/csv.rb +24 -0
  17. data/lib/redshift-connector/reader/exception.rb +3 -0
  18. data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
  19. data/lib/redshift-connector/reader/tsv.rb +24 -0
  20. data/lib/redshift-connector/s3_bucket.rb +72 -0
  21. data/lib/redshift-connector/s3_data_file.rb +34 -0
  22. data/lib/redshift-connector/s3_data_file_bundle.rb +101 -0
  23. data/lib/redshift-connector/version.rb +3 -0
  24. data/test/all.rb +3 -0
  25. data/test/config.rb +13 -0
  26. data/test/config.rb.example +18 -0
  27. data/test/database.yml +15 -0
  28. data/test/database.yml.example +15 -0
  29. data/test/foreach.rb +5 -0
  30. data/test/helper.rb +25 -0
  31. data/test/item_pvs.ct.mysql +11 -0
  32. data/test/item_pvs.ct.redshift +9 -0
  33. data/test/reader/test_redshift_csv.rb +30 -0
  34. data/test/test_connector.rb +148 -0
  35. data/test/test_reader.rb +10 -0
  36. data/test/test_s3_import.rb +32 -0
  37. metadata +190 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: c5d5bf307943f19b5187a9244ecd89717241d72f
+   data.tar.gz: 1651cd723205ee50247c5e407f39aaa1eafd287d
+ SHA512:
+   metadata.gz: ff3ef2d4b7e647617529313df5b1d24eb9bf537672ba4aeba3de6542d2fa60589576854b308bc7db5b34c853e1cf30a1a517f6cfd1c58b876018dd6c6fcab7c8
+   data.tar.gz: 51de6b97fd7099a2dfaf401b1f230e49f991c623cd43949bbf9e8a4e5f822180040b024638938613aaa89fdbaced93f5c9e27bf6e5e57c8e3d6c24a486f8c06b
data/README.md ADDED
@@ -0,0 +1,10 @@
+ # Redshift Connector for Rails
+
+ redshift-connector is a Redshift bulk data connector for Rails (ActiveRecord).
+
+ ## Settings
+
+ Add the following line to your Gemfile and run `bundle install`.
+ ```
+ gem 'redshift-connector'
+ ```
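The README stops at installation, so here is a minimal usage sketch pieced together from the entry points added in this release (`RedshiftConnector.transport_delta` and `Connector#execute` in connector.rb below). The data source class, schema, table, columns, SQL conditions, filter lambda, and the implied `ItemPv` model are illustrative assumptions, not values documented by the gem.

```
# Sketch only: RedshiftDataSource, 'item_pvs', both SQL conditions and the
# filter lambda are placeholders; adapt them to your own schema.
require 'redshift-connector'

# UNLOAD statements are issued through this Redshift-backed Active Record class.
RedshiftConnector::Exporter.default_data_source = RedshiftDataSource

connector = RedshiftConnector.transport_delta(
  schema: 'public',
  table: 'item_pvs',              # destination model is derived via table.classify.constantize (ItemPv)
  columns: %w[id item_id pv updated_at],
  condition: "updated_at > dateadd(hour, -1, getdate())",  # delta selected on Redshift
  delete_cond: "updated_at > now() - interval 1 hour",     # rows removed before re-insert on the Rails DB
  txn_id: Time.now.strftime('%Y%m%d_%H%M%S'),
  filter: ->(*row) { row }        # assumed per-row hook applied while streaming from S3; identity here
)
connector.execute   # export (UNLOAD to S3) followed by import via activerecord-import
```

`Connector#execute` also honors the `IMPORT_ONLY`, `EXPORT_ONLY`, and `FORCE` environment flags shown in connector.rb.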
data/lib/redshift-connector.rb ADDED
@@ -0,0 +1,31 @@
+ module RedshiftConnector
+ end
+
+ require 'redshift-connector/connector'
+ require 'redshift-connector/exporter'
+ require 'redshift-connector/importer'
+ require 'redshift-connector/s3_bucket'
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/version'
+
+ module RedshiftConnector
+   def RedshiftConnector.transport_delta(**params)
+     Connector.transport_delta(**params)
+   end
+
+   def RedshiftConnector.transport_all(**params)
+     Connector.transport_all(**params)
+   end
+
+   def RedshiftConnector.transport_delta_from_s3(**params)
+     Importer.transport_delta_from_s3(**params)
+   end
+
+   def RedshiftConnector.transport_all_from_s3(**params)
+     Importer.transport_all_from_s3(**params)
+   end
+
+   def RedshiftConnector.foreach(**params, &block)
+     Exporter.foreach(**params, &block)
+   end
+ end
data/lib/redshift-connector/connector.rb ADDED
@@ -0,0 +1,146 @@
+ require 'redshift-connector/exporter'
+ require 'redshift-connector/importer'
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Connector
+     def Connector.transport_delta(
+       schema:,
+       table: nil,
+       src_table: table,
+       dest_table: table,
+       condition:,
+       columns:,
+       delete_cond: nil,
+       upsert_columns: nil,
+       bucket: nil,
+       txn_id:, filter:,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       unless src_table and dest_table
+         raise ArgumentError, "missing :table, :src_table or :dest_table"
+       end
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: src_table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.for_table_delta(
+         bundle: bundle,
+         schema: schema,
+         table: src_table,
+         columns: columns,
+         condition: condition,
+         logger: logger
+       )
+       if delete_cond and upsert_columns
+         raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+       end
+       dao = dest_table.classify.constantize
+       importer =
+         if delete_cond
+           Importer::InsertDelta.new(
+             dao: dao,
+             bundle: bundle,
+             columns: columns,
+             delete_cond: delete_cond,
+             logger: logger
+           )
+         elsif upsert_columns
+           Importer::Upsert.new(
+             dao: dao,
+             bundle: bundle,
+             columns: columns,
+             upsert_columns: upsert_columns,
+             logger: logger
+           )
+         else
+           raise ArgumentError, "either of delete_cond or upsert_columns is required for transport_delta"
+         end
+       new(exporter: exporter, importer: importer, logger: logger)
+     end
+
+     def Connector.transport_all(
+       strategy: 'rename',
+       schema:,
+       table:,
+       src_table: table,
+       dest_table: table,
+       columns:,
+       bucket: nil,
+       txn_id:,
+       filter:,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.for_table(
+         bundle: bundle,
+         schema: schema,
+         table: table,
+         columns: columns,
+         logger: logger
+       )
+       importer = Importer.get_rebuild_class(strategy).new(
+         dao: table.classify.constantize,
+         bundle: bundle,
+         columns: columns,
+         logger: logger
+       )
+       new(exporter: exporter, importer: importer, logger: logger)
+     end
+
+     def initialize(exporter:, importer:, logger:)
+       @exporter = exporter
+       @importer = importer
+       @logger = logger
+     end
+
+     def export_enabled?
+       not ENV['IMPORT_ONLY']
+     end
+
+     def export_forced?
+       !! (ENV['EXPORT_ONLY'] or ENV['FORCE'])
+     end
+
+     def import_enabled?
+       not ENV['EXPORT_ONLY']
+     end
+
+     def execute
+       export(forced: export_forced?) if export_enabled?
+       import if import_enabled?
+     end
+
+     def export(forced: false)
+       @logger.info "==== export task =================================================="
+       if not forced and @exporter.completed?
+         @logger.info "export task is already executed; skip"
+       else
+         @exporter.execute
+       end
+     end
+
+     def import
+       @logger.info "==== import task =================================================="
+       @importer.execute
+     end
+   end
+ end
data/lib/redshift-connector/exporter.rb ADDED
@@ -0,0 +1,116 @@
+ require 'redshift-connector/query'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Exporter
+     def Exporter.default_data_source=(ds)
+       @default_data_source = ds
+     end
+
+     def Exporter.default_data_source
+       @default_data_source or raise ArgumentError, "RedshiftConnector::Exporter.default_data_source was not set"
+     end
+
+     def Exporter.for_table_delta(ds: default_data_source, schema:, table:, condition:, columns:, bundle:, logger: RedshiftConnector.logger)
+       delta_query = DeltaQuery.new(schema: schema, table: table, columns: columns, condition: condition)
+       unload_query = UnloadQuery.new(query: delta_query, bundle: bundle)
+       new(ds: ds, query: unload_query, bundle: bundle, logger: logger)
+     end
+
+     def Exporter.for_table(ds: default_data_source, schema:, table:, columns:, bundle:, logger: RedshiftConnector.logger)
+       query = SelectAllQuery.new(schema: schema, table: table, columns: columns)
+       unload_query = UnloadQuery.new(query: query, bundle: bundle)
+       new(ds: ds, query: unload_query, bundle: bundle, logger: logger)
+     end
+
+     def Exporter.foreach(**params, &block)
+       exporter = Exporter.for_query(**params)
+       begin
+         exporter.execute
+         exporter.bundle.each_row(&block)
+       ensure
+         exporter.bundle.clear
+       end
+     end
+
+     def Exporter.for_query(
+       ds: default_data_source,
+       schema:,
+       table:,
+       bucket: nil,
+       query:,
+       txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
+       filter: nil,
+       logger: RedshiftConnector.logger,
+       quiet: false
+     )
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_table(
+         bucket: bucket,
+         schema: schema,
+         table: table,
+         txn_id: txn_id,
+         filter: filter,
+         logger: logger
+       )
+       exporter = Exporter.new(
+         ds: ds,
+         query: UnloadQuery.wrap(query: query, bundle: bundle),
+         bundle: bundle,
+         logger: logger
+       )
+       exporter
+     end
+
+     def initialize(ds: self.class.default_data_source, query:, bundle:, logger: RedshiftConnector.logger)
+       @ds = ds
+       @query = query
+       @bundle = bundle
+       @logger = logger
+     end
+
+     attr_reader :query
+     attr_reader :bundle
+     attr_reader :logger
+
+     def completed?
+       @bundle.bucket.object(flag_object_key).exists?
+     end
+
+     def create_flag_object
+       @logger.info "TOUCH #{flag_object_key}"
+       @bundle.bucket.object(flag_object_key).put(body: "OK")
+     end
+
+     def flag_object_key
+       "#{File.dirname(@bundle.prefix)}/00completed"
+     end
+
+     def execute
+       @bundle.clear
+       @logger.info "EXPORT #{@query.description} -> #{@bundle.url}*"
+       @ds.connection_pool.with_connection do |conn|
+         stmt = @query.to_sql
+         @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
+         conn.execute(batch_job_label + stmt)
+       end
+       create_flag_object
+     end
+
+     def batch_job_label
+       @batch_job_label ||= begin
+         components = Dir.getwd.split('/')
+         app = if components.last == 'current'
+             # Capistrano environment
+             components[-2]
+           else
+             components[-1]
+           end
+         batch_file = caller.detect {|c| /redshift-connector|active_record/ !~ c }
+         path = batch_file ? batch_file.split(':').first : '?'
+         "/* Job: #{app}:#{path} */ "
+       end
+     end
+   end
+ end
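For ad-hoc reads, the `Exporter.foreach` / `Exporter.for_query` path above UNLOADs an arbitrary query to S3 and yields the resulting rows. A minimal sketch, assuming `default_data_source` is already configured; the schema, table, and SQL text are placeholders, and the exact row shape depends on the readers under reader/ (not shown in this excerpt).

```
# Sketch only: schema, table and the SQL text are illustrative.
RedshiftConnector.foreach(
  schema: 'public',
  table: 'item_pvs',      # combined with txn_id to name the intermediate S3 location
  query: 'select item_id, count(*) from public.item_pvs group by 1'
) do |item_id, count|
  puts "#{item_id}\t#{count}"
end
# The intermediate files are removed by the ensure block in Exporter.foreach.
```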
data/lib/redshift-connector/importer.rb ADDED
@@ -0,0 +1,89 @@
+ # create module
+ module RedshiftConnector
+   module Importer
+   end
+ end
+
+ require 'redshift-connector/importer/upsert'
+ require 'redshift-connector/importer/insert_delta'
+ require 'redshift-connector/importer/rebuild_rename'
+ require 'redshift-connector/importer/rebuild_truncate'
+
+ require 'redshift-connector/s3_data_file_bundle'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   module Importer
+     def Importer.transport_delta_from_s3(
+         bucket: nil, prefix:, format:, filter: nil,
+         table:, columns:,
+         delete_cond: nil, upsert_columns: nil,
+         logger: RedshiftConnector.logger, quiet: false)
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_prefix(
+         bucket: bucket,
+         prefix: prefix,
+         format: format,
+         filter: filter,
+         logger: logger
+       )
+       if delete_cond and upsert_columns
+         raise ArgumentError, "delete_cond and upsert_columns are exclusive"
+       end
+       importer =
+         if delete_cond
+           Importer::InsertDelta.new(
+             dao: table.classify.constantize,
+             bundle: bundle,
+             columns: columns,
+             delete_cond: delete_cond,
+             logger: logger
+           )
+         elsif upsert_columns
+           Importer::Upsert.new(
+             dao: table.classify.constantize,
+             bundle: bundle,
+             columns: columns,
+             upsert_columns: upsert_columns,
+             logger: logger
+           )
+         else
+           raise ArgumentError, "either of delete_cond or upsert_columns is required for transport_delta"
+         end
+       importer
+     end
+
+     def Importer.transport_all_from_s3(
+         strategy: 'rename',
+         bucket: nil, prefix:, format:, filter: nil,
+         table:, columns:,
+         logger: RedshiftConnector.logger, quiet: false)
+       bucket = bucket ? S3Bucket.get(bucket) : S3Bucket.default
+       logger = NullLogger.new if quiet
+       bundle = S3DataFileBundle.for_prefix(
+         bucket: bucket,
+         prefix: prefix,
+         format: format,
+         filter: filter,
+         logger: logger
+       )
+       importer = get_rebuild_class(strategy).new(
+         dao: table.classify.constantize,
+         bundle: bundle,
+         columns: columns,
+         logger: logger
+       )
+       importer
+     end
+
+     def Importer.get_rebuild_class(strategy)
+       case strategy.to_s
+       when 'rename' then RebuildRename
+       when 'truncate' then RebuildTruncate
+       else
+         raise ArgumentError, "unsupported rebuild strategy: #{strategy.inspect}"
+       end
+     end
+   end
+ end
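When the data has already been unloaded to S3 by another job, the `*_from_s3` entry points above build an importer directly from an S3 prefix, skipping the export step. A hedged sketch: the prefix, `:csv` format symbol, columns, and the `ItemPv` model implied by `table.classify.constantize` are assumptions, not values documented in this diff.

```
# Sketch only: prefix, format and columns are illustrative.
importer = RedshiftConnector.transport_all_from_s3(
  strategy: 'rename',                        # or 'truncate'; see Importer.get_rebuild_class above
  prefix: 'item_pvs/20170101_000000_1234/',
  format: :csv,                              # assumed to select one of the readers under reader/
  table: 'item_pvs',
  columns: %w[id item_id pv updated_at]
)
importer.execute
```

With `strategy: 'rename'`, the rows are loaded into a `<table>_new` copy that is then swapped in with a single `rename table` statement (see rebuild_rename.rb below).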
data/lib/redshift-connector/importer/activerecord-import.rb ADDED
@@ -0,0 +1,2 @@
+ require 'activerecord-import'
+ require 'activerecord-import/base'
data/lib/redshift-connector/importer/insert_delta.rb ADDED
@@ -0,0 +1,32 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::InsertDelta
+     def initialize(dao:, bundle:, columns:, delete_cond:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @delete_cond = delete_cond
+       @logger = logger
+     end
+
+     def execute
+       delete_rows(@delete_cond)
+       import
+     end
+
+     def delete_rows(cond_expr)
+       @logger.info "DELETE #{@dao.table_name} where (#{cond_expr})"
+       @dao.connection.execute("delete from #{@dao.table_name} where #{cond_expr}")
+       @logger.info "deleted."
+     end
+
+     def import
+       @logger.info "IMPORT #{@bundle.url}* -> #{@dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         @dao.import(@columns, rows)
+       end
+     end
+   end
+ end
data/lib/redshift-connector/importer/rebuild_rename.rb ADDED
@@ -0,0 +1,41 @@
+ require 'redshift-connector/importer/activerecord-import'
+ require 'redshift-connector/logger'
+
+ module RedshiftConnector
+   class Importer::RebuildRename
+     def initialize(dao:, bundle:, columns:, logger: RedshiftConnector.logger)
+       @dao = dao
+       @bundle = bundle
+       @columns = columns
+       @logger = logger
+     end
+
+     def execute
+       dest_table = @dao.table_name
+       tmp_table = "#{dest_table}_new"
+       old_table = "#{dest_table}_old"
+
+       tmp_dao = @dao.dup
+       tmp_dao.table_name = tmp_table
+
+       exec_update "drop table if exists #{tmp_table}"
+       exec_update "create table #{tmp_table} like #{dest_table}"
+       import(tmp_dao)
+       exec_update "drop table if exists #{old_table}"
+       # Atomic table exchange
+       exec_update "rename table #{dest_table} to #{old_table}, #{tmp_table} to #{dest_table}"
+     end
+
+     def exec_update(query)
+       @logger.info query
+       @dao.connection.execute(query)
+     end
+
+     def import(dao)
+       @logger.info "IMPORT #{@bundle.url}* -> #{dao.table_name} (#{@columns.join(', ')})"
+       @bundle.each_batch do |rows|
+         dao.import(@columns, rows)
+       end
+     end
+   end
+ end