redshift_connector 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE +21 -0
  5. data/README.md +42 -0
  6. data/RELEASE.md +89 -0
  7. data/Rakefile +3 -0
  8. data/lib/redshift_connector.rb +35 -0
  9. data/lib/redshift_connector/active_record_data_source.rb +23 -0
  10. data/lib/redshift_connector/active_record_exporter.rb +47 -0
  11. data/lib/redshift_connector/connector.rb +189 -0
  12. data/lib/redshift_connector/data_file.rb +32 -0
  13. data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
  14. data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
  15. data/lib/redshift_connector/exception.rb +5 -0
  16. data/lib/redshift_connector/exporter.rb +40 -0
  17. data/lib/redshift_connector/exporter_builder.rb +49 -0
  18. data/lib/redshift_connector/immediate_exporter.rb +19 -0
  19. data/lib/redshift_connector/importer.rb +58 -0
  20. data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
  21. data/lib/redshift_connector/importer/insert_delta.rb +31 -0
  22. data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
  23. data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
  24. data/lib/redshift_connector/importer/upsert.rb +24 -0
  25. data/lib/redshift_connector/logger.rb +20 -0
  26. data/lib/redshift_connector/query.rb +95 -0
  27. data/lib/redshift_connector/reader.rb +18 -0
  28. data/lib/redshift_connector/reader/abstract.rb +18 -0
  29. data/lib/redshift_connector/reader/csv.rb +24 -0
  30. data/lib/redshift_connector/reader/exception.rb +3 -0
  31. data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
  32. data/lib/redshift_connector/reader/tsv.rb +24 -0
  33. data/lib/redshift_connector/s3_bucket.rb +76 -0
  34. data/lib/redshift_connector/s3_data_file.rb +20 -0
  35. data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
  36. data/lib/redshift_connector/version.rb +3 -0
  37. data/redshift_connector.gemspec +27 -0
  38. metadata +190 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 6dfd6aa65bfc44fca5d3e0dc7e042911344be4213764e267136f7259b3529a90
4
+ data.tar.gz: f12016b3044c1199e0b0e3bba4c8e8826e6a72b80d40088d849110dd4624607f
5
+ SHA512:
6
+ metadata.gz: 68e8a3265255b168cbb40b1b200902922fb9641035533cf561ab69feee1d825d15dd999f3d0fb9dfbfc0b00349be662d713c1ca5ac3da418d49c44ad03771e20
7
+ data.tar.gz: 59cf47cc0aaec30d8025497dc11347904d50f9be8a5e68086d9c2952f773619b27be3c8ed5edf8a9f6708afe370ba1dfac7cb48c1602c7cbbb23d33e0457b054
data/.gitignore ADDED
@@ -0,0 +1,20 @@
1
+ # Packaging
2
+ *.gem
3
+ /spec/reports/
4
+ /spec/examples.txt
5
+
6
+ # Documents
7
+ /.yardoc/
8
+ /_yardoc/
9
+ /doc/
10
+ /rdoc/
11
+
12
+ # Bundler
13
+ /.bundle/
14
+ /vendor/bundle
15
+ /lib/bundler/man/
16
+ Gemfile.lock
17
+
18
+ # Gem Specific
19
+ test/database.yml
20
+ test/config.rb
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source "https://rubygems.org"
2
+ gemspec
3
+
4
+ # We need explicit version specification (4 or 5) here,
5
+ # to resolve version dependencies correctly.
6
+ gem 'activerecord', '~> 5.0'
7
+ gem 'mysql2'
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2016,2017 Minero Aoki
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,42 @@
1
+ # Redshift Connector for Rails
2
+
3
+ redshift_connector is a Redshift bulk data connector for Rails (ActiveRecord).
4
+
5
+ This library was formerly called "redshift-connector",
6
+ but renamed to "redshift_connector" to follow the gem naming standard.
7
+
8
+ ## Settings
9
+
10
+ Add the following block to your Gemfile and run bundle.
11
+ ```
12
+ gem 'redshift_connector'
13
+ ```
14
+ Add config/initializers/redshift_connector.rb like the following:
15
+ ```
16
+ module RedshiftConnector
17
+ Exporter.default_data_source = Any_ActiveRecord_Class_Bound_To_Redshift
18
+
19
+ S3Bucket.add('primary', default: true,
20
+ region: 'YOUR_AWS_REGION_NAME',
21
+ bucket: 'YOUR_BUCKET_NAME',
22
+ prefix: 'YOUR_PREFIX',
23
+ iam_role: 'arn:aws:iam::XXXXXXXXXXXX:role/RedshiftReadOnly'
24
+ # For explicit S3 access, use following:
25
+ # aws_access_key_id: 'XXXXXXXXXXXXX',
26
+ # aws_secret_access_key: 'XXXXXXXXXXXXX'
27
+ )
28
+ end
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ### Fetching rows
34
+
35
+ ```
36
+ RedshiftConnector.foreach(schema: 'app_mst', table: 'shops', query: 'select id, name from app_mst.shops') do |id, name|
37
+ p [id, name]
38
+ end
39
+ ```
40
+ `schema` and `table` are the source table name (written in the query).
41
+ This method executes a Redshift UNLOAD statement with the given query and
42
+ unloads the result to the intermediate S3 location, then reads the contents.
data/RELEASE.md ADDED
@@ -0,0 +1,89 @@
1
+ # Release Note
2
+
3
+ ## version 8.0.0
4
+
5
+ - [INCOMPATIBLE] This library is renamed to "redshift_connector". Just modify your Gemfile from "redshift-connector" to "redshift_connector".
6
+ - [INCOMPATIBLE] redshift-connector-data_file gem is merged.
7
+ - [INCOMPATIBLE] (internal) *DataFileBundle#each, #each_row, #each_object, #each_batch, #all_data_objects are removed. Use DataFileBundleReader class instead.
8
+ - [INCOMPATIBLE] (internal) AbstractDataFileBundle class is removed.
9
+ - [INCOMPATIBLE] (internal) AbstractDataFile class is removed.
10
+
11
+ ## version 7.2.2
12
+
13
+ - [fix] RedshiftConnector.transport_all: src_table/dest_table parameter did not work.
14
+ - [fix] RedshiftConnector.transport_all (strategy=rename): newer activerecord-import requires class name.
15
+
16
+ ## version 7.2.1
17
+
18
+ - no change.
19
+
20
+ ## version 7.2.0
21
+
22
+ - Removes aws-sdk dependency
23
+
24
+ ## version 7.0.2
25
+
26
+ - [fix] RedshiftConnector.foreach did not work
27
+
28
+ ## version 7.0.1
29
+
30
+ - [fix] RedshiftConnector.transport_delta_from_s3, .transport_all_from_s3 were wrongly dropped, restore them.
31
+
32
+ ## version 7.0.0
33
+
34
+ - [INCOMPATIBLE] Library hierarchy changed: redshift-connector/* -> redshift_connector/*. redshift-connector.rb still exists as an entry point for bundler.
35
+ - [new] Exporter becomes pluggable. You can implement your own exporter data source instead of ActiveRecord.
36
+
37
+ ## version 6.0.0
38
+
39
+ - version number change only.
40
+
41
+ ## version 5.6.0
42
+
43
+ - Unifies version 4.x (supports Rails 4) and 5.x (supports Rails 5).
44
+
45
+ ## version 4.5.0 / 5.5.0
46
+
47
+ - [new] Separates S3 access layer to another gem: redshift-connector-data_file
48
+
49
+ ## version 4.4.1 / 5.4.1
50
+
51
+ - [new] New option enable_sort for Connector.foreach, to enforce global sorting.
52
+
53
+ ## version 4.4.0 / 5.4.0
54
+
55
+ - [CHANGE] Drops export-only-once feature (and FORCE environment switch), it is not so useful.
56
+ Exporter now always exports data.
57
+
58
+ ## version 4.3.2 / 5.3.2
59
+
60
+ - [new] Allows reading from S3 signed URL (for separated export/import processes)
61
+
62
+ ## version 4.3.1 / 5.3.1
63
+
64
+ - First release for Rails 5 series.
65
+ - [fix] Add option for AWS multi-regions support
66
+
67
+ ## version 4.3.0
68
+
69
+ - [new] New method RedshiftConnector.foreach to read rows with UNLOAD
70
+
71
+ ## version 4.2.0
72
+
73
+ - [new] New methods RedshiftConnector.transport_delta_from_s3, .transport_all_from_s3 to read from S3
74
+
75
+ ## version 4.1.0
76
+
77
+ - [new] Introduces rebuild operator. New facade method Connector.transport_all.
78
+
79
+ ## version 4.0.2
80
+
81
+ - [fix] Correctly parses UNLOAD-generated CSV (dangerous characters are escaped by backslash).
82
+
83
+ ## version 4.0.1
84
+
85
+ - [new] Allow configure the default logger by RedshiftConnector.logger=.
86
+
87
+ ## version 4.0.0
88
+
89
+ First release for Rails 4 series.
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ task :test do
2
+ load "#{__dir__}/test/all.rb"
3
+ end
@@ -0,0 +1,35 @@
1
+ module RedshiftConnector
2
+ end
3
+
4
+ require 'redshift_connector/connector'
5
+ require 'redshift_connector/exporter'
6
+ require 'redshift_connector/active_record_data_source'
7
+ require 'redshift_connector/active_record_exporter'
8
+ require 'redshift_connector/immediate_exporter'
9
+ require 'redshift_connector/importer'
10
+ require 'redshift_connector/s3_bucket'
11
+ require 'redshift_connector/s3_data_file_bundle'
12
+ require 'redshift_connector/exception'
13
+ require 'redshift_connector/version'
14
+
15
+ module RedshiftConnector
16
+ def RedshiftConnector.transport_delta(**params)
17
+ Connector.transport_delta(**params)
18
+ end
19
+
20
+ def RedshiftConnector.transport_all(**params)
21
+ Connector.transport_all(**params)
22
+ end
23
+
24
+ def RedshiftConnector.transport_delta_from_s3(**params)
25
+ Connector.transport_delta_from_s3(**params)
26
+ end
27
+
28
+ def RedshiftConnector.transport_all_from_s3(**params)
29
+ Connector.transport_all_from_s3(**params)
30
+ end
31
+
32
+ def RedshiftConnector.foreach(**params, &block)
33
+ Exporter.foreach(**params, &block)
34
+ end
35
+ end
@@ -0,0 +1,23 @@
1
+ require 'redshift_connector/exporter_builder'
2
+
3
+ module RedshiftConnector
4
+ class ActiveRecordDataSource
5
+ def ActiveRecordDataSource.for_dao(dao)
6
+ new(dao)
7
+ end
8
+
9
+ def initialize(dao)
10
+ @dao = dao
11
+ end
12
+
13
+ def exporter_builder
14
+ ExporterBuilder.new(ds: self, exporter_class: ActiveRecordExporter)
15
+ end
16
+
17
+ def execute_query(query_str)
18
+ @dao.connection_pool.with_connection {|conn|
19
+ conn.execute(query_str)
20
+ }
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,47 @@
1
+ require 'redshift_connector/s3_data_file_bundle'
2
+ require 'redshift_connector/query'
3
+ require 'redshift_connector/logger'
4
+
5
+ module RedshiftConnector
6
+ class ActiveRecordExporter
7
+ def initialize(ds:, query:, bundle_params:, enable_sort: false, logger: RedshiftConnector.logger)
8
+ @ds = ds
9
+ @query = query
10
+ @bundle_params = bundle_params
11
+ @enable_sort = enable_sort
12
+ @logger = logger
13
+
14
+ @bundle = S3DataFileBundle.for_params(bundle_params)
15
+ end
16
+
17
+ attr_reader :query
18
+ attr_reader :bundle_params
19
+ attr_reader :bundle
20
+ attr_reader :logger
21
+
22
+ def execute
23
+ @bundle.clear
24
+ unload_query = UnloadQuery.new(query: @query, bundle: @bundle, enable_sort: @enable_sort)
25
+ @logger.info "EXPORT #{unload_query.description} -> #{@bundle.url}*"
26
+ stmt = unload_query.to_sql
27
+ @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
28
+ @ds.execute_query(batch_job_label + stmt)
29
+ @bundle
30
+ end
31
+
32
+ def batch_job_label
33
+ @batch_job_label ||= begin
34
+ components = Dir.getwd.split('/')
35
+ app = if components.last == 'current'
36
+ # is Capistrano environment
37
+ components[-2]
38
+ else
39
+ components[-1]
40
+ end
41
+ batch_file = caller.detect {|c| /redshift_connector|active_record/ !~ c }
42
+ path = batch_file ? batch_file.split(':').first : '?'
43
+ "/* Job: #{app}:#{path} */ "
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,189 @@
1
+ require 'redshift_connector/exporter'
2
+ require 'redshift_connector/immediate_exporter'
3
+ require 'redshift_connector/importer'
4
+ require 'redshift_connector/s3_data_file_bundle'
5
+ require 'redshift_connector/data_file_bundle_params'
6
+ require 'redshift_connector/data_file_bundle_reader'
7
+ require 'redshift_connector/logger'
8
+
9
+ module RedshiftConnector
10
+ class Connector
11
+ def Connector.transport_delta_from_s3(
12
+ bucket: nil,
13
+ prefix:,
14
+ format:,
15
+ filter: nil,
16
+ table:,
17
+ columns:,
18
+ delete_cond: nil,
19
+ upsert_columns: nil,
20
+ logger: RedshiftConnector.logger,
21
+ quiet: false
22
+ )
23
+ logger = NullLogger.new if quiet
24
+ bundle = S3DataFileBundle.for_prefix(
25
+ bucket: (bucket ? S3Bucket.get(bucket) : S3Bucket.default),
26
+ prefix: prefix,
27
+ format: format,
28
+ logger: logger
29
+ )
30
+ exporter = ImmediateExporter.new(bundle: bundle, logger: logger)
31
+ importer = Importer.for_delta_upsert(
32
+ table: table,
33
+ columns: columns,
34
+ delete_cond: delete_cond,
35
+ upsert_columns: upsert_columns,
36
+ logger: logger
37
+ )
38
+ new(exporter: exporter, importer: importer, filter: filter, logger: logger)
39
+ end
40
+
41
+ def Connector.transport_delta(
42
+ schema:,
43
+ table: nil,
44
+ src_table: table,
45
+ dest_table: table,
46
+ condition:,
47
+ columns:,
48
+ delete_cond: nil,
49
+ upsert_columns: nil,
50
+ bucket: nil,
51
+ txn_id: nil,
52
+ filter:,
53
+ logger: RedshiftConnector.logger,
54
+ quiet: false
55
+ )
56
+ unless src_table and dest_table
57
+ raise ArgumentError, "missing :table, :src_table or :dest_table"
58
+ end
59
+ logger = NullLogger.new if quiet
60
+ bundle_params = DataFileBundleParams.new(
61
+ bucket: bucket,
62
+ schema: schema,
63
+ table: src_table,
64
+ txn_id: txn_id,
65
+ logger: logger
66
+ )
67
+ exporter = Exporter.for_table_delta(
68
+ bundle_params: bundle_params,
69
+ schema: schema,
70
+ table: src_table,
71
+ columns: columns,
72
+ condition: condition,
73
+ logger: logger
74
+ )
75
+ importer = Importer.for_delta_upsert(
76
+ table: dest_table,
77
+ columns: columns,
78
+ delete_cond: delete_cond,
79
+ upsert_columns: upsert_columns,
80
+ logger: logger
81
+ )
82
+ new(exporter: exporter, importer: importer, filter: filter, logger: logger)
83
+ end
84
+
85
+ def Connector.transport_all_from_s3(
86
+ strategy: 'rename',
87
+ table:,
88
+ columns:,
89
+ bucket: nil,
90
+ prefix:,
91
+ format:,
92
+ filter: nil,
93
+ logger: RedshiftConnector.logger,
94
+ quiet: false
95
+ )
96
+ logger = NullLogger.new if quiet
97
+ bundle = S3DataFileBundle.for_prefix(
98
+ bucket: (bucket ? S3Bucket.get(bucket) : S3Bucket.default),
99
+ prefix: prefix,
100
+ format: format,
101
+ logger: logger
102
+ )
103
+ exporter = ImmediateExporter.new(bundle: bundle, logger: logger)
104
+ importer = Importer.for_rebuild(
105
+ strategy: strategy,
106
+ table: table,
107
+ columns: columns,
108
+ logger: logger
109
+ )
110
+ new(exporter: exporter, importer: importer, filter: filter, logger: logger)
111
+ end
112
+
113
+ def Connector.transport_all(
114
+ strategy: 'rename',
115
+ schema:,
116
+ table: nil,
117
+ src_table: table,
118
+ dest_table: table,
119
+ columns:,
120
+ bucket: nil,
121
+ txn_id: nil,
122
+ filter:,
123
+ logger: RedshiftConnector.logger,
124
+ quiet: false
125
+ )
126
+ logger = NullLogger.new if quiet
127
+ bundle_params = DataFileBundleParams.new(
128
+ bucket: bucket,
129
+ schema: schema,
130
+ table: src_table,
131
+ txn_id: txn_id,
132
+ logger: logger
133
+ )
134
+ exporter = Exporter.for_table(
135
+ bundle_params: bundle_params,
136
+ schema: schema,
137
+ table: src_table,
138
+ columns: columns,
139
+ logger: logger
140
+ )
141
+ importer = Importer.for_rebuild(
142
+ strategy: strategy,
143
+ table: dest_table,
144
+ columns: columns,
145
+ logger: logger
146
+ )
147
+ new(exporter: exporter, importer: importer, filter: filter, logger: logger)
148
+ end
149
+
150
+ def initialize(exporter:, importer:, filter: nil, logger:)
151
+ @exporter = exporter
152
+ @importer = importer
153
+ @filter = filter
154
+ @logger = logger
155
+ @bundle = nil
156
+ end
157
+
158
+ def export_enabled?
159
+ not ENV['IMPORT_ONLY']
160
+ end
161
+
162
+ def import_enabled?
163
+ not ENV['EXPORT_ONLY']
164
+ end
165
+
166
+ def execute
167
+ export if export_enabled?
168
+ import if import_enabled?
169
+ end
170
+
171
+ def export
172
+ @logger.info "==== export task =================================================="
173
+ @bundle = @exporter.execute
174
+ end
175
+
176
+ DEFAULT_BATCH_SIZE = 1000
177
+
178
+ def import
179
+ @logger.info "==== import task =================================================="
180
+ r = DataFileBundleReader.new(
181
+ @bundle,
182
+ filter: @filter,
183
+ batch_size: DEFAULT_BATCH_SIZE,
184
+ logger: @logger
185
+ )
186
+ @importer.execute(r)
187
+ end
188
+ end
189
+ end