redshift_connector 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE +21 -0
  5. data/README.md +42 -0
  6. data/RELEASE.md +89 -0
  7. data/Rakefile +3 -0
  8. data/lib/redshift_connector.rb +35 -0
  9. data/lib/redshift_connector/active_record_data_source.rb +23 -0
  10. data/lib/redshift_connector/active_record_exporter.rb +47 -0
  11. data/lib/redshift_connector/connector.rb +189 -0
  12. data/lib/redshift_connector/data_file.rb +32 -0
  13. data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
  14. data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
  15. data/lib/redshift_connector/exception.rb +5 -0
  16. data/lib/redshift_connector/exporter.rb +40 -0
  17. data/lib/redshift_connector/exporter_builder.rb +49 -0
  18. data/lib/redshift_connector/immediate_exporter.rb +19 -0
  19. data/lib/redshift_connector/importer.rb +58 -0
  20. data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
  21. data/lib/redshift_connector/importer/insert_delta.rb +31 -0
  22. data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
  23. data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
  24. data/lib/redshift_connector/importer/upsert.rb +24 -0
  25. data/lib/redshift_connector/logger.rb +20 -0
  26. data/lib/redshift_connector/query.rb +95 -0
  27. data/lib/redshift_connector/reader.rb +18 -0
  28. data/lib/redshift_connector/reader/abstract.rb +18 -0
  29. data/lib/redshift_connector/reader/csv.rb +24 -0
  30. data/lib/redshift_connector/reader/exception.rb +3 -0
  31. data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
  32. data/lib/redshift_connector/reader/tsv.rb +24 -0
  33. data/lib/redshift_connector/s3_bucket.rb +76 -0
  34. data/lib/redshift_connector/s3_data_file.rb +20 -0
  35. data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
  36. data/lib/redshift_connector/version.rb +3 -0
  37. data/redshift_connector.gemspec +27 -0
  38. metadata +190 -0
@@ -0,0 +1,20 @@
module RedshiftConnector
  # Logger explicitly configured through RedshiftConnector.logger=.
  # Stays nil until a caller assigns one.
  @logger = nil

  class << self
    # Sets the logger used by the connector.
    attr_writer :logger

    # Returns the configured logger, or Rails.logger when none was set.
    # Rails is referenced only at call time, so this file can be loaded
    # before Rails itself is available.
    def logger
      return @logger if @logger
      Rails.logger
    end
  end

  # Logger stand-in whose logging methods accept any arguments and do nothing.
  class NullLogger
    # Shared no-op implementation; always returns nil.
    def noop(*_ignored)
    end

    %i[error warn info debug].each { |level| alias_method level, :noop }
  end
end
@@ -0,0 +1,95 @@
module RedshiftConnector
  # SELECT over a fixed column list of schema.table, optionally restricted
  # by a raw SQL condition string.
  class DeltaQuery
    # schema, table :: interpolated as "schema.table"
    # columns       :: list of column names (each is double-quoted in the SQL)
    # condition     :: raw SQL placed after WHERE; nil means no WHERE clause
    def initialize(schema:, table:, columns:, condition: nil)
      @schema, @table = schema, table
      @columns = columns
      @condition = condition
    end

    # Returns "schema.table".
    def table_spec
      "#{@schema}.#{@table}"
    end

    # Human-readable summary of the query (not SQL).
    def description
      column_list = @columns.join(', ')
      "#{table_spec} (#{column_list}) where (#{@condition})"
    end

    # Returns the SELECT statement text.
    def to_sql
      quoted = @columns.map { |name| %Q("#{name}") }.join(', ')
      where_clause = @condition ? " where #{@condition}" : ''
      "select #{quoted} from #{table_spec}#{where_clause}"
    end
  end

  # SELECT over a fixed column list of schema.table with no filtering.
  class SelectAllQuery
    def initialize(schema:, table:, columns:)
      @schema, @table = schema, table
      @columns = columns
    end

    # Returns "schema.table".
    def table_spec
      "#{@schema}.#{@table}"
    end

    # Human-readable summary of the query (not SQL).
    def description
      column_list = @columns.join(', ')
      "#{table_spec} (#{column_list})"
    end

    # Returns the SELECT statement text.
    def to_sql
      quoted = @columns.map { |name| %Q("#{name}") }.join(', ')
      "select #{quoted} from #{table_spec}"
    end
  end

  # Wraps another query object in an UNLOAD statement that writes to the
  # location described by a bundle (an object responding to #url and
  # #credential_string).
  class UnloadQuery
    # Builds an UnloadQuery from a raw SQL string by wrapping it in an
    # ArbitraryQuery first.
    def self.wrap(query:, bundle:, enable_sort: false)
      new(query: ArbitraryQuery.new(query), bundle: bundle, enable_sort: enable_sort)
    end

    # query       :: object responding to #to_sql and #description
    # bundle      :: object responding to #url and #credential_string
    # enable_sort :: when true the statement is emitted with "parallel off"
    def initialize(query:, bundle:, enable_sort: false)
      @query = query
      @bundle = bundle
      @enable_sort = enable_sort
    end

    # Delegates to the wrapped query.
    # NOTE(review): ArbitraryQuery defines no #table_spec, so this raises
    # NoMethodError for instances created through UnloadQuery.wrap.
    def table_spec
      @query.table_spec
    end

    # Delegates to the wrapped query.
    def description
      @query.description
    end

    # Returns the UNLOAD statement, one clause per line. Leading whitespace
    # is removed from every line of the result, including lines that come
    # from the wrapped query's own SQL.
    def to_sql
      clauses = [
        "unload ('#{escape_query(@query.to_sql)}')",
        "to '#{@bundle.url}'",
        "credentials '#{@bundle.credential_string}'",
        'gzip',
        'allowoverwrite',
        "parallel #{@enable_sort ? 'off' : 'on'}",
        "delimiter ',' escape addquotes"
      ]
      (clauses.join("\n") + "\n").gsub(/^\s+/, '')
    end

    # Backslash-escapes every single quote so the query can be embedded in
    # the single-quoted UNLOAD argument.
    def escape_query(query)
      query.gsub("'") { "\\'" }
    end
  end

  # Query object around a raw SQL string; the string doubles as its
  # description.
  class ArbitraryQuery
    def initialize(query)
      @query = query
    end

    # Returns the raw SQL string.
    def description
      @query
    end

    # Returns the raw SQL string.
    def to_sql
      @query
    end
  end
end
@@ -0,0 +1,18 @@
# Declare the Reader namespace up front so that the reader files required
# below can define their classes as Reader::Xxx.
module RedshiftConnector
  module Reader; end
end
6
+
7
+ require 'redshift_connector/reader/redshift_csv'
8
+ require 'redshift_connector/reader/csv'
9
+ require 'redshift_connector/reader/tsv'
10
+ require 'redshift_connector/reader/exception'
11
+
module RedshiftConnector
  module Reader
    class << self
      # Returns the reader class registered under +id+ by forwarding to
      # Abstract.get_reader_class.
      def get(id)
        Abstract.get_reader_class(id)
      end
    end
  end
end
@@ -0,0 +1,18 @@
module RedshiftConnector
  # Base class for data file readers. It also holds the registry that maps
  # a reader id to the reader class that declared it.
  class Reader::Abstract
    READER_CLASSES = {}   # {Symbol => Class}

    # Registers the receiving class under +id+ (converted with #to_sym).
    # Meant to be called in a subclass body, e.g. `declare_reader :csv`.
    def self.declare_reader(id)
      READER_CLASSES[id.to_sym] = self
    end

    # Returns the class registered under +id+.
    # Raises ArgumentError when no reader was declared for that id.
    def self.get_reader_class(id)
      READER_CLASSES.fetch(id.to_sym) do
        raise ArgumentError, "unknown data file reader type: #{id.inspect}"
      end
    end
  end

  # Module-level shortcut for Reader::Abstract.get_reader_class.
  def self.get_reader_class(id)
    Reader::Abstract.get_reader_class(id)
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'csv'
4
+
module RedshiftConnector
  # Reader for standard CSV data, registered under the id :csv.
  # UNLOAD-generated CSV should go through the RedshiftCSV class instead.
  class Reader::CSV < Reader::Abstract
    declare_reader :csv

    # Truthy (a match offset) when the basename of +key+ contains ".csv"
    # followed by another "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      File.basename(key) =~ /\.csv(?:\.|\z)/
    end

    # f is the input handed to the standard library CSV parser.
    def initialize(f)
      @f = f
    end

    # Yields each parsed row to the given block.
    def each(&block)
      CSV.new(@f).each(&block)
    end
  end
end
@@ -0,0 +1,3 @@
module RedshiftConnector
  # Error class for malformed CSV input.
  # NOTE(review): nothing in this file raises it; presumably the reader
  # implementations do -- confirm against callers.
  class Reader::MalformedCSVException < StandardError
  end
end
@@ -0,0 +1,25 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'redshift_csv_file'
4
+ require 'forwardable'
5
+
module RedshiftConnector
  # Reader for CSV files produced by the Redshift UNLOAD statement (options
  # ADDQUOTES ESCAPE), registered under the id :redshift_csv.
  # UNLOAD escapes data with '\' (backslash), so the standard CSV class
  # cannot be used; parsing is handed to RedshiftCsvFile.
  class Reader::RedshiftCSV < Reader::Abstract
    extend Forwardable

    declare_reader :redshift_csv

    # Row iteration and reading are forwarded to the wrapped RedshiftCsvFile.
    def_delegators '@csv', :each, :each_row, :read_row

    # Truthy (a match offset) when the basename of +key+ contains ".csv"
    # followed by another "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      File.basename(key) =~ /\.csv(?:\.|\z)/
    end

    # f :: IO
    def initialize(f)
      @f = f
      @csv = RedshiftCsvFile.new(f)
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'csv'
4
+
module RedshiftConnector
  # Reader for TSV (tab separated) data, registered under the id :tsv.
  class Reader::TSV < Reader::Abstract
    declare_reader :tsv

    # Truthy (a match offset) when the basename of +key+ contains ".tsv"
    # followed by another "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      File.basename(key) =~ /\.tsv(?:\.|\z)/
    end

    # f must respond to #each_line.
    def initialize(f)
      @f = f
    end

    # Yields one array of fields per input line. The -1 limit passed to
    # split keeps trailing empty fields instead of dropping them.
    def each(&block)
      @f.each_line { |row| yield row.chomp.split("\t", -1) }
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'aws-sdk-s3'
2
+
module RedshiftConnector
  # Configuration for one S3 bucket (name, key prefix, credentials) plus
  # thin wrappers around the AWS SDK bucket resource. The class itself keeps
  # a registry of named instances and remembers one of them as the default.
  class S3Bucket
    @buckets = {}   # {String => S3Bucket}
    @default = nil

    # Creates an instance from +params+ and registers it under +name+
    # (stored as a String). The instance becomes the default when it is the
    # first one registered or when default: true is given.
    def self.add(name, default: false, **params)
      instance = new(**params)
      @buckets[name.to_s] = instance
      @default = instance if default || !@default
    end

    # Returns the default bucket.
    # Raises ArgumentError when no bucket has been registered.
    def self.default
      @default or raise ArgumentError, "no default S3 bucket configured"
    end

    # Returns the bucket registered under +name+.
    # Raises ArgumentError when there is no such registration.
    def self.get(name)
      @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
    end

    # region            :: passed to the S3 client when not nil
    # bucket            :: bucket name (required)
    # prefix            :: key prefix, used by #url
    # access_key_id,
    # secret_access_key :: passed to the S3 client when not nil; also used
    #                      by #credential_string
    # iam_role          :: used by #credential_string; takes precedence over
    #                      the access key pair there
    def initialize(region: nil, bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
      @region = region
      @name = bucket
      @prefix = prefix
      @access_key_id = access_key_id
      @secret_access_key = secret_access_key
      @iam_role = iam_role
    end

    attr_reader :name
    attr_reader :prefix

    # Returns "s3://<bucket name>/<prefix>/".
    # Built from the configured name: @bucket is only populated lazily by
    # #bucket, so it cannot be relied on here.
    def url
      "s3://#{@name}/#{@prefix}/"
    end

    # Memoized Aws::S3::Client built from the non-nil connection settings.
    def client
      @client ||= begin
        args = { region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key }.reject {|k, v| v.nil? }
        Aws::S3::Client.new(**args)
      end
    end

    # Memoized SDK bucket resource for the configured bucket name.
    def bucket
      @bucket ||= begin
        resource = Aws::S3::Resource.new(client: client)
        resource.bucket(@name)
      end
    end

    # Returns the SDK object handle for +key+.
    def object(key)
      bucket.object(key)
    end

    # Returns the SDK object collection under +prefix+.
    def objects(prefix:)
      bucket.objects(prefix: prefix)
    end

    # Deletes the objects with the given keys in a single request.
    # NOTE(review): the S3 DeleteObjects API accepts at most 1000 keys per
    # request -- confirm callers never pass more than that.
    def delete_objects(keys)
      bucket.delete_objects(delete: {objects: keys.map {|k| {key: k} }})
    end

    # Returns the credential string for Redshift S3 access: the IAM role
    # form when a role is configured, otherwise the access key form.
    # Raises ArgumentError when neither is configured.
    def credential_string
      if @iam_role
        "aws_iam_role=#{@iam_role}"
      elsif @access_key_id
        "aws_access_key_id=#{@access_key_id};aws_secret_access_key=#{@secret_access_key}"
      else
        raise ArgumentError, "no credential given for Redshift S3 access"
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
+ require 'redshift_connector/data_file'
2
+
module RedshiftConnector
  # DataFile backed by an S3 object handle.
  class S3DataFile < DataFile
    # presigned_url is forwarded to the wrapped S3 object.
    delegate :presigned_url, to: :@object

    # object       :: S3 object handle; must respond to #key and #get
    # reader_class :: passed through to DataFile#initialize
    def initialize(object, reader_class:)
      super(reader_class: reader_class)
      @object = object
    end

    # Returns the key of the wrapped S3 object.
    def key
      @object.key
    end

    # Fetches the object and returns the body of the response.
    def open
      @object.get.body
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require 'redshift_connector/s3_bucket'
2
+ require 'redshift_connector/s3_data_file'
3
+ require 'redshift_connector/reader'
4
+ require 'redshift_connector/logger'
5
+ require 'aws-sdk-s3'
6
+
module RedshiftConnector
  # The set of S3 objects that share one key prefix inside an S3Bucket,
  # together with the reader class used to parse them.
  class S3DataFileBundle
    # Builds a bundle from a params object responding to #txn_id, #bucket,
    # #schema, #table and #logger. A nil params.bucket selects the default
    # S3Bucket. Raises ArgumentError when txn_id is missing.
    def self.for_params(params)
      raise ArgumentError, "cannot create bundle: missing txn_id" unless params.txn_id

      target = params.bucket ? S3Bucket.get(params.bucket) : S3Bucket.default
      for_table(
        bucket: target,
        schema: params.schema,
        table: params.table,
        txn_id: params.txn_id,
        logger: params.logger
      )
    end

    # Builds a bundle for "<bucket prefix>/<prefix>" read with +format+.
    def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, logger: RedshiftConnector.logger)
      new(bucket, "#{bucket.prefix}/#{prefix}", format: format, logger: logger)
    end

    # Builds a bundle for the export location of one table and transaction:
    # "<bucket prefix>/<schema>_export/<table>/<txn_id>/<table>.csv.",
    # read with the :redshift_csv reader.
    def self.for_table(bucket: S3Bucket.default, schema:, table:, txn_id:, logger: RedshiftConnector.logger)
      export_prefix = "#{bucket.prefix}/#{schema}_export/#{table}/#{txn_id}/#{table}.csv."
      new(bucket, export_prefix, format: :redshift_csv, logger: logger)
    end

    # bucket :: S3Bucket-like object
    # prefix :: full key prefix of the bundle's objects
    # format :: reader id resolved through Reader.get
    # logger :: receives an info line naming the chosen reader class
    def initialize(bucket, prefix, format: :csv, logger: RedshiftConnector.logger)
      @bucket, @prefix = bucket, prefix
      @format = format
      @logger = logger
      @reader_class = Reader.get(format)
      logger.info "reader: #{@reader_class}"
    end

    attr_reader :bucket, :prefix, :logger

    # Returns "s3://<bucket name>/<prefix>".
    def url
      "s3://#{@bucket.name}/#{@prefix}"
    end

    # Credential string of the underlying bucket.
    def credential_string
      @bucket.credential_string
    end

    # Returns one S3DataFile per object found under the bundle prefix.
    def data_files
      found = @bucket.objects(prefix: @prefix)
      found.map { |s3_object| S3DataFile.new(s3_object, reader_class: @reader_class) }
    end

    # Deletes every object under the directory part of the prefix (all
    # keys starting with "<dirname of prefix>/"). Does nothing when no
    # such object exists.
    def clear
      dir_prefix = "#{File.dirname(@prefix)}/"
      keys = @bucket.objects(prefix: dir_prefix).map(&:key)
      return if keys.empty?

      logger.info "DELETE #{dir_prefix}*"
      @bucket.delete_objects(keys)
    end
  end
end
@@ -0,0 +1,3 @@
module RedshiftConnector
  # Gem version string; the gemspec reads it for the published version.
  VERSION = '8.0.0'
end
@@ -0,0 +1,27 @@
1
+ require_relative 'lib/redshift_connector/version'
2
+
# Gem specification for redshift_connector.
Gem::Specification.new do |spec|
  # Identity
  spec.platform = Gem::Platform::RUBY
  spec.name = 'redshift_connector'
  spec.version = RedshiftConnector::VERSION
  spec.summary = 'Redshift bulk data connector'
  spec.description = 'redshift_connector is a bulk data connector for Rails (ActiveRecord).'
  spec.license = 'MIT'

  # Ownership
  spec.authors = ['Minero Aoki']
  spec.email = 'aamine@loveruby.net'
  spec.homepage = 'https://github.com/bricolages/redshift_connector'

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked = `git ls-files -z`.split("\x0")
  spec.files = tracked.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.require_path = 'lib'

  # Runtime requirements
  spec.required_ruby_version = '>= 2.1.0'
  spec.add_dependency 'activerecord'
  spec.add_dependency 'activerecord-redshift'
  spec.add_dependency 'pg', '~> 0.18.0'
  spec.add_dependency 'activerecord-import'
  spec.add_dependency 'redshift_csv_file', '~> 1.0'
  spec.add_dependency 'aws-sdk-s3', '~> 1.0'

  # Development-only requirements
  spec.add_development_dependency 'test-unit'
  spec.add_development_dependency 'rake'
end
metadata ADDED
@@ -0,0 +1,190 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshift_connector
3
+ version: !ruby/object:Gem::Version
4
+ version: 8.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Minero Aoki
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-07-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activerecord
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord-redshift
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pg
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.18.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.18.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: activerecord-import
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: redshift_csv_file
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: aws-sdk-s3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: redshift_connector is a bulk data connector for Rails (ActiveRecord).
126
+ email: aamine@loveruby.net
127
+ executables: []
128
+ extensions: []
129
+ extra_rdoc_files: []
130
+ files:
131
+ - ".gitignore"
132
+ - Gemfile
133
+ - LICENSE
134
+ - README.md
135
+ - RELEASE.md
136
+ - Rakefile
137
+ - lib/redshift_connector.rb
138
+ - lib/redshift_connector/active_record_data_source.rb
139
+ - lib/redshift_connector/active_record_exporter.rb
140
+ - lib/redshift_connector/connector.rb
141
+ - lib/redshift_connector/data_file.rb
142
+ - lib/redshift_connector/data_file_bundle_params.rb
143
+ - lib/redshift_connector/data_file_bundle_reader.rb
144
+ - lib/redshift_connector/exception.rb
145
+ - lib/redshift_connector/exporter.rb
146
+ - lib/redshift_connector/exporter_builder.rb
147
+ - lib/redshift_connector/immediate_exporter.rb
148
+ - lib/redshift_connector/importer.rb
149
+ - lib/redshift_connector/importer/activerecord-import.rb
150
+ - lib/redshift_connector/importer/insert_delta.rb
151
+ - lib/redshift_connector/importer/rebuild_rename.rb
152
+ - lib/redshift_connector/importer/rebuild_truncate.rb
153
+ - lib/redshift_connector/importer/upsert.rb
154
+ - lib/redshift_connector/logger.rb
155
+ - lib/redshift_connector/query.rb
156
+ - lib/redshift_connector/reader.rb
157
+ - lib/redshift_connector/reader/abstract.rb
158
+ - lib/redshift_connector/reader/csv.rb
159
+ - lib/redshift_connector/reader/exception.rb
160
+ - lib/redshift_connector/reader/redshift_csv.rb
161
+ - lib/redshift_connector/reader/tsv.rb
162
+ - lib/redshift_connector/s3_bucket.rb
163
+ - lib/redshift_connector/s3_data_file.rb
164
+ - lib/redshift_connector/s3_data_file_bundle.rb
165
+ - lib/redshift_connector/version.rb
166
+ - redshift_connector.gemspec
167
+ homepage: https://github.com/bricolages/redshift_connector
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
171
+ post_install_message:
172
+ rdoc_options: []
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: 2.1.0
180
+ required_rubygems_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ requirements: []
186
+ rubygems_version: 3.0.3
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: Redshift bulk data connector
190
+ test_files: []