redshift_connector 8.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +20 -0
  3. data/Gemfile +7 -0
  4. data/LICENSE +21 -0
  5. data/README.md +42 -0
  6. data/RELEASE.md +89 -0
  7. data/Rakefile +3 -0
  8. data/lib/redshift_connector.rb +35 -0
  9. data/lib/redshift_connector/active_record_data_source.rb +23 -0
  10. data/lib/redshift_connector/active_record_exporter.rb +47 -0
  11. data/lib/redshift_connector/connector.rb +189 -0
  12. data/lib/redshift_connector/data_file.rb +32 -0
  13. data/lib/redshift_connector/data_file_bundle_params.rb +25 -0
  14. data/lib/redshift_connector/data_file_bundle_reader.rb +72 -0
  15. data/lib/redshift_connector/exception.rb +5 -0
  16. data/lib/redshift_connector/exporter.rb +40 -0
  17. data/lib/redshift_connector/exporter_builder.rb +49 -0
  18. data/lib/redshift_connector/immediate_exporter.rb +19 -0
  19. data/lib/redshift_connector/importer.rb +58 -0
  20. data/lib/redshift_connector/importer/activerecord-import.rb +2 -0
  21. data/lib/redshift_connector/importer/insert_delta.rb +31 -0
  22. data/lib/redshift_connector/importer/rebuild_rename.rb +58 -0
  23. data/lib/redshift_connector/importer/rebuild_truncate.rb +30 -0
  24. data/lib/redshift_connector/importer/upsert.rb +24 -0
  25. data/lib/redshift_connector/logger.rb +20 -0
  26. data/lib/redshift_connector/query.rb +95 -0
  27. data/lib/redshift_connector/reader.rb +18 -0
  28. data/lib/redshift_connector/reader/abstract.rb +18 -0
  29. data/lib/redshift_connector/reader/csv.rb +24 -0
  30. data/lib/redshift_connector/reader/exception.rb +3 -0
  31. data/lib/redshift_connector/reader/redshift_csv.rb +25 -0
  32. data/lib/redshift_connector/reader/tsv.rb +24 -0
  33. data/lib/redshift_connector/s3_bucket.rb +76 -0
  34. data/lib/redshift_connector/s3_data_file.rb +20 -0
  35. data/lib/redshift_connector/s3_data_file_bundle.rb +68 -0
  36. data/lib/redshift_connector/version.rb +3 -0
  37. data/redshift_connector.gemspec +27 -0
  38. metadata +190 -0
@@ -0,0 +1,20 @@
1
# Logger configuration for RedshiftConnector.
module RedshiftConnector
  # Explicitly configured logger; stays nil until RedshiftConnector.logger= is called.
  @logger = nil

  # Returns the logger set through RedshiftConnector.logger=, or Rails.logger
  # when none has been set. Rails is only referenced at call time, so this
  # file does not need Rails to be loaded when it is required.
  def RedshiftConnector.logger
    @logger || Rails.logger
  end

  # Sets the logger returned by RedshiftConnector.logger.
  def RedshiftConnector.logger=(logger)
    @logger = logger
  end

  # Logger stand-in that accepts the usual severity methods and discards
  # every message. Each severity method takes any arguments and returns nil.
  class NullLogger
    def noop(*args) end
    alias unknown noop
    alias fatal noop
    alias error noop
    alias warn noop
    alias info noop
    alias debug noop
  end
end
@@ -0,0 +1,95 @@
1
+ module RedshiftConnector
2
# SELECT of the given columns from schema.table, optionally restricted by a
# raw SQL condition.
class DeltaQuery
  # schema, table :: names, interpolated into the SQL as-is.
  # columns       :: Array of column names.
  # condition     :: raw SQL predicate text, or nil for no restriction.
  def initialize(schema:, table:, columns:, condition: nil)
    @schema = schema
    @table = table
    @columns = columns
    @condition = condition
  end

  # Returns "schema.table".
  def table_spec
    "#{@schema}.#{@table}"
  end

  # Human-readable summary of the query. The "where (...)" part is only
  # included when a condition was given, mirroring to_sql.
  def description
    summary = "#{table_spec} (#{@columns.join(', ')})"
    @condition ? "#{summary} where (#{@condition})" : summary
  end

  # Returns the SELECT statement. Column names are wrapped in double quotes
  # (embedded double quotes are not escaped); the condition is appended verbatim.
  def to_sql
    select_list = @columns.map {|c| %Q("#{c}") }.join(', ')
    sql = "select #{select_list} from #{table_spec}"
    @condition ? "#{sql} where #{@condition}" : sql
  end
end
24
+
25
# Unconditional SELECT of the given columns from schema.table.
class SelectAllQuery
  # schema, table :: names, interpolated into the SQL as-is.
  # columns       :: Array of column names.
  def initialize(schema:, table:, columns:)
    @schema, @table, @columns = schema, table, columns
  end

  # Returns "schema.table".
  def table_spec
    "#{@schema}.#{@table}"
  end

  # Human-readable summary: the table followed by the column list.
  def description
    column_list = @columns.join(', ')
    "#{table_spec} (#{column_list})"
  end

  # Returns the SELECT statement with every column name in double quotes.
  def to_sql
    select_list = @columns.map { |name| "\"#{name}\"" }.join(', ')
    "select #{select_list} from #{table_spec}"
  end
end
45
+
46
# Wraps another query object in a Redshift UNLOAD statement that writes the
# result to the location given by a data file bundle.
class UnloadQuery
  # Builds an UnloadQuery around a raw SQL string (wrapped in ArbitraryQuery).
  def UnloadQuery.wrap(query:, bundle:, enable_sort: false)
    new(query: ArbitraryQuery.new(query), bundle: bundle, enable_sort: enable_sort)
  end

  # query       :: object responding to to_sql, description and table_spec.
  # bundle      :: object responding to url and credential_string.
  # enable_sort :: when true the statement uses "parallel off", otherwise "parallel on".
  def initialize(query:, bundle:, enable_sort: false)
    @query = query
    @bundle = bundle
    @enable_sort = enable_sort
  end

  # Delegates to the wrapped query.
  def table_spec
    @query.table_spec
  end

  # Delegates to the wrapped query.
  def description
    @query.description
  end

  # Returns the UNLOAD statement, one clause per line, each line terminated
  # by a newline. The statement is assembled from separate lines rather than
  # post-processing a heredoc, so whitespace inside the wrapped query
  # (e.g. in a multi-line string literal) is passed through untouched.
  def to_sql
    [
      "unload ('#{escape_query(@query.to_sql)}')",
      "to '#{@bundle.url}'",
      "credentials '#{@bundle.credential_string}'",
      'gzip',
      'allowoverwrite',
      "parallel #{@enable_sort ? 'off' : 'on'}",
      "delimiter ',' escape addquotes"
    ].map { |clause| "#{clause}\n" }.join
  end

  # Prefixes every single quote in +query+ with a backslash so the query can
  # be embedded in the single-quoted UNLOAD argument. The replacement needs a
  # doubled backslash because a bare \' in a gsub replacement string means
  # "text after the match".
  def escape_query(query)
    query.gsub("'", "\\\\'")
  end
end
81
+
82
# Adapter that lets a raw SQL string stand in for a query object.
# NOTE(review): this class defines no table_spec, although
# UnloadQuery#table_spec delegates to its wrapped query — confirm that
# table_spec is never called on a query built with UnloadQuery.wrap.
class ArbitraryQuery
  # query :: the SQL text; stored and returned unchanged.
  def initialize(query)
    @query = query
  end

  # The SQL text doubles as the human-readable description.
  def description
    @query
  end

  # to_sql returns the same stored SQL text as description.
  alias_method :to_sql, :description
end
95
+ end
@@ -0,0 +1,18 @@
1
# Define the Reader namespace before loading the reader files: they open
# their classes as `Reader::Name`, which requires the module to exist.
module RedshiftConnector
  module Reader; end
end
6
+
7
+ require 'redshift_connector/reader/redshift_csv'
8
+ require 'redshift_connector/reader/csv'
9
+ require 'redshift_connector/reader/tsv'
10
+ require 'redshift_connector/reader/exception'
11
+
12
module RedshiftConnector
  module Reader
    # Returns the reader class registered under +id+ by delegating to
    # Abstract.get_reader_class.
    def self.get(id)
      Abstract.get_reader_class(id)
    end
  end
end
@@ -0,0 +1,18 @@
1
module RedshiftConnector
  # Base class for data file readers. Also holds the registry that maps a
  # reader id to the reader class declared under it.
  class Reader::Abstract
    READER_CLASSES = {}   # {Symbol => Class}

    class << self
      # Registers the receiving class under +id+ (converted with to_sym).
      def declare_reader(id)
        READER_CLASSES[id.to_sym] = self
      end

      # Returns the class registered under +id+.
      # Raises ArgumentError when nothing is registered under that id.
      def get_reader_class(id)
        reader_class = READER_CLASSES[id.to_sym]
        return reader_class if reader_class

        raise ArgumentError, "unknown data file reader type: #{id.inspect}"
      end
    end
  end

  # Module-level shortcut for Reader::Abstract.get_reader_class.
  def self.get_reader_class(id)
    Reader::Abstract.get_reader_class(id)
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'csv'
4
+
5
module RedshiftConnector
  # Reader for standard CSV data.
  # CSV generated by Redshift UNLOAD should be read with the RedshiftCSV class.
  class Reader::CSV < Reader::Abstract
    declare_reader :csv

    # Truthy (a match index) when the base name of +key+ contains ".csv"
    # followed by either "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      File.basename(key) =~ /\.csv(?:\.|\z)/
    end

    # f :: data source handed to CSV.new when iterating.
    def initialize(f)
      @f = f
    end

    # Passes each parsed row to the given block.
    # NOTE(review): `CSV` here presumably resolves to the standard library
    # class, not to this reader class — confirm if the nesting ever changes.
    def each(&block)
      CSV.new(@f).each(&block)
    end
  end
end
@@ -0,0 +1,3 @@
1
module RedshiftConnector
  # Error type in the Reader namespace.
  # NOTE(review): presumably raised for unparsable CSV input; no raise site
  # is visible in this file — confirm against the readers that use it.
  class Reader::MalformedCSVException < StandardError
  end
end
@@ -0,0 +1,25 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'redshift_csv_file'
4
+ require 'forwardable'
5
+
6
module RedshiftConnector
  # Reader for CSV files produced by the Redshift UNLOAD statement with the
  # ADDQUOTES ESCAPE options. UNLOAD escapes data with '\' (backslash), which
  # the standard CSV class cannot handle, so reading is done by RedshiftCsvFile.
  class Reader::RedshiftCSV < Reader::Abstract
    extend Forwardable

    declare_reader :redshift_csv

    # Row access is forwarded to the wrapped RedshiftCsvFile.
    def_delegators '@csv', :each, :each_row, :read_row

    # Truthy (a match index) when the base name of +key+ contains ".csv"
    # followed by either "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      File.basename(key) =~ /\.csv(?:\.|\z)/
    end

    # f :: IO
    def initialize(f)
      @f = f
      @csv = RedshiftCsvFile.new(f)
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift_connector/reader/abstract'
2
+ require 'redshift_connector/reader/exception'
3
+ require 'csv'
4
+
5
module RedshiftConnector
  # Parses TSV (tab-separated values) files.
  class Reader::TSV < Reader::Abstract
    declare_reader :tsv

    # Truthy (a match index) when the base name of +key+ contains ".tsv"
    # followed by either "." or the end of the name; nil otherwise.
    def self.data_object?(key)
      /\.tsv(?:\.|\z)/ =~ File.basename(key)
    end

    # f :: object responding to each_line (e.g. an IO).
    def initialize(f)
      @f = f
    end

    # Yields every line of the input as an Array of field strings: the line
    # terminator is removed and the line is split on tabs. The -1 limit keeps
    # trailing empty fields. No unescaping is applied to field contents.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @f.each_line do |line|
        yield line.chomp.split("\t", -1)
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'aws-sdk-s3'
2
+
3
+ module RedshiftConnector
4
# Settings and object access for one S3 bucket, plus a class-level registry
# of configured buckets.
class S3Bucket
  # S3 DeleteObjects accepts at most this many keys per request.
  MAX_DELETE_KEYS = 1000

  @buckets = {}
  @default = nil

  # Registers a bucket under +name+ (stored by its to_s). The remaining
  # keywords are passed to S3Bucket.new. The first registered bucket becomes
  # the default; default: true replaces the current default.
  def self.add(name, default: false, **params)
    instance = new(**params)
    @buckets[name.to_s] = instance
    if !@default or default
      @default = instance
    end
  end

  # Returns the default bucket; raises ArgumentError when none is registered.
  def self.default
    @default or raise ArgumentError, "no default S3 bucket configured"
  end

  # Returns the bucket registered under +name+; raises ArgumentError when
  # there is none.
  def self.get(name)
    @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
  end

  # bucket            :: S3 bucket name (required).
  # prefix            :: key prefix used by url.
  # region, access_key_id, secret_access_key :: passed to the S3 client when set.
  # iam_role          :: used by credential_string in preference to the access key.
  def initialize(region: nil, bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
    @region = region
    @name = bucket
    @prefix = prefix
    @access_key_id = access_key_id
    @secret_access_key = secret_access_key
    @iam_role = iam_role
  end

  attr_reader :name
  attr_reader :prefix

  # Returns "s3://<bucket name>/<prefix>/". Built from the configured name,
  # so it needs neither an S3 client nor a prior call to #bucket.
  def url
    "s3://#{@name}/#{@prefix}/"
  end

  # Memoized Aws::S3::Client. Settings that are nil are left out of the
  # constructor arguments (presumably so the SDK's own defaults apply).
  def client
    @client ||= begin
      args = { region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key }.reject {|k, v| v.nil? }
      Aws::S3::Client.new(**args)
    end
  end

  # Memoized Aws::S3::Bucket resource for this bucket name.
  def bucket
    @bucket ||= begin
      resource = Aws::S3::Resource.new(client: client)
      resource.bucket(@name)
    end
  end

  # Returns the S3 object handle for +key+.
  def object(key)
    bucket.object(key)
  end

  # Returns the collection of objects whose keys start with +prefix+.
  def objects(prefix:)
    bucket.objects(prefix: prefix)
  end

  # Deletes the objects with the given keys, issuing one request per
  # MAX_DELETE_KEYS keys. Returns the response of the last request, which for
  # up to MAX_DELETE_KEYS keys is the single response; an empty key list
  # sends no request and returns nil.
  def delete_objects(keys)
    responses = keys.each_slice(MAX_DELETE_KEYS).map do |batch|
      bucket.delete_objects(delete: {objects: batch.map {|k| {key: k} }})
    end
    responses.last
  end

  # Returns the credential text for Redshift S3 access: the IAM role when
  # one is configured, otherwise the access key pair.
  # Raises ArgumentError when neither is configured.
  def credential_string
    if @iam_role
      "aws_iam_role=#{@iam_role}"
    elsif @access_key_id
      "aws_access_key_id=#{@access_key_id};aws_secret_access_key=#{@secret_access_key}"
    else
      raise ArgumentError, "no credential given for Redshift S3 access"
    end
  end
end
76
+ end
@@ -0,0 +1,20 @@
1
+ require 'redshift_connector/data_file'
2
+
3
+ module RedshiftConnector
4
# Data file whose content lives in an S3 object.
class S3DataFile < DataFile
  # NOTE(review): `delegate` is not core Ruby — presumably ActiveSupport's
  # Module#delegate; confirm it is loaded wherever this class is used.
  delegate :presigned_url, to: :@object

  # object       :: S3 object handle; key, get and presigned_url are called on it.
  # reader_class :: passed on to DataFile#initialize.
  def initialize(object, reader_class:)
    super(reader_class: reader_class)
    @object = object
  end

  # Returns the key of the underlying S3 object.
  def key
    @object.key
  end

  # Fetches the object and returns the body of the response.
  def open
    @object.get.body
  end
end
20
+ end
@@ -0,0 +1,68 @@
1
+ require 'redshift_connector/s3_bucket'
2
+ require 'redshift_connector/s3_data_file'
3
+ require 'redshift_connector/reader'
4
+ require 'redshift_connector/logger'
5
+ require 'aws-sdk-s3'
6
+
7
+ module RedshiftConnector
8
# A set of S3 data files that share a key prefix within one bucket.
class S3DataFileBundle
  # Builds the bundle described by +params+ (txn_id, bucket, schema, table
  # and logger are read from it). A nil params.bucket selects the default
  # bucket. Raises ArgumentError when params.txn_id is missing.
  def self.for_params(params)
    raise ArgumentError, "cannot create bundle: missing txn_id" unless params.txn_id

    s3bucket = params.bucket ? S3Bucket.get(params.bucket) : S3Bucket.default
    for_table(
      bucket: s3bucket,
      schema: params.schema,
      table: params.table,
      txn_id: params.txn_id,
      logger: params.logger
    )
  end

  # Bundle for files under "<bucket prefix>/<prefix>", read with +format+.
  def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, logger: RedshiftConnector.logger)
    new(bucket, "#{bucket.prefix}/#{prefix}", format: format, logger: logger)
  end

  # Bundle for the export files of one table and transaction, read with the
  # :redshift_csv format.
  def self.for_table(bucket: S3Bucket.default, schema:, table:, txn_id:, logger: RedshiftConnector.logger)
    table_prefix = "#{bucket.prefix}/#{schema}_export/#{table}/#{txn_id}/#{table}.csv."
    new(bucket, table_prefix, format: :redshift_csv, logger: logger)
  end

  # bucket :: bucket object (name, objects, delete_objects and
  #           credential_string are called on it).
  # prefix :: full key prefix of the bundle's files.
  # format :: reader id resolved through Reader.get.
  def initialize(bucket, prefix, format: :csv, logger: RedshiftConnector.logger)
    @bucket = bucket
    @prefix = prefix
    @format = format
    @logger = logger
    @reader_class = Reader.get(format)
    @logger.info "reader: #{@reader_class}"
  end

  attr_reader :bucket, :prefix, :logger

  # Returns "s3://<bucket name>/<prefix>".
  def url
    "s3://#{@bucket.name}/#{@prefix}"
  end

  # Credential text of the underlying bucket.
  def credential_string
    @bucket.credential_string
  end

  # Returns one S3DataFile per object found under the bundle prefix.
  def data_files
    objects = @bucket.objects(prefix: @prefix)
    objects.map { |object| S3DataFile.new(object, reader_class: @reader_class) }
  end

  # Deletes every object under the directory part of the prefix (not only
  # the objects matching the full prefix). Does nothing when there are none.
  def clear
    directory = "#{File.dirname(@prefix)}/"
    keys = @bucket.objects(prefix: directory).map(&:key)
    return if keys.empty?

    logger.info "DELETE #{directory}*"
    @bucket.delete_objects(keys)
  end
end
68
+ end
@@ -0,0 +1,3 @@
1
module RedshiftConnector
  # Gem version. Frozen so the shared constant cannot be mutated in place.
  VERSION = '8.0.0'.freeze
end
@@ -0,0 +1,27 @@
1
+ require_relative 'lib/redshift_connector/version'
2
+
3
# Gem specification for redshift_connector.
Gem::Specification.new do |spec|
  spec.platform = Gem::Platform::RUBY
  spec.name = 'redshift_connector'
  spec.version = RedshiftConnector::VERSION
  spec.summary = 'Redshift bulk data connector'
  spec.description = 'redshift_connector is a bulk data connector for Rails (ActiveRecord).'
  spec.license = 'MIT'

  spec.authors = ['Minero Aoki']
  spec.email = 'aamine@loveruby.net'
  spec.homepage = 'https://github.com/bricolages/redshift_connector'

  # Package every file tracked by git except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.require_path = 'lib'

  spec.required_ruby_version = '>= 2.1.0'

  # Runtime dependencies.
  spec.add_dependency 'activerecord'
  spec.add_dependency 'activerecord-redshift'
  spec.add_dependency 'pg', '~> 0.18.0'
  spec.add_dependency 'activerecord-import'
  spec.add_dependency 'redshift_csv_file', '~> 1.0'
  spec.add_dependency 'aws-sdk-s3', '~> 1.0'

  # Development-only dependencies.
  spec.add_development_dependency 'test-unit'
  spec.add_development_dependency 'rake'
end
metadata ADDED
@@ -0,0 +1,190 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshift_connector
3
+ version: !ruby/object:Gem::Version
4
+ version: 8.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Minero Aoki
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-07-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activerecord
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: activerecord-redshift
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pg
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.18.0
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.18.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: activerecord-import
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: redshift_csv_file
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: aws-sdk-s3
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ description: redshift_connector is a bulk data connector for Rails (ActiveRecord).
126
+ email: aamine@loveruby.net
127
+ executables: []
128
+ extensions: []
129
+ extra_rdoc_files: []
130
+ files:
131
+ - ".gitignore"
132
+ - Gemfile
133
+ - LICENSE
134
+ - README.md
135
+ - RELEASE.md
136
+ - Rakefile
137
+ - lib/redshift_connector.rb
138
+ - lib/redshift_connector/active_record_data_source.rb
139
+ - lib/redshift_connector/active_record_exporter.rb
140
+ - lib/redshift_connector/connector.rb
141
+ - lib/redshift_connector/data_file.rb
142
+ - lib/redshift_connector/data_file_bundle_params.rb
143
+ - lib/redshift_connector/data_file_bundle_reader.rb
144
+ - lib/redshift_connector/exception.rb
145
+ - lib/redshift_connector/exporter.rb
146
+ - lib/redshift_connector/exporter_builder.rb
147
+ - lib/redshift_connector/immediate_exporter.rb
148
+ - lib/redshift_connector/importer.rb
149
+ - lib/redshift_connector/importer/activerecord-import.rb
150
+ - lib/redshift_connector/importer/insert_delta.rb
151
+ - lib/redshift_connector/importer/rebuild_rename.rb
152
+ - lib/redshift_connector/importer/rebuild_truncate.rb
153
+ - lib/redshift_connector/importer/upsert.rb
154
+ - lib/redshift_connector/logger.rb
155
+ - lib/redshift_connector/query.rb
156
+ - lib/redshift_connector/reader.rb
157
+ - lib/redshift_connector/reader/abstract.rb
158
+ - lib/redshift_connector/reader/csv.rb
159
+ - lib/redshift_connector/reader/exception.rb
160
+ - lib/redshift_connector/reader/redshift_csv.rb
161
+ - lib/redshift_connector/reader/tsv.rb
162
+ - lib/redshift_connector/s3_bucket.rb
163
+ - lib/redshift_connector/s3_data_file.rb
164
+ - lib/redshift_connector/s3_data_file_bundle.rb
165
+ - lib/redshift_connector/version.rb
166
+ - redshift_connector.gemspec
167
+ homepage: https://github.com/bricolages/redshift_connector
168
+ licenses:
169
+ - MIT
170
+ metadata: {}
171
+ post_install_message:
172
+ rdoc_options: []
173
+ require_paths:
174
+ - lib
175
+ required_ruby_version: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: 2.1.0
180
+ required_rubygems_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: '0'
185
+ requirements: []
186
+ rubygems_version: 3.0.3
187
+ signing_key:
188
+ specification_version: 4
189
+ summary: Redshift bulk data connector
190
+ test_files: []