redshift-connector 4.5.0 → 5.0.0
- checksums.yaml +4 -4
- data/README.md +0 -29
- data/lib/redshift-connector/connector.rb +11 -3
- data/lib/redshift-connector/exporter.rb +15 -2
- data/lib/redshift-connector/query.rb +3 -5
- data/lib/redshift-connector/reader.rb +18 -0
- data/lib/redshift-connector/reader/abstract.rb +18 -0
- data/lib/redshift-connector/reader/csv.rb +24 -0
- data/lib/redshift-connector/reader/exception.rb +3 -0
- data/lib/redshift-connector/reader/redshift_csv.rb +54 -0
- data/lib/redshift-connector/reader/tsv.rb +24 -0
- data/lib/redshift-connector/s3_bucket.rb +2 -6
- data/lib/redshift-connector/s3_data_file.rb +19 -5
- data/lib/redshift-connector/s3_data_file_bundle.rb +19 -3
- data/lib/redshift-connector/version.rb +1 -1
- data/test/foreach.rb +1 -1
- data/test/reader/test_redshift_csv.rb +1 -1
- data/test/test_reader.rb +1 -0
- metadata +13 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b3d56a36e8dba44e03271c37ecedd89871e5f0d2
+  data.tar.gz: 3237a2be89f2b86c524d99ce0064773a3807b41e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0de405e11f29b3efad2e9161436396cd436396ceb525c284d505a42abb3893da882b6d3c1767b88589736776bc26cc8442be350ed77e40ca26e5ed73a1da096c
+  data.tar.gz: 6be60fc0ac03acb59f990dfb605abba2f49711ec32441d11d366ffa322b54835b9625deb3224429ff88f7528f2433a777028283f49fc81132d04f6a79d417cab
data/README.md
CHANGED
@@ -8,32 +8,3 @@ Add following block to your Gemfile and bundle.
 ```
 gem 'redshift-connector'
 ```
-Add config/initializers/redshift-connector.rb like following:
-```
-module RedshiftConnector
-  Exporter.default_data_source = Any_ActiveRecord_Class_Bound_To_Redshift
-
-  S3Bucket.add('primary', default: true,
-    region: 'YOUR_AWS_REGION_NAME',
-    bucket: 'YOUR_BUCKET_NAME',
-    prefix: 'YOUR_PREFIX',
-    iam_role: 'arn:aws:iam::XXXXXXXXXXXX:role/RedshiftReadOnly'
-    # For explicit S3 access, use following:
-    # aws_access_key_id: 'XXXXXXXXXXXXX',
-    # aws_secret_access_key: 'XXXXXXXXXXXXX'
-  )
-end
-```
-
-## Usage
-
-### Fetching rows
-
-```
-RedshiftConnector.foreach(schema: 'app_mst', table: 'shops', query: 'select id, name from app_mst.shops') do |id, name|
-  p [id, name]
-end
-```
-`schema` and `table` is the source table name (written in the query).
-This method executes Redshift UNLOAD statement with given query and
-unload result to the intermediate S3, then read contents.
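The `RedshiftConnector.foreach` entry point documented in the removed README section is still exercised in 5.0.0 (see data/test/foreach.rb below). A minimal usage sketch, assuming an S3 bucket and a Redshift-bound ActiveRecord class have been configured as in the removed initializer; the schema, table and query values are placeholders:

```ruby
# Minimal sketch of the foreach flow: UNLOAD the query result to the
# configured intermediate S3 prefix, then read the rows back.
# Assumes the initializer-style configuration shown in the removed README.
require 'redshift-connector'

RedshiftConnector.foreach(
  schema: 'app_mst',
  table:  'shops',
  query:  'select id, name from app_mst.shops'
) do |id, name|
  p [id, name]   # each row is yielded as parsed column values
end
```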
data/lib/redshift-connector/connector.rb
CHANGED
@@ -116,18 +116,26 @@ module RedshiftConnector
       not ENV['IMPORT_ONLY']
     end

+    def export_forced?
+      !! (ENV['EXPORT_ONLY'] or ENV['FORCE'])
+    end
+
     def import_enabled?
       not ENV['EXPORT_ONLY']
     end

     def execute
-      export if export_enabled?
+      export(forced: export_forced?) if export_enabled?
       import if import_enabled?
     end

-    def export
+    def export(forced: false)
       @logger.info "==== export task =================================================="
-      @exporter.
+      if not forced and @exporter.completed?
+        @logger.info "export task is already executed; skip"
+      else
+        @exporter.execute
+      end
     end

     def import
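The export step is now gated on three environment variables. A standalone sketch of that gating, with the surrounding task class elided; the method bodies mirror the hunk above, while the summary comments are an interpretation:

```ruby
# Standalone sketch of the env-variable gating added above.
# The surrounding connector/task class and the exporter itself are elided.
def export_enabled?
  not ENV['IMPORT_ONLY']
end

def import_enabled?
  not ENV['EXPORT_ONLY']
end

def export_forced?
  !!(ENV['EXPORT_ONLY'] or ENV['FORCE'])
end

# IMPORT_ONLY=1    -> skip export, run import only
# EXPORT_ONLY=1    -> run export only, forced even if already completed
# FORCE=1          -> force a fresh export, then import
# (nothing set)    -> export unless the exporter reports completed?, then import
```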
data/lib/redshift-connector/exporter.rb
CHANGED
@@ -41,7 +41,6 @@ module RedshiftConnector
       query:,
       txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
       filter: nil,
-      enable_sort: false,
       logger: RedshiftConnector.logger,
       quiet: false
     )
@@ -57,7 +56,7 @@ module RedshiftConnector
      )
      exporter = Exporter.new(
        ds: ds,
-       query: UnloadQuery.wrap(query: query, bundle: bundle
+       query: UnloadQuery.wrap(query: query, bundle: bundle),
        bundle: bundle,
        logger: logger
      )
@@ -75,6 +74,19 @@ module RedshiftConnector
     attr_reader :bundle
     attr_reader :logger

+    def completed?
+      @bundle.bucket.object(flag_object_key).exists?
+    end
+
+    def create_flag_object
+      @logger.info "TOUCH #{flag_object_key}"
+      @bundle.bucket.object(flag_object_key).put(body: "OK")
+    end
+
+    def flag_object_key
+      "#{File.dirname(@bundle.prefix)}/00completed"
+    end
+
     def execute
       @bundle.clear
       @logger.info "EXPORT #{@query.description} -> #{@bundle.url}*"
@@ -83,6 +95,7 @@ module RedshiftConnector
         @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
         conn.execute(batch_job_label + stmt)
       end
+      create_flag_object
     end

     def batch_job_label
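The exporter now marks a finished UNLOAD by writing a `00completed` flag object next to the bundle prefix, and `completed?` checks for it so a re-run can be skipped unless forced. A hedged sketch of inspecting that flag directly with the aws-sdk resource API; the bucket name, region and prefix below are placeholders:

```ruby
# Hedged sketch: checking the completion flag the exporter writes.
# Bucket name, region and prefix are placeholders.
require 'aws-sdk'

bucket = Aws::S3::Resource.new(region: 'ap-northeast-1').bucket('your-bucket-name')
prefix = 'redshift-connector/app_mst.shops/20170215_120000_1234'

# Same key construction as Exporter#flag_object_key in the hunk above.
flag = bucket.object("#{File.dirname(prefix)}/00completed")
if flag.exists?
  puts "flag present: the export for this prefix would be skipped (unless forced)"
else
  puts "no flag: the export would run, then touch #{flag.key}"
end
```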
data/lib/redshift-connector/query.rb
CHANGED
@@ -44,14 +44,13 @@ module RedshiftConnector
   end

   class UnloadQuery
-    def UnloadQuery.wrap(query:, bundle
-      new(query: ArbitraryQuery.new(query), bundle: bundle
+    def UnloadQuery.wrap(query:, bundle:)
+      new(query: ArbitraryQuery.new(query), bundle: bundle)
     end

-    def initialize(query:, bundle
+    def initialize(query:, bundle:)
       @query = query
       @bundle = bundle
-      @enable_sort = enable_sort
     end

     def table_spec
@@ -69,7 +68,6 @@ module RedshiftConnector
         credentials '#{@bundle.credential_string}'
         gzip
         allowoverwrite
-        parallel #{@enable_sort ? 'off' : 'on'}
        delimiter ',' escape addquotes
      EndSQL
    end
data/lib/redshift-connector/reader.rb
ADDED
@@ -0,0 +1,18 @@
+# create module
+module RedshiftConnector
+  module Reader
+  end
+end
+
+require 'redshift-connector/reader/redshift_csv'
+require 'redshift-connector/reader/csv'
+require 'redshift-connector/reader/tsv'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  module Reader
+    def Reader.get(id)
+      Abstract.get_reader_class(id)
+    end
+  end
+end
data/lib/redshift-connector/reader/abstract.rb
ADDED
@@ -0,0 +1,18 @@
+module RedshiftConnector
+  class Reader::Abstract
+    READER_CLASSES = {} # {Symbol => Class}
+
+    def self.declare_reader(id)
+      READER_CLASSES[id.to_sym] = self
+    end
+
+    def self.get_reader_class(id)
+      READER_CLASSES[id.to_sym] or
+        raise ArgumentError, "unknown data file reader type: #{id.inspect}"
+    end
+  end
+
+  def self.get_reader_class(id)
+    Reader::Abstract.get_reader_class(id)
+  end
+end
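Concrete readers register themselves by symbol through `declare_reader`, and `Reader.get` resolves a symbol back to the registered class. A sketch of how a hypothetical custom reader could plug into this registry; `Reader::JSONLines` below is illustrative and not part of the gem, everything else mirrors the added files:

```ruby
# Illustrative only: a hypothetical reader plugging into the registry above.
require 'json'
require 'redshift-connector/reader'

module RedshiftConnector
  class Reader::JSONLines < Reader::Abstract
    declare_reader :jsonl                      # registers under the :jsonl symbol

    def self.data_object?(obj)
      /\.jsonl(?:\.|\z)/ =~ File.basename(obj.key)
    end

    def initialize(f)
      @f = f
    end

    def each
      @f.each_line {|line| yield JSON.parse(line) }
    end
  end
end

RedshiftConnector::Reader.get(:jsonl)         # => RedshiftConnector::Reader::JSONLines
RedshiftConnector::Reader.get(:redshift_csv)  # => RedshiftConnector::Reader::RedshiftCSV
```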
data/lib/redshift-connector/reader/csv.rb
ADDED
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses (standard) CSV files.
+  # For UNLOAD-generated CSV, use RedshiftCSV class.
+  class Reader::CSV < Reader::Abstract
+    declare_reader :csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      csv = CSV.new(@f)
+      csv.each(&block)
+    end
+  end
+end
data/lib/redshift-connector/reader/redshift_csv.rb
ADDED
@@ -0,0 +1,54 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
+  # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
+  class Reader::RedshiftCSV < Reader::Abstract
+    declare_reader :redshift_csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    # f :: IO
+    def initialize(f)
+      @f = f
+    end
+
+    def each
+      # We can use simple #each_line to read single row
+      # because line terminators are always escaped by UNLOAD.
+      @f.each_line do |line|
+        yield parse_row(line, @f.lineno)
+      end
+    end
+
+    def parse_row(line, lineno = nil)
+      row = []
+      s = StringScanner.new(line)
+      s.skip(/\s+/)
+      until s.eos?
+        col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
+        row.push unescape_column(col)
+        s.skip(/\s*/)   # skip line terminator on line ends
+        s.skip(/,\s*/)
+      end
+      row
+    end
+
+    UNESCAPE_MAP = {
+      '\\"' => '"',
+      "\\'" => "'",
+      '\\,' => ',',
+      '\\r' => "\r",
+      '\\n' => "\n",
+      '\\\\' => '\\'
+    }
+
+    def unescape_column(col)
+      charmap = UNESCAPE_MAP
+      col[1...-1].gsub(/\\./) {|s| charmap[s] }
+    end
+  end
+end
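As a worked example of the unescaping rules above, a single UNLOAD-style line (ADDQUOTES ESCAPE) parses as follows. The input values are illustrative; `strscan` is required explicitly here because `parse_row` relies on `StringScanner` being loaded:

```ruby
# Illustrative parse of one UNLOAD-generated line using the class added above.
require 'strscan'
require 'redshift-connector/reader/redshift_csv'

# No IO object is needed just to exercise parse_row on a single line.
reader = RedshiftConnector::Reader::RedshiftCSV.new(nil)
reader.parse_row(%q{"1","ab\,c","line1\nline2"} + "\n")
# => ["1", "ab,c", "line1\nline2"]
#    escaped comma and newline are restored by UNESCAPE_MAP, quotes are stripped
```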
data/lib/redshift-connector/reader/tsv.rb
ADDED
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses TSV (Tab Separated Format) files.
+  class Reader::TSV < Reader::Abstract
+    declare_reader :tsv
+
+    def self.data_object?(obj)
+      /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      @f.each_line do |line|
+        yield line.chomp.split("\t", -1)
+      end
+    end
+  end
+end
data/lib/redshift-connector/s3_bucket.rb
CHANGED
@@ -21,8 +21,7 @@ module RedshiftConnector
       @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
     end

-    def initialize(
-      @region = region
+    def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
       @name = bucket
       @prefix = prefix
       @access_key_id = access_key_id
@@ -38,10 +37,7 @@ module RedshiftConnector
     end

     def client
-      @client ||=
-        args = { region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key }.reject {|k, v| v.nil? }
-        Aws::S3::Client.new(**args)
-      end
+      @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
     end

     def bucket
data/lib/redshift-connector/s3_data_file.rb
CHANGED
@@ -1,7 +1,7 @@
-require '
+require 'zlib'

 module RedshiftConnector
-  class S3DataFile
+  class S3DataFile
     def initialize(object, reader_class:)
       @object = object
       @reader_class = reader_class
@@ -11,10 +11,24 @@ module RedshiftConnector
       @object.key
     end

-    def
-      @object.get
+    def each_row(&block)
+      response = @object.get
+      f = if gzipped_object?
+            Zlib::GzipReader.new(response.body)
+          else
+            response.body
+          end
+      @reader_class.new(f).each(&block)
+    ensure
+      response.body.close if response
     end

-
+    def data_object?
+      @reader_class.data_object?(@object)
+    end
+
+    def gzipped_object?
+      File.extname(@object.key) == '.gz'
+    end
   end
 end
data/lib/redshift-connector/s3_data_file_bundle.rb
CHANGED
@@ -1,11 +1,11 @@
 require 'redshift-connector/s3_bucket'
 require 'redshift-connector/s3_data_file'
+require 'redshift-connector/reader'
 require 'redshift-connector/logger'
-require 'redshift-connector/data_file'
 require 'aws-sdk'

 module RedshiftConnector
-  class S3DataFileBundle
+  class S3DataFileBundle
     def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
       real_prefix = "#{bucket.prefix}/#{prefix}"
       new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
@@ -68,9 +68,25 @@ module RedshiftConnector
     end
     private :do_each_batch

-    def
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
       @bucket.objects(prefix: @prefix)
         .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
+        .select {|obj| obj.data_object? }
     end

     def clear
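Taken together, the bundle now keeps only the S3 objects its configured reader recognizes (`data_object?`) and streams their rows through `S3DataFile#each_row`, gunzipping `.gz` objects on the fly. A hedged sketch of iterating a bundle directly; the prefix and format values are placeholders and the default `S3Bucket` must already be configured:

```ruby
# Hedged sketch: iterating a bundle directly with the new each_row API.
# Prefix and format are placeholders; S3Bucket.default must be configured.
require 'redshift-connector'

bundle = RedshiftConnector::S3DataFileBundle.for_prefix(
  prefix: 'app_mst.shops/latest',
  format: :redshift_csv
)
bundle.each_row do |row|
  p row   # one Array of column strings per row, across all matching objects
end
```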
data/test/foreach.rb
CHANGED
@@ -1,5 +1,5 @@
 require_relative 'helper'

-RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select
+RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select * from tabemiru.items where id < 10') do |row|
   p row
 end
data/test/test_reader.rb
CHANGED
metadata
CHANGED
@@ -1,31 +1,31 @@
 --- !ruby/object:Gem::Specification
 name: redshift-connector
 version: !ruby/object:Gem::Version
-  version:
+  version: 5.0.0
 platform: ruby
 authors:
 - Minero Aoki
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-
+date: 2017-02-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activerecord
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '5'
+        version: '5.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '5'
+        version: '5.0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: activerecord5-redshift-adapter
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -38,20 +38,6 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-- !ruby/object:Gem::Dependency
-  name: redshift-connector-data_file
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.0
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.0
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
@@ -154,6 +140,12 @@ files:
 - lib/redshift-connector/importer/upsert.rb
 - lib/redshift-connector/logger.rb
 - lib/redshift-connector/query.rb
+- lib/redshift-connector/reader.rb
+- lib/redshift-connector/reader/abstract.rb
+- lib/redshift-connector/reader/csv.rb
+- lib/redshift-connector/reader/exception.rb
+- lib/redshift-connector/reader/redshift_csv.rb
+- lib/redshift-connector/reader/tsv.rb
 - lib/redshift-connector/s3_bucket.rb
 - lib/redshift-connector/s3_data_file.rb
 - lib/redshift-connector/s3_data_file_bundle.rb
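Per the metadata changes above, 5.0.0 drops the separate redshift-connector-data_file dependency (the reader code now ships in-tree under lib/redshift-connector/reader) and constrains activerecord to ~> 5.0 alongside activerecord5-redshift-adapter and pg. A minimal Gemfile sketch for an application consuming 5.0.0; only the gem itself needs to be listed, the rest resolve as runtime dependencies:

```ruby
# Sketch of a Gemfile for an application using redshift-connector 5.0.0.
# activerecord (~> 5.0), activerecord5-redshift-adapter and pg are runtime
# dependencies of the gem and are resolved automatically by Bundler.
source 'https://rubygems.org'

gem 'redshift-connector', '5.0.0'
```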