redshift-connector 4.5.0 → 5.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cb3953798104bd92d6d856b8fd3b2d835ba9d54a
-  data.tar.gz: e2f51e74ec4969e1187ce35e457a2af357bda1bf
+  metadata.gz: b3d56a36e8dba44e03271c37ecedd89871e5f0d2
+  data.tar.gz: 3237a2be89f2b86c524d99ce0064773a3807b41e
 SHA512:
-  metadata.gz: cec111e3ca92096a3a0071b7a76a36b3c259213ab03aecb5f75ca1e3b2cc41fd1b4e140384a5a482944d7f02819cfe8da5010ae0bd8c9e9b0619001e09aa3542
-  data.tar.gz: ac5dbcc924a7ffb5ee7387d4cf3f56a78371c10a870d4b3b3215502d19f256daff0bcc970ed2d7d946b6ecd62f8305a4ab55ecd6adbd0e4b1896cbf42b671067
+  metadata.gz: 0de405e11f29b3efad2e9161436396cd436396ceb525c284d505a42abb3893da882b6d3c1767b88589736776bc26cc8442be350ed77e40ca26e5ed73a1da096c
+  data.tar.gz: 6be60fc0ac03acb59f990dfb605abba2f49711ec32441d11d366ffa322b54835b9625deb3224429ff88f7528f2433a777028283f49fc81132d04f6a79d417cab
data/README.md CHANGED
@@ -8,32 +8,3 @@ Add following block to your Gemfile and bundle.
 ```
 gem 'redshift-connector'
 ```
-Add config/initializers/redshift-connector.rb like following:
-```
-module RedshiftConnector
-  Exporter.default_data_source = Any_ActiveRecord_Class_Bound_To_Redshift
-
-  S3Bucket.add('primary', default: true,
-    region: 'YOUR_AWS_REGION_NAME',
-    bucket: 'YOUR_BUCKET_NAME',
-    prefix: 'YOUR_PREFIX',
-    iam_role: 'arn:aws:iam::XXXXXXXXXXXX:role/RedshiftReadOnly'
-    # For explicit S3 access, use following:
-    # aws_access_key_id: 'XXXXXXXXXXXXX',
-    # aws_secret_access_key: 'XXXXXXXXXXXXX'
-  )
-end
-```
-
-## Usage
-
-### Fetching rows
-
-```
-RedshiftConnector.foreach(schema: 'app_mst', table: 'shops', query: 'select id, name from app_mst.shops') do |id, name|
-  p [id, name]
-end
-```
-`schema` and `table` is the source table name (written in the query).
-This method executes Redshift UNLOAD statement with given query and
-unload result to the intermediate S3, then read contents.
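Note: the row-fetching entry point survives this release; only its README documentation was removed. For orientation, a minimal sketch of the call as it appears in the removed text above and in data/test/foreach.rb below (schema, table, and query values are placeholders):

```
require 'redshift-connector'

# Runs UNLOAD with the given query, stages the result on S3,
# then streams it back row by row.
RedshiftConnector.foreach(schema: 'app_mst', table: 'shops',
                          query: 'select id, name from app_mst.shops') do |id, name|
  p [id, name]
end
```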
@@ -116,18 +116,26 @@ module RedshiftConnector
     not ENV['IMPORT_ONLY']
   end
 
+  def export_forced?
+    !! (ENV['EXPORT_ONLY'] or ENV['FORCE'])
+  end
+
   def import_enabled?
     not ENV['EXPORT_ONLY']
   end
 
   def execute
-    export if export_enabled?
+    export(forced: export_forced?) if export_enabled?
     import if import_enabled?
   end
 
-  def export
+  def export(forced: false)
     @logger.info "==== export task =================================================="
-    @exporter.execute
+    if not forced and @exporter.completed?
+      @logger.info "export task is already executed; skip"
+    else
+      @exporter.execute
+    end
   end
 
   def import
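The new export_forced? predicate folds two environment switches into the skip decision. A condensed sketch of the resulting behavior (the env-hash parameter is illustrative; the real method reads ENV directly):

```
def export_forced?(env = ENV)
  !!(env['EXPORT_ONLY'] || env['FORCE'])
end

export_forced?('FORCE' => '1')        # => true  (re-export even if the completion flag exists)
export_forced?('EXPORT_ONLY' => '1')  # => true  (import is skipped separately by import_enabled?)
export_forced?({})                    # => false (export is skipped when @exporter.completed?)
```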
@@ -41,7 +41,6 @@ module RedshiftConnector
       query:,
       txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
       filter: nil,
-      enable_sort: false,
       logger: RedshiftConnector.logger,
       quiet: false
   )
@@ -57,7 +56,7 @@ module RedshiftConnector
   )
   exporter = Exporter.new(
     ds: ds,
-    query: UnloadQuery.wrap(query: query, bundle: bundle, enable_sort: enable_sort),
+    query: UnloadQuery.wrap(query: query, bundle: bundle),
     bundle: bundle,
     logger: logger
   )
@@ -75,6 +74,19 @@ module RedshiftConnector
   attr_reader :bundle
   attr_reader :logger
 
+  def completed?
+    @bundle.bucket.object(flag_object_key).exists?
+  end
+
+  def create_flag_object
+    @logger.info "TOUCH #{flag_object_key}"
+    @bundle.bucket.object(flag_object_key).put(body: "OK")
+  end
+
+  def flag_object_key
+    "#{File.dirname(@bundle.prefix)}/00completed"
+  end
+
   def execute
     @bundle.clear
     @logger.info "EXPORT #{@query.description} -> #{@bundle.url}*"
@@ -83,6 +95,7 @@ module RedshiftConnector
       @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
      conn.execute(batch_job_label + stmt)
     end
+    create_flag_object
   end
 
   def batch_job_label
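flag_object_key derives the marker key from the directory one level above the bundle prefix, so one flag covers everything under that parent. A quick illustration of the key arithmetic (the prefix value is hypothetical):

```
prefix = 'my-prefix/shops/20170215_130000_4242'   # hypothetical @bundle.prefix
File.dirname(prefix) + '/00completed'
# => "my-prefix/shops/00completed"
```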
@@ -44,14 +44,13 @@ module RedshiftConnector
   end
 
   class UnloadQuery
-    def UnloadQuery.wrap(query:, bundle:, enable_sort: false)
-      new(query: ArbitraryQuery.new(query), bundle: bundle, enable_sort: enable_sort)
+    def UnloadQuery.wrap(query:, bundle:)
+      new(query: ArbitraryQuery.new(query), bundle: bundle)
     end
 
-    def initialize(query:, bundle:, enable_sort: false)
+    def initialize(query:, bundle:)
       @query = query
       @bundle = bundle
-      @enable_sort = enable_sort
     end
 
     def table_spec
@@ -69,7 +68,6 @@ module RedshiftConnector
       credentials '#{@bundle.credential_string}'
       gzip
       allowoverwrite
-      parallel #{@enable_sort ? 'off' : 'on'}
       delimiter ',' escape addquotes
     EndSQL
   end
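With enable_sort gone, the generated UNLOAD no longer toggles PARALLEL and always uses Redshift's default parallel output. Only the option lines appear in the hunk above; the unload/to/credentials values below are placeholders sketching the overall statement shape:

```
unload_sql = <<-EndSQL
  unload ('select id, name from app_mst.shops')
  to 's3://YOUR_BUCKET/YOUR_PREFIX/'
  credentials 'aws_iam_role=arn:aws:iam::XXXXXXXXXXXX:role/RedshiftReadOnly'
  gzip
  allowoverwrite
  delimiter ',' escape addquotes
EndSQL
```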
@@ -0,0 +1,18 @@
+# create module
+module RedshiftConnector
+  module Reader
+  end
+end
+
+require 'redshift-connector/reader/redshift_csv'
+require 'redshift-connector/reader/csv'
+require 'redshift-connector/reader/tsv'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  module Reader
+    def Reader.get(id)
+      Abstract.get_reader_class(id)
+    end
+  end
+end
@@ -0,0 +1,18 @@
+module RedshiftConnector
+  class Reader::Abstract
+    READER_CLASSES = {}   # {Symbol => Class}
+
+    def self.declare_reader(id)
+      READER_CLASSES[id.to_sym] = self
+    end
+
+    def self.get_reader_class(id)
+      READER_CLASSES[id.to_sym] or
+          raise ArgumentError, "unknown data file reader type: #{id.inspect}"
+    end
+  end
+
+  def self.get_reader_class(id)
+    Reader::Abstract.get_reader_class(id)
+  end
+end
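declare_reader registers each subclass under a format symbol at load time, and Reader.get resolves it back. A usage sketch, assuming the gem's lib directory is on the load path:

```
require 'redshift-connector/reader'

RedshiftConnector::Reader.get(:csv)          # => RedshiftConnector::Reader::CSV
RedshiftConnector::Reader.get(:tsv)          # => RedshiftConnector::Reader::TSV
RedshiftConnector::Reader.get(:redshift_csv) # => RedshiftConnector::Reader::RedshiftCSV
RedshiftConnector::Reader.get(:xml)          # raises ArgumentError: unknown data file reader type
```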
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses (standard) CSV files.
+  # For UNLOAD-generated CSV, use RedshiftCSV class.
+  class Reader::CSV < Reader::Abstract
+    declare_reader :csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      csv = CSV.new(@f)
+      csv.each(&block)
+    end
+  end
+end
@@ -0,0 +1,3 @@
+module RedshiftConnector
+  class Reader::MalformedCSVException < StandardError; end
+end
@@ -0,0 +1,54 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+
+module RedshiftConnector
+  # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
+  # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
+  class Reader::RedshiftCSV < Reader::Abstract
+    declare_reader :redshift_csv
+
+    def self.data_object?(obj)
+      /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    # f :: IO
+    def initialize(f)
+      @f = f
+    end
+
+    def each
+      # We can use simple #each_line to read single row
+      # because line terminators are always escaped by UNLOAD.
+      @f.each_line do |line|
+        yield parse_row(line, @f.lineno)
+      end
+    end
+
+    def parse_row(line, lineno = nil)
+      row = []
+      s = StringScanner.new(line)
+      s.skip(/\s+/)
+      until s.eos?
+        col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
+        row.push unescape_column(col)
+        s.skip(/\s*/)   # skip line terminator on line ends
+        s.skip(/,\s*/)
+      end
+      row
+    end
+
+    UNESCAPE_MAP = {
+      '\\"' => '"',
+      "\\'" => "'",
+      '\\,' => ',',
+      '\\r' => "\r",
+      '\\n' => "\n",
+      '\\\\' => '\\'
+    }
+
+    def unescape_column(col)
+      charmap = UNESCAPE_MAP
+      col[1...-1].gsub(/\\./) {|s| charmap[s] }
+    end
+  end
+end
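A quick round trip through parse_row, assuming the gem's lib directory is on the load path and strscan is loaded (the new file uses StringScanner without requiring it explicitly):

```
require 'strscan'
require 'stringio'
require 'redshift-connector/reader/redshift_csv'

# One UNLOAD-style line: quoted fields, backslash-escaped comma.
line = %Q("1","ab\\,c"\n)
reader = RedshiftConnector::Reader::RedshiftCSV.new(StringIO.new(line))
reader.each {|row| p row }   # => ["1", "ab,c"]
```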
@@ -0,0 +1,24 @@
+require 'redshift-connector/reader/abstract'
+require 'redshift-connector/reader/exception'
+require 'csv'
+
+module RedshiftConnector
+  # Parses TSV (Tab Separated Format) files.
+  class Reader::TSV < Reader::Abstract
+    declare_reader :tsv
+
+    def self.data_object?(obj)
+      /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
+    end
+
+    def initialize(f)
+      @f = f
+    end
+
+    def each(&block)
+      @f.each_line do |line|
+        yield line.chomp.split("\t", -1)
+      end
+    end
+  end
+end
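The -1 limit passed to split is what keeps trailing empty columns, which a bare split would silently drop:

```
"a\t\tb\t\t".split("\t")      # => ["a", "", "b"]
"a\t\tb\t\t".split("\t", -1)  # => ["a", "", "b", "", ""]
```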
@@ -21,8 +21,7 @@ module RedshiftConnector
       @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
     end
 
-    def initialize(region: nil, bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
-      @region = region
+    def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
       @name = bucket
       @prefix = prefix
       @access_key_id = access_key_id
@@ -38,10 +37,7 @@ module RedshiftConnector
     end
 
     def client
-      @client ||= begin
-        args = { region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key }.reject {|k, v| v.nil? }
-        Aws::S3::Client.new(**args)
-      end
+      @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
     end
 
     def bucket
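Dropping region: means the client now presumably relies on aws-sdk's own region resolution (Aws.config, ENV['AWS_REGION'], the shared config file); note the nil-rejection from 4.5.0 is also gone, so both key options are passed even when unset. A sketch of the fallback, with hypothetical values:

```
require 'aws-sdk'

ENV['AWS_REGION'] = 'ap-northeast-1'   # hypothetical
client = Aws::S3::Client.new(access_key_id: 'AKIA...', secret_access_key: 'secret')
client.config.region   # => "ap-northeast-1"
```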
@@ -1,7 +1,7 @@
-require 'redshift-connector/data_file'
+require 'zlib'
 
 module RedshiftConnector
-  class S3DataFile < AbstractDataFile
+  class S3DataFile
     def initialize(object, reader_class:)
       @object = object
       @reader_class = reader_class
@@ -11,10 +11,24 @@ module RedshiftConnector
       @object.key
     end
 
-    def content
-      @object.get.body
+    def each_row(&block)
+      response = @object.get
+      f = if gzipped_object?
+        Zlib::GzipReader.new(response.body)
+      else
+        response.body
+      end
+      @reader_class.new(f).each(&block)
+    ensure
+      response.body.close if response
     end
 
-    delegate :presigned_url, to: :@object
+    def data_object?
+      @reader_class.data_object?(@object)
+    end
+
+    def gzipped_object?
+      File.extname(@object.key) == '.gz'
+    end
   end
 end
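Decompression is decided purely by the object key's extension. The check behaves like this (keys are hypothetical):

```
File.extname('shops/0000_part_00.csv.gz')  # => ".gz"   -> wrapped in Zlib::GzipReader
File.extname('shops/0000_part_00.csv')     # => ".csv"  -> read as-is
```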
@@ -1,11 +1,11 @@
 require 'redshift-connector/s3_bucket'
 require 'redshift-connector/s3_data_file'
+require 'redshift-connector/reader'
 require 'redshift-connector/logger'
-require 'redshift-connector/data_file'
 require 'aws-sdk'
 
 module RedshiftConnector
-  class S3DataFileBundle < AbstractDataFileBundle
+  class S3DataFileBundle
     def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
       real_prefix = "#{bucket.prefix}/#{prefix}"
       new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
@@ -68,9 +68,25 @@ module RedshiftConnector
     end
     private :do_each_batch
 
-    def data_files
+    def each_row(&block)
+      each_object do |obj|
+        obj.each_row(&block)
+      end
+    end
+
+    alias each each_row
+
+    def each_object(&block)
+      all_data_objects.each do |obj|
+        @logger.info "processing s3 object: #{obj.key}"
+        yield obj
+      end
+    end
+
+    def all_data_objects
       @bucket.objects(prefix: @prefix)
         .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
+        .select {|obj| obj.data_object? }
     end
 
     def clear
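Putting the pieces together: a bundle lists matching S3 objects, filters them through data_object?, and streams decoded rows. A usage sketch with placeholder bucket and prefix, assuming a default S3Bucket was configured:

```
bundle = RedshiftConnector::S3DataFileBundle.for_prefix(
  bucket: RedshiftConnector::S3Bucket.default,
  prefix: 'shops/20170215',        # placeholder
  format: :redshift_csv
)
bundle.each do |row|               # alias of each_row
  p row
end
```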
@@ -1,3 +1,3 @@
 module RedshiftConnector
-  VERSION = '4.5.0'
+  VERSION = '5.0.0'
 end
data/test/foreach.rb CHANGED
@@ -1,5 +1,5 @@
 require_relative 'helper'
 
-RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select id from tabemiru.items where id < 50 order by 1', enable_sort: true) do |row|
+RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select * from tabemiru.items where id < 10') do |row|
   p row
 end
@@ -1,5 +1,5 @@
 require 'test/unit'
-require 'redshift-connector/data_file'
+require 'redshift-connector/reader'
 
 module RedshiftConnector
   module Reader
data/test/test_reader.rb CHANGED
@@ -1,4 +1,5 @@
 require 'test/unit'
+require 'redshift-connector/reader'
 
 module RedshiftConnector
   class TestReader < Test::Unit::TestCase
metadata CHANGED
@@ -1,31 +1,31 @@
 --- !ruby/object:Gem::Specification
 name: redshift-connector
 version: !ruby/object:Gem::Version
-  version: 4.5.0
+  version: 5.0.0
 platform: ruby
 authors:
 - Minero Aoki
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-05-12 00:00:00.000000000 Z
+date: 2017-02-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activerecord
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '5'
+        version: '5.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "<"
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '5'
+        version: '5.0'
 - !ruby/object:Gem::Dependency
-  name: activerecord4-redshift-adapter
+  name: activerecord5-redshift-adapter
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
@@ -38,20 +38,6 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
-- !ruby/object:Gem::Dependency
-  name: redshift-connector-data_file
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.0
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: 1.0.0
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
@@ -154,6 +140,12 @@ files:
 - lib/redshift-connector/importer/upsert.rb
 - lib/redshift-connector/logger.rb
 - lib/redshift-connector/query.rb
+- lib/redshift-connector/reader.rb
+- lib/redshift-connector/reader/abstract.rb
+- lib/redshift-connector/reader/csv.rb
+- lib/redshift-connector/reader/exception.rb
+- lib/redshift-connector/reader/redshift_csv.rb
+- lib/redshift-connector/reader/tsv.rb
 - lib/redshift-connector/s3_bucket.rb
 - lib/redshift-connector/s3_data_file.rb
 - lib/redshift-connector/s3_data_file_bundle.rb