redshift-connector 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cb3953798104bd92d6d856b8fd3b2d835ba9d54a
4
- data.tar.gz: e2f51e74ec4969e1187ce35e457a2af357bda1bf
3
+ metadata.gz: b3d56a36e8dba44e03271c37ecedd89871e5f0d2
4
+ data.tar.gz: 3237a2be89f2b86c524d99ce0064773a3807b41e
5
5
  SHA512:
6
- metadata.gz: cec111e3ca92096a3a0071b7a76a36b3c259213ab03aecb5f75ca1e3b2cc41fd1b4e140384a5a482944d7f02819cfe8da5010ae0bd8c9e9b0619001e09aa3542
7
- data.tar.gz: ac5dbcc924a7ffb5ee7387d4cf3f56a78371c10a870d4b3b3215502d19f256daff0bcc970ed2d7d946b6ecd62f8305a4ab55ecd6adbd0e4b1896cbf42b671067
6
+ metadata.gz: 0de405e11f29b3efad2e9161436396cd436396ceb525c284d505a42abb3893da882b6d3c1767b88589736776bc26cc8442be350ed77e40ca26e5ed73a1da096c
7
+ data.tar.gz: 6be60fc0ac03acb59f990dfb605abba2f49711ec32441d11d366ffa322b54835b9625deb3224429ff88f7528f2433a777028283f49fc81132d04f6a79d417cab
data/README.md CHANGED
@@ -8,32 +8,3 @@ Add following block to your Gemfile and bundle.
8
8
  ```
9
9
  gem 'redshift-connector'
10
10
  ```
11
- Add config/initializers/redshift-connector.rb like following:
12
- ```
13
- module RedshiftConnector
14
- Exporter.default_data_source = Any_ActiveRecord_Class_Bound_To_Redshift
15
-
16
- S3Bucket.add('primary', default: true,
17
- region: 'YOUR_AWS_REGION_NAME',
18
- bucket: 'YOUR_BUCKET_NAME',
19
- prefix: 'YOUR_PREFIX',
20
- iam_role: 'arn:aws:iam::XXXXXXXXXXXX:role/RedshiftReadOnly'
21
- # For explicit S3 access, use following:
22
- # aws_access_key_id: 'XXXXXXXXXXXXX',
23
- # aws_secret_access_key: 'XXXXXXXXXXXXX'
24
- )
25
- end
26
- ```
27
-
28
- ## Usage
29
-
30
- ### Fetching rows
31
-
32
- ```
33
- RedshiftConnector.foreach(schema: 'app_mst', table: 'shops', query: 'select id, name from app_mst.shops') do |id, name|
34
- p [id, name]
35
- end
36
- ```
37
- `schema` and `table` are the source table names (written in the query).
38
- This method executes the Redshift UNLOAD statement with the given query and
39
- unloads the result to the intermediate S3 location, then reads the contents.
@@ -116,18 +116,26 @@ module RedshiftConnector
116
116
  not ENV['IMPORT_ONLY']
117
117
  end
118
118
 
119
+ def export_forced?
120
+ !! (ENV['EXPORT_ONLY'] or ENV['FORCE'])
121
+ end
122
+
119
123
  def import_enabled?
120
124
  not ENV['EXPORT_ONLY']
121
125
  end
122
126
 
123
127
  def execute
124
- export if export_enabled?
128
+ export(forced: export_forced?) if export_enabled?
125
129
  import if import_enabled?
126
130
  end
127
131
 
128
- def export
132
+ def export(forced: false)
129
133
  @logger.info "==== export task =================================================="
130
- @exporter.execute
134
+ if not forced and @exporter.completed?
135
+ @logger.info "export task is already executed; skip"
136
+ else
137
+ @exporter.execute
138
+ end
131
139
  end
132
140
 
133
141
  def import
@@ -41,7 +41,6 @@ module RedshiftConnector
41
41
  query:,
42
42
  txn_id: "#{Time.now.strftime('%Y%m%d_%H%M%S')}_#{$$}",
43
43
  filter: nil,
44
- enable_sort: false,
45
44
  logger: RedshiftConnector.logger,
46
45
  quiet: false
47
46
  )
@@ -57,7 +56,7 @@ module RedshiftConnector
57
56
  )
58
57
  exporter = Exporter.new(
59
58
  ds: ds,
60
- query: UnloadQuery.wrap(query: query, bundle: bundle, enable_sort: enable_sort),
59
+ query: UnloadQuery.wrap(query: query, bundle: bundle),
61
60
  bundle: bundle,
62
61
  logger: logger
63
62
  )
@@ -75,6 +74,19 @@ module RedshiftConnector
75
74
  attr_reader :bundle
76
75
  attr_reader :logger
77
76
 
77
+ def completed?
78
+ @bundle.bucket.object(flag_object_key).exists?
79
+ end
80
+
81
+ def create_flag_object
82
+ @logger.info "TOUCH #{flag_object_key}"
83
+ @bundle.bucket.object(flag_object_key).put(body: "OK")
84
+ end
85
+
86
+ def flag_object_key
87
+ "#{File.dirname(@bundle.prefix)}/00completed"
88
+ end
89
+
78
90
  def execute
79
91
  @bundle.clear
80
92
  @logger.info "EXPORT #{@query.description} -> #{@bundle.url}*"
@@ -83,6 +95,7 @@ module RedshiftConnector
83
95
  @logger.info "[SQL/Redshift] #{batch_job_label}#{stmt.strip}"
84
96
  conn.execute(batch_job_label + stmt)
85
97
  end
98
+ create_flag_object
86
99
  end
87
100
 
88
101
  def batch_job_label
@@ -44,14 +44,13 @@ module RedshiftConnector
44
44
  end
45
45
 
46
46
  class UnloadQuery
47
- def UnloadQuery.wrap(query:, bundle:, enable_sort: false)
48
- new(query: ArbitraryQuery.new(query), bundle: bundle, enable_sort: enable_sort)
47
+ def UnloadQuery.wrap(query:, bundle:)
48
+ new(query: ArbitraryQuery.new(query), bundle: bundle)
49
49
  end
50
50
 
51
- def initialize(query:, bundle:, enable_sort: false)
51
+ def initialize(query:, bundle:)
52
52
  @query = query
53
53
  @bundle = bundle
54
- @enable_sort = enable_sort
55
54
  end
56
55
 
57
56
  def table_spec
@@ -69,7 +68,6 @@ module RedshiftConnector
69
68
  credentials '#{@bundle.credential_string}'
70
69
  gzip
71
70
  allowoverwrite
72
- parallel #{@enable_sort ? 'off' : 'on'}
73
71
  delimiter ',' escape addquotes
74
72
  EndSQL
75
73
  end
@@ -0,0 +1,18 @@
1
+ # create module
2
+ module RedshiftConnector
3
+ module Reader
4
+ end
5
+ end
6
+
7
+ require 'redshift-connector/reader/redshift_csv'
8
+ require 'redshift-connector/reader/csv'
9
+ require 'redshift-connector/reader/tsv'
10
+ require 'redshift-connector/reader/exception'
11
+
12
+ module RedshiftConnector
13
+ module Reader
14
+ def Reader.get(id)
15
+ Abstract.get_reader_class(id)
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,18 @@
1
+ module RedshiftConnector
2
+ class Reader::Abstract
3
+ READER_CLASSES = {} # {Symbol => Class}
4
+
5
+ def self.declare_reader(id)
6
+ READER_CLASSES[id.to_sym] = self
7
+ end
8
+
9
+ def self.get_reader_class(id)
10
+ READER_CLASSES[id.to_sym] or
11
+ raise ArgumentError, "unknown data file reader type: #{id.inspect}"
12
+ end
13
+ end
14
+
15
+ def self.get_reader_class(id)
16
+ Reader::Abstract.get_reader_class(id)
17
+ end
18
+ end
@@ -0,0 +1,24 @@
1
+ require 'redshift-connector/reader/abstract'
2
+ require 'redshift-connector/reader/exception'
3
+ require 'csv'
4
+
5
+ module RedshiftConnector
6
+ # Parses (standard) CSV files.
7
+ # For UNLOAD-generated CSV, use RedshiftCSV class.
8
+ class Reader::CSV < Reader::Abstract
9
+ declare_reader :csv
10
+
11
+ def self.data_object?(obj)
12
+ /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
13
+ end
14
+
15
+ def initialize(f)
16
+ @f = f
17
+ end
18
+
19
+ def each(&block)
20
+ csv = CSV.new(@f)
21
+ csv.each(&block)
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,3 @@
1
+ module RedshiftConnector
2
+ class Reader::MalformedCSVException < StandardError; end
3
+ end
@@ -0,0 +1,54 @@
1
+ require 'redshift-connector/reader/abstract'
2
+ require 'redshift-connector/reader/exception'
3
+
4
+ module RedshiftConnector
5
+ # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
6
+ # UNLOAD escapes data with '\' (the backslash character), so we cannot use the standard CSV class.
7
+ class Reader::RedshiftCSV < Reader::Abstract
8
+ declare_reader :redshift_csv
9
+
10
+ def self.data_object?(obj)
11
+ /\.csv(?:\.|\z)/ =~ File.basename(obj.key)
12
+ end
13
+
14
+ # f :: IO
15
+ def initialize(f)
16
+ @f = f
17
+ end
18
+
19
+ def each
20
+ # We can use simple #each_line to read single row
21
+ # because line terminators are always escaped by UNLOAD.
22
+ @f.each_line do |line|
23
+ yield parse_row(line, @f.lineno)
24
+ end
25
+ end
26
+
27
+ def parse_row(line, lineno = nil)
28
+ row = []
29
+ s = StringScanner.new(line)
30
+ s.skip(/\s+/)
31
+ until s.eos?
32
+ col = s.scan(/"(?:\\.|[^"\\]+)*"/) or raise MalformedCSVException, "CSV parse error at line #{lineno}"
33
+ row.push unescape_column(col)
34
+ s.skip(/\s*/) # skip line terminator on line ends
35
+ s.skip(/,\s*/)
36
+ end
37
+ row
38
+ end
39
+
40
+ UNESCAPE_MAP = {
41
+ '\\"' => '"',
42
+ "\\'" => "'",
43
+ '\\,' => ',',
44
+ '\\r' => "\r",
45
+ '\\n' => "\n",
46
+ '\\\\' => '\\'
47
+ }
48
+
49
+ def unescape_column(col)
50
+ charmap = UNESCAPE_MAP
51
+ col[1...-1].gsub(/\\./) {|s| charmap[s] }
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,24 @@
1
+ require 'redshift-connector/reader/abstract'
2
+ require 'redshift-connector/reader/exception'
3
+ require 'csv'
4
+
5
+ module RedshiftConnector
6
+ # Parses TSV (Tab Separated Format) files.
7
+ class Reader::TSV < Reader::Abstract
8
+ declare_reader :tsv
9
+
10
+ def self.data_object?(obj)
11
+ /\.tsv(?:\.|\z)/ =~ File.basename(obj.key)
12
+ end
13
+
14
+ def initialize(f)
15
+ @f = f
16
+ end
17
+
18
+ def each(&block)
19
+ @f.each_line do |line|
20
+ yield line.chomp.split("\t", -1)
21
+ end
22
+ end
23
+ end
24
+ end
@@ -21,8 +21,7 @@ module RedshiftConnector
21
21
  @buckets[name.to_s] or raise ArgumentError, "no such S3 bucket configured: #{name.inspect}"
22
22
  end
23
23
 
24
- def initialize(region: nil, bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
25
- @region = region
24
+ def initialize(bucket:, prefix: nil, access_key_id: nil, secret_access_key: nil, iam_role: nil)
26
25
  @name = bucket
27
26
  @prefix = prefix
28
27
  @access_key_id = access_key_id
@@ -38,10 +37,7 @@ module RedshiftConnector
38
37
  end
39
38
 
40
39
  def client
41
- @client ||= begin
42
- args = { region: @region, access_key_id: @access_key_id, secret_access_key: @secret_access_key }.reject {|k, v| v.nil? }
43
- Aws::S3::Client.new(**args)
44
- end
40
+ @client ||= Aws::S3::Client.new(access_key_id: @access_key_id, secret_access_key: @secret_access_key)
45
41
  end
46
42
 
47
43
  def bucket
@@ -1,7 +1,7 @@
1
- require 'redshift-connector/data_file'
1
+ require 'zlib'
2
2
 
3
3
  module RedshiftConnector
4
- class S3DataFile < AbstractDataFile
4
+ class S3DataFile
5
5
  def initialize(object, reader_class:)
6
6
  @object = object
7
7
  @reader_class = reader_class
@@ -11,10 +11,24 @@ module RedshiftConnector
11
11
  @object.key
12
12
  end
13
13
 
14
- def content
15
- @object.get.body
14
+ def each_row(&block)
15
+ response = @object.get
16
+ f = if gzipped_object?
17
+ Zlib::GzipReader.new(response.body)
18
+ else
19
+ response.body
20
+ end
21
+ @reader_class.new(f).each(&block)
22
+ ensure
23
+ response.body.close if response
16
24
  end
17
25
 
18
- delegate :presigned_url, to: :@object
26
+ def data_object?
27
+ @reader_class.data_object?(@object)
28
+ end
29
+
30
+ def gzipped_object?
31
+ File.extname(@object.key) == '.gz'
32
+ end
19
33
  end
20
34
  end
@@ -1,11 +1,11 @@
1
1
  require 'redshift-connector/s3_bucket'
2
2
  require 'redshift-connector/s3_data_file'
3
+ require 'redshift-connector/reader'
3
4
  require 'redshift-connector/logger'
4
- require 'redshift-connector/data_file'
5
5
  require 'aws-sdk'
6
6
 
7
7
  module RedshiftConnector
8
- class S3DataFileBundle < AbstractDataFileBundle
8
+ class S3DataFileBundle
9
9
  def self.for_prefix(bucket: S3Bucket.default, prefix:, format:, filter: nil, batch_size: 1000, logger: RedshiftConnector.logger)
10
10
  real_prefix = "#{bucket.prefix}/#{prefix}"
11
11
  new(bucket, real_prefix, format: format, filter: filter, batch_size: batch_size, logger: logger)
@@ -68,9 +68,25 @@ module RedshiftConnector
68
68
  end
69
69
  private :do_each_batch
70
70
 
71
- def data_files
71
+ def each_row(&block)
72
+ each_object do |obj|
73
+ obj.each_row(&block)
74
+ end
75
+ end
76
+
77
+ alias each each_row
78
+
79
+ def each_object(&block)
80
+ all_data_objects.each do |obj|
81
+ @logger.info "processing s3 object: #{obj.key}"
82
+ yield obj
83
+ end
84
+ end
85
+
86
+ def all_data_objects
72
87
  @bucket.objects(prefix: @prefix)
73
88
  .map {|obj| S3DataFile.new(obj, reader_class: @reader_class) }
89
+ .select {|obj| obj.data_object? }
74
90
  end
75
91
 
76
92
  def clear
@@ -1,3 +1,3 @@
1
1
  module RedshiftConnector
2
- VERSION = '4.5.0'
2
+ VERSION = '5.0.0'
3
3
  end
data/test/foreach.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require_relative 'helper'
2
2
 
3
- RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select id from tabemiru.items where id < 50 order by 1', enable_sort: true) do |row|
3
+ RedshiftConnector.foreach(schema: 'tabemiru', table: 'items', query: 'select * from tabemiru.items where id < 10') do |row|
4
4
  p row
5
5
  end
@@ -1,5 +1,5 @@
1
1
  require 'test/unit'
2
- require 'redshift-connector/data_file'
2
+ require 'redshift-connector/reader'
3
3
 
4
4
  module RedshiftConnector
5
5
  module Reader
data/test/test_reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require 'test/unit'
2
+ require 'redshift-connector/reader'
2
3
 
3
4
  module RedshiftConnector
4
5
  class TestReader < Test::Unit::TestCase
metadata CHANGED
@@ -1,31 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: redshift-connector
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.5.0
4
+ version: 5.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Minero Aoki
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-05-12 00:00:00.000000000 Z
11
+ date: 2017-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "<"
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '5'
19
+ version: '5.0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "<"
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '5'
26
+ version: '5.0'
27
27
  - !ruby/object:Gem::Dependency
28
- name: activerecord4-redshift-adapter
28
+ name: activerecord5-redshift-adapter
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - ">="
@@ -38,20 +38,6 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: redshift-connector-data_file
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - "~>"
46
- - !ruby/object:Gem::Version
47
- version: 1.0.0
48
- type: :runtime
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - "~>"
53
- - !ruby/object:Gem::Version
54
- version: 1.0.0
55
41
  - !ruby/object:Gem::Dependency
56
42
  name: pg
57
43
  requirement: !ruby/object:Gem::Requirement
@@ -154,6 +140,12 @@ files:
154
140
  - lib/redshift-connector/importer/upsert.rb
155
141
  - lib/redshift-connector/logger.rb
156
142
  - lib/redshift-connector/query.rb
143
+ - lib/redshift-connector/reader.rb
144
+ - lib/redshift-connector/reader/abstract.rb
145
+ - lib/redshift-connector/reader/csv.rb
146
+ - lib/redshift-connector/reader/exception.rb
147
+ - lib/redshift-connector/reader/redshift_csv.rb
148
+ - lib/redshift-connector/reader/tsv.rb
157
149
  - lib/redshift-connector/s3_bucket.rb
158
150
  - lib/redshift-connector/s3_data_file.rb
159
151
  - lib/redshift-connector/s3_data_file_bundle.rb