redshift-connector-data_file 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3f4a0fc069c6eb635d15751a45ccd2f2a7bf86c1
4
+ data.tar.gz: 4789569a3178cf601a307c349e3ba42e07830bb5
5
+ SHA512:
6
+ metadata.gz: 53e845941f6242c5804bcbc0918e8efe64a8cc825e896803ac2015b59774b199ec7ea43421aaf0a8ebd0cfc59cc394f663908174f2563fa34970d8159cc231e0
7
+ data.tar.gz: 434c36b9a6cfac9719e88245bdd187b1a6bd6ba28f9464ec611eefddfebfeb4b4e5549400ebc8fc08f0050470f31df3b9ada1f495f16c7962d414556e5cd15bb
data/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ /.ruby-version
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Gems are resolved against the public RubyGems index.
source "https://rubygems.org"

# Dependencies are declared in redshift-connector-data_file.gemspec;
# pull them in from there instead of repeating them here.
gemspec
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2017 Hidekazu Kobayashi
4
+ Copyright (c) 2016,2017 Minero Aoki
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,25 @@
1
+ # RedshiftConnector::DataFile
2
+
3
+ A gem to handle data files exported from Redshift
4
+
5
+ [redshift-connector](https://github.com/aamine/redshift-connector) uses this internally.
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'redshift-connector-data_file'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install redshift-connector-data_file
22
+
23
+ ## Contributing
24
+
25
+ Bug reports and pull requests are welcome on GitHub at https://github.com/koba789/redshift-connector-data_file.
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
# Loads the gem packaging tasks provided by Bundler.
require 'bundler/gem_tasks'

# Running plain `rake` runs the :spec task.
# NOTE(review): nothing in this Rakefile defines a :spec task -- confirm it is
# provided elsewhere, otherwise the default task cannot be built.
task default: :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
#!/usr/bin/env ruby

# Interactive console with the gem already loaded.
require 'bundler/setup'
require 'redshift-connector/data_file'

# Fixtures and/or initialization code can be added here to make
# experimenting with the gem easier.

# A different console may be used instead of IRB, for example Pry
# (remember to add pry to the Gemfile first):
#   require "pry"
#   Pry.start

require 'irb'
IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newline and tab only.
IFS=$'\n\t'
# Echo script lines as they are read and commands as they are executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,9 @@
1
+ require "redshift-connector/data_file/version"
2
+ require "redshift-connector/data_file/logger"
3
+ require "redshift-connector/data_file/reader"
4
+ require "redshift-connector/data_file/url_data_file_bundle"
5
+
6
# Top-level namespace of the gem.
module RedshiftConnector
  # Namespace for this gem's own settings. The body is intentionally empty
  # here; the files required above reopen it to add VERSION and the logger
  # accessors.
  module DataFile; end
end
@@ -0,0 +1,24 @@
1
+ require 'zlib'
2
+
3
module RedshiftConnector
  # Base class for one exported data file.
  #
  # Subclasses are expected to provide:
  # * +key+     -- the file's name/path as a String (used for extension checks)
  # * +content+ -- an IO-like object yielding the file's raw bytes
  # and to set +@reader_class+ to a class that responds to +.data_object?(key)+
  # and whose instances wrap an IO and respond to +#each+.
  class AbstractDataFile
    # Yields every row of the file to the given block.
    #
    # The stream is obtained from +content+ exactly once: +content+ may open a
    # fresh stream on every call, so the stream that is read must be the very
    # same one that is closed afterwards. Files whose key ends in ".gz" are
    # decompressed transparently.
    #
    # Returns whatever the reader's +#each+ returns. The stream is closed even
    # when the reader or the block raises.
    def each_row(&block)
      io = content
      gz = Zlib::GzipReader.new(io) if gzipped_object?
      @reader_class.new(gz || io).each(&block)
    ensure
      begin
        # #finish releases the zlib state without closing the wrapped IO.
        gz&.finish
      ensure
        # io is nil when +content+ itself raised.
        io&.close
      end
    end

    # Whether the reader class recognizes this file's key as a data file.
    def data_object?
      @reader_class.data_object?(key)
    end

    # True when the key carries a ".gz" extension.
    def gzipped_object?
      File.extname(key) == '.gz'
    end
  end
end
@@ -0,0 +1,22 @@
1
module RedshiftConnector
  # Base class for a collection of data files.
  #
  # Subclasses are expected to implement +data_files+ (returning the file
  # objects of the bundle) and to set +@logger+.
  class AbstractDataFileBundle
    # Yields every row of every data object in the bundle, file by file.
    def each_row(&block)
      each_object { |data_file| data_file.each_row(&block) }
    end

    alias_method :each, :each_row

    # Yields each data object in turn, logging its key before handing it over.
    def each_object
      all_data_objects.each do |data_file|
        @logger.info "processing s3 object: #{data_file.key}"
        yield data_file
      end
    end

    # The members of +data_files+ that report themselves as data objects.
    def all_data_objects
      data_files.select(&:data_object?)
    end
  end
end
@@ -0,0 +1,14 @@
1
module RedshiftConnector
  module DataFile
    @logger = nil

    # Returns the logger to use, trying in order:
    # 1. the logger assigned through +DataFile.logger=+,
    # 2. +RedshiftConnector.logger+, when the enclosing module defines it,
    # 3. +Rails.logger+, when Rails is loaded.
    # Returns nil when none of them is available.
    #
    # REVIEW: Reverse dependency
    def self.logger
      return @logger if @logger

      # RedshiftConnector is always defined here (it is the enclosing module),
      # but it only has a .logger method when something else defines it.
      parent = RedshiftConnector.logger if RedshiftConnector.respond_to?(:logger)
      return parent if parent

      Rails.logger if defined?(Rails) && Rails.respond_to?(:logger)
    end

    # Sets the logger returned by +DataFile.logger+; assign nil to fall back
    # to the defaults again.
    def self.logger=(logger)
      @logger = logger
    end
  end
end
@@ -0,0 +1,18 @@
1
# Declare the Reader namespace before the reader implementations are
# required below, since they define their classes as `Reader::Name`.
module RedshiftConnector
  module Reader; end
end
6
+
7
+ require 'redshift-connector/data_file/reader/redshift_csv'
8
+ require 'redshift-connector/data_file/reader/csv'
9
+ require 'redshift-connector/data_file/reader/tsv'
10
+ require 'redshift-connector/data_file/reader/exception'
11
+
12
module RedshiftConnector
  module Reader
    # Returns the reader class registered under +id+ by delegating to
    # Reader::Abstract.get_reader_class.
    def self.get(id)
      Abstract.get_reader_class(id)
    end
  end
end
@@ -0,0 +1,18 @@
1
module RedshiftConnector
  # Common superclass of the data file readers. It also keeps the registry
  # that maps a reader id to the class implementing it.
  class Reader::Abstract
    # {Symbol => Class}
    READER_CLASSES = {}

    class << self
      # Registers the calling class under +id+ (converted to a Symbol).
      def declare_reader(id)
        READER_CLASSES[id.to_sym] = self
      end

      # Returns the class registered under +id+.
      # Raises ArgumentError when nothing is registered under that id.
      def get_reader_class(id)
        klass = READER_CLASSES[id.to_sym]
        raise ArgumentError, "unknown data file reader type: #{id.inspect}" unless klass

        klass
      end
    end
  end

  # Shortcut for Reader::Abstract.get_reader_class.
  def self.get_reader_class(id)
    Reader::Abstract.get_reader_class(id)
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift-connector/data_file/reader/abstract'
2
+ require 'redshift-connector/data_file/reader/exception'
3
+ require 'csv'
4
+
5
module RedshiftConnector
  # Reader for standard CSV files, registered as :csv.
  # CSV produced by UNLOAD should be read with the RedshiftCSV class instead.
  class Reader::CSV < Reader::Abstract
    declare_reader :csv

    # Truthy when the base name of +key+ contains ".csv" either at its very
    # end or followed by another extension (e.g. ".csv.gz").
    def self.data_object?(key)
      File.basename(key) =~ /\.csv(?:\.|\z)/
    end

    # f: the stream to parse; it is handed to CSV.new as-is.
    def initialize(f)
      @f = f
    end

    # Yields each parsed row to the block.
    def each(&block)
      parser = CSV.new(@f)
      parser.each(&block)
    end
  end
end
@@ -0,0 +1,3 @@
1
module RedshiftConnector
  # Error type for data file rows that cannot be parsed as CSV.
  class Reader::MalformedCSVException < StandardError
  end
end
@@ -0,0 +1,54 @@
1
+ require 'redshift-connector/data_file/reader/abstract'
2
+ require 'redshift-connector/data_file/reader/exception'
3
+
4
# StringScanner lives in the strscan library; load it explicitly rather than
# relying on some other library having loaded it already.
require 'strscan'

module RedshiftConnector
  # Reads CSV file generated by Redshift UNLOAD statement (with option ADDQUOTES ESCAPE).
  # UNLOAD escapes data by '\' (backslash character), we cannot use standard CSV class.
  class Reader::RedshiftCSV < Reader::Abstract
    declare_reader :redshift_csv

    # One double-quoted column: an opening quote, then any mix of backslash
    # escapes and ordinary characters, then the closing quote. No quantifier
    # is nested inside the repetition, so an unterminated quote fails fast
    # instead of backtracking exponentially.
    QUOTED_COLUMN = /"(?:\\.|[^"\\])*"/

    # Truthy when the base name of +key+ contains ".csv" either at its very
    # end or followed by another extension (e.g. ".csv.gz").
    def self.data_object?(key)
      /\.csv(?:\.|\z)/ =~ File.basename(key)
    end

    # f :: IO
    def initialize(f)
      @f = f
    end

    # Yields each line of the stream as an Array of unescaped column strings.
    def each
      # We can use simple #each_line to read single row
      # because line terminators are always escaped by UNLOAD.
      @f.each_line do |line|
        yield parse_row(line, @f.lineno)
      end
    end

    # Splits one line into its columns.
    #
    # line   :: String holding a single row
    # lineno :: line number, used only in the error message
    #
    # Returns an Array of String. Raises Reader::MalformedCSVException when
    # the scanner meets something that is not a double-quoted column.
    def parse_row(line, lineno = nil)
      row = []
      s = StringScanner.new(line)
      s.skip(/\s+/)
      until s.eos?
        col = s.scan(QUOTED_COLUMN)
        # The constant must be qualified: Reader is neither a lexical scope
        # nor an ancestor of this class, so the bare name does not resolve.
        raise Reader::MalformedCSVException, "CSV parse error at line #{lineno}" unless col

        row.push unescape_column(col)
        s.skip(/\s*/)  # skip line terminator on line ends
        s.skip(/,\s*/)
      end
      row
    end

    # Escape sequence (backslash + character) => the character it stands for.
    UNESCAPE_MAP = {
      '\\"' => '"',
      "\\'" => "'",
      '\\,' => ',',
      '\\r' => "\r",
      '\\n' => "\n",
      '\\\\' => '\\'
    }.freeze

    # Strips the surrounding quotes from +col+ and resolves its escape
    # sequences. A sequence that is not listed in UNESCAPE_MAP is removed
    # from the result.
    def unescape_column(col)
      charmap = UNESCAPE_MAP
      col[1...-1].gsub(/\\./) {|s| charmap[s] }
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require 'redshift-connector/data_file/reader/abstract'
2
+ require 'redshift-connector/data_file/reader/exception'
3
+ require 'csv'
4
+
5
module RedshiftConnector
  # Reader for TSV (Tab Separated Format) files, registered as :tsv.
  class Reader::TSV < Reader::Abstract
    declare_reader :tsv

    # Truthy when the base name of +key+ contains ".tsv" either at its very
    # end or followed by another extension (e.g. ".tsv.gz").
    def self.data_object?(key)
      File.basename(key) =~ /\.tsv(?:\.|\z)/
    end

    # f: the stream to read lines from.
    def initialize(f)
      @f = f
    end

    # Yields every line as an Array of its tab-separated fields.
    def each
      @f.each_line do |raw_line|
        # The -1 limit keeps trailing empty fields.
        fields = raw_line.chomp.split("\t", -1)
        yield fields
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'redshift-connector/data_file/abstract_data_file'
2
+ require 'uri'
3
+ require 'zlib'
4
+ require 'open3'
5
+
6
module RedshiftConnector
  # A data file that is fetched from a URL by running the curl command.
  class UrlDataFile < AbstractDataFile
    # url          :: String URL of the file
    # reader_class :: class used to parse the downloaded content
    def initialize(url, reader_class:)
      @url = url
      @reader_class = reader_class
    end

    # The path component of the URL.
    def key
      URI.parse(@url).path
    end

    # Starts a curl process for the URL and returns an IO that reads the
    # process's standard output. Every call starts a new process, and the
    # caller is responsible for closing the returned IO; closing it also
    # waits for the process to exit.
    #
    # Only stdout is piped. No stderr pipe is created, so there is nothing
    # left unread that could fill up and stall curl, and no descriptor to
    # leak. curl's progress meter is turned off and its error messages go to
    # this process's stderr. The command is given as an Array, so no shell is
    # involved.
    #
    # NOTE(review): curl's exit status is not inspected here, so a failed
    # transfer is not reported to the caller.
    def content
      IO.popen(['curl', '--silent', '--show-error', @url], 'r', in: File::NULL)
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'redshift-connector/data_file/reader'
2
+ require 'redshift-connector/data_file/logger'
3
+ require 'redshift-connector/data_file/abstract_data_file_bundle'
4
+ require 'redshift-connector/data_file/url_data_file'
5
+
6
module RedshiftConnector
  # A bundle of data files, each identified by a URL.
  class UrlDataFileBundle < AbstractDataFileBundle
    # data_file_urls :: collection of URL strings, one per data file
    # format         :: id of the reader to use, resolved through Reader.get
    # filter         :: callable stored in @filter; when omitted, one that
    #                   returns its arguments as an Array is used
    # logger         :: logger stored in @logger
    #
    # NOTE(review): nothing in this class reads @filter -- confirm where it
    # is meant to be applied.
    def initialize(data_file_urls, format: :redshift_csv, filter: nil, logger: DataFile.logger)
      @reader_class = Reader.get(format)
      @logger = logger
      @filter = filter || ->(*row) { row }
      @data_file_urls = data_file_urls
    end

    # Builds a UrlDataFile for every URL; new objects are created on each call.
    def data_files
      @data_file_urls.map { |url| UrlDataFile.new(url, reader_class: @reader_class) }
    end
  end
end
@@ -0,0 +1,5 @@
1
module RedshiftConnector
  module DataFile
    # Version string of this gem; the gemspec reads it for spec.version.
    VERSION = '1.0.0'
  end
end
@@ -0,0 +1,24 @@
1
+ # coding: utf-8
2
# Put this gem's lib directory on the load path so that the version file can
# be required before the gem is installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'redshift-connector/data_file/version'

Gem::Specification.new do |spec|
  spec.name          = "redshift-connector-data_file"
  spec.version       = RedshiftConnector::DataFile::VERSION
  spec.authors       = ["Hidekazu Kobayashi", "Minero Aoki"]
  spec.email         = ["kobahide789@gmail.com"]

  spec.summary       = "Utility classes for exported data files from Redshift"
  spec.homepage      = "https://github.com/koba789/redshift-connector-data_file"
  # The gem ships an MIT LICENSE file; declare it so the built gem's metadata
  # carries the license instead of an empty list.
  spec.license       = "MIT"

  # Package every file tracked by git except tests, specs and features.
  spec.files         = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.14"
  spec.add_development_dependency "rake", "~> 10.0"
end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: redshift-connector-data_file
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Hidekazu Kobayashi
8
+ - Minero Aoki
9
+ autorequire:
10
+ bindir: exe
11
+ cert_chain: []
12
+ date: 2017-05-12 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: bundler
16
+ requirement: !ruby/object:Gem::Requirement
17
+ requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.14'
21
+ type: :development
22
+ prerelease: false
23
+ version_requirements: !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - "~>"
26
+ - !ruby/object:Gem::Version
27
+ version: '1.14'
28
+ - !ruby/object:Gem::Dependency
29
+ name: rake
30
+ requirement: !ruby/object:Gem::Requirement
31
+ requirements:
32
+ - - "~>"
33
+ - !ruby/object:Gem::Version
34
+ version: '10.0'
35
+ type: :development
36
+ prerelease: false
37
+ version_requirements: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - "~>"
40
+ - !ruby/object:Gem::Version
41
+ version: '10.0'
42
+ description:
43
+ email:
44
+ - kobahide789@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - ".gitignore"
50
+ - Gemfile
51
+ - LICENSE
52
+ - README.md
53
+ - Rakefile
54
+ - bin/console
55
+ - bin/setup
56
+ - lib/redshift-connector/data_file.rb
57
+ - lib/redshift-connector/data_file/abstract_data_file.rb
58
+ - lib/redshift-connector/data_file/abstract_data_file_bundle.rb
59
+ - lib/redshift-connector/data_file/logger.rb
60
+ - lib/redshift-connector/data_file/reader.rb
61
+ - lib/redshift-connector/data_file/reader/abstract.rb
62
+ - lib/redshift-connector/data_file/reader/csv.rb
63
+ - lib/redshift-connector/data_file/reader/exception.rb
64
+ - lib/redshift-connector/data_file/reader/redshift_csv.rb
65
+ - lib/redshift-connector/data_file/reader/tsv.rb
66
+ - lib/redshift-connector/data_file/url_data_file.rb
67
+ - lib/redshift-connector/data_file/url_data_file_bundle.rb
68
+ - lib/redshift-connector/data_file/version.rb
69
+ - redshift-connector-data_file.gemspec
70
+ homepage: https://github.com/koba789/redshift-connector-data_file
71
+ licenses: []
72
+ metadata: {}
73
+ post_install_message:
74
+ rdoc_options: []
75
+ require_paths:
76
+ - lib
77
+ required_ruby_version: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ version: '0'
87
+ requirements: []
88
+ rubyforge_project:
89
+ rubygems_version: 2.6.11
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: Utility classes for exported data files from Redshift
93
+ test_files: []