redshift_csv_file 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +1 -0
- data/Gemfile +2 -0
- data/LICENSE +21 -0
- data/README.md +25 -0
- data/lib/redshift_csv_file.rb +111 -0
- data/redshift_csv_file.gemspec +19 -0
- metadata +77 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 988bff806bddf57141c4d2819c6c440fbdc25955a5b83060299e66699dfdca2b
|
4
|
+
data.tar.gz: 013c197fbffc732dc8ade4554059c00d2f9cd4764a25b4b91aca50102a245df3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 19f59a3acde973ef62d222aeb40624555b11c0add2b0d7b194891e2929282e8b56d5a2f175ad5569fa46bfea58b9b7a02f78064c8ca316ed1c29afcea9bd3e7d
|
7
|
+
data.tar.gz: 2a65476327e518e612f1011e73a9c1870d0fe1cdeffcb52338c7739c158242ac6b1f2e31a483de808aecff27fc2a4fa7ec27debcc24164441c0a6f935b85a78a
|
data/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Gemfile.lock
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2019 Bricolage Org
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# redshift_csv_file library
|
2
|
+
|
3
|
+
redshift_csv_file is a Redshift-specific CSV file parser in Ruby.
|
4
|
+
Amazon Redshift generates a non-standard CSV format, so special handling is required.
|
5
|
+
|
6
|
+
## Usage
|
7
|
+
|
8
|
+
```
|
9
|
+
require 'redshift_csv_file'
|
10
|
+
|
11
|
+
File.open('unloaded_file.csv') {|f|
|
12
|
+
RedshiftCsvFile.new(f).each_row do |col1, col2, col3|
|
13
|
+
p [col1, col2, col3] # => ["value1", "value2", "value3"]
|
14
|
+
end
|
15
|
+
}
|
16
|
+
```
|
17
|
+
|
18
|
+
## License
|
19
|
+
|
20
|
+
MIT license.
|
21
|
+
See LICENSE file for details.
|
22
|
+
|
23
|
+
## Author
|
24
|
+
|
25
|
+
Minero Aoki
|
@@ -0,0 +1,111 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
# Reads CSV files generated by the Redshift UNLOAD statement (with the options
# ADDQUOTES and ESCAPE).
# UNLOAD escapes data with '\' (backslash character), so the standard CSV class
# cannot be used.
class RedshiftCsvFile
  # Raised when the input cannot be parsed as UNLOAD-style CSV.
  class MalformedCSVException < StandardError; end

  # f :: IO (the code below calls #gets and #lineno on it)
  def initialize(f)
    @f = f
    @s = ScanBuffer.new(@f)
  end

  # Yields each remaining row as an Array of unescaped column Strings.
  # Returns an Enumerator when called without a block.
  # Raises MalformedCSVException when a row cannot be parsed.
  def each_row
    return enum_for(:each_row) unless block_given?

    while (row = parse_row(@s))
      yield row
    end
  end

  alias each each_row

  # Reads one row and returns it as an Array of Strings.
  # Returns nil once the input is exhausted.
  # Raises MalformedCSVException when the row cannot be parsed.
  def read_row
    return nil if @s.eof?

    parse_row(@s)
  end

  private

  # Parses one row out of the ScanBuffer +s+.
  # Returns nil when there is no further line to read.
  def parse_row(s)
    return nil unless s.next_row

    row = []
    loop do
      column = s.scan_column
      unless column
        raise MalformedCSVException, "CSV parse error: unterminated column or row at line #{s.lineno}"
      end
      row.push unescape_column(column)
      # A row always has at least one column; continue only after a comma.
      break unless s.read_separator
    end
    unless s.read_eol
      raise MalformedCSVException, "CSV parse error: missing column separator at line #{s.lineno}"
    end
    row
  end

  # Escape sequences that stand for a control character.  Any other
  # backslash-escaped character stands for itself.
  UNESCAPE_MAP = {
    '\\t' => "\t",
    '\\r' => "\r",
    '\\n' => "\n",
  }.freeze

  # Strips the surrounding double quotes from +col+ and resolves the
  # backslash escapes inside it.
  def unescape_column(col)
    col[1...-1].gsub(/\\./m) { |seq| UNESCAPE_MAP[seq] || seq[1, 1] }
  end

  # Line-oriented scanning buffer on top of an IO.  Lines are appended on
  # demand, so a quoted column may span several physical lines.
  class ScanBuffer
    def initialize(f)
      @f = f
      # The scanner appends to this string, so it must be mutable.
      @s = StringScanner.new(''.dup)
      @eof = false
    end

    # True when the buffer is fully consumed and the IO has reported
    # end of file.
    def eof?
      @s.eos? && @eof
    end

    # Current line number as reported by the underlying IO.
    def lineno
      @f.lineno
    end

    # Loads the next line into the buffer.
    # Returns true on success, false at end of file.
    def next_row
      # Drop the text consumed by earlier rows; otherwise the buffer keeps
      # every line ever read and grows to the size of the whole file.
      @s.string = @s.rest unless @s.pos.zero?
      fill_buffer
    end

    # (1.2 * 1024 ** 3) bytes, i.e. about 1.2 GiB.
    # NOTE(review): this limit was previously labelled "1.2MB", which does not
    # match the expression.  The value is left as is; confirm the intended limit.
    MAX_COLUMN_LENGTH = (1.2 * (1024 ** 3)).to_i

    # Scans one double-quoted column, reading more lines while the closing
    # quote has not been seen.  Returns the column including its quotes, or
    # nil when the input ends first.
    # Raises MalformedCSVException when the pending text exceeds
    # MAX_COLUMN_LENGTH.
    def scan_column
      s = @s
      s.skip(/[ \t]+/)
      until (column = s.scan(/"(?:\\.|[^"\\]+)*"/m))
        fill_buffer or return nil
        return nil if s.eos?
        if s.rest_size > MAX_COLUMN_LENGTH
          raise MalformedCSVException, "CSV parse error: too long column at line #{@f.lineno}"
        end
      end
      column
    end

    # Appends the next line of the IO to the buffer.
    # Returns true on success; at end of file records EOF and returns false.
    def fill_buffer
      line = @f.gets
      if line
        @s << line
        true
      else
        @eof = true
        false
      end
    end

    # Consumes a column separator (optional blanks followed by a comma).
    # Returns nil when the next text is not a separator.
    def read_separator
      @s.skip(/[ \t]*,/)
    end

    # Consumes the end of the row (optional blanks/CR followed by a newline
    # or the end of the buffer).  Returns nil when other text follows.
    def read_eol
      @s.skip(/[ \t\r]*(?:\n|\z)/)
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for redshift_csv_file.
Gem::Specification.new do |s|
  s.platform = Gem::Platform::RUBY
  s.name = 'redshift_csv_file'
  s.version = '1.0.0'
  s.summary = 'Redshift unloaded CSV file parser'
  s.description = 'redshift_csv_file is a Redshift-specific CSV file parser. Amazon Redshift generates non-standard CSV format, special handling is required.'
  s.license = 'MIT'

  # The plural setter takes the Array form; the singular #author= expects a
  # single String.
  s.authors = ['Minero Aoki']
  s.email = 'aamine@loveruby.net'
  s.homepage = 'https://github.com/bricolages/redshift_csv_file'

  # Package every file tracked by git except those under test/, spec/ and
  # features/.
  s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
  s.require_path = 'lib'

  s.required_ruby_version = '>= 2.3.0'
  s.add_development_dependency 'test-unit'
  s.add_development_dependency 'rake'
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: redshift_csv_file
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Minero Aoki
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-07-16 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: test-unit
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: redshift_csv_file is a Redshift-specific CSV file parser. Amazon Redshift
|
42
|
+
generates non-standard CSV format, special handling is required.
|
43
|
+
email: aamine@loveruby.net
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".gitignore"
|
49
|
+
- Gemfile
|
50
|
+
- LICENSE
|
51
|
+
- README.md
|
52
|
+
- lib/redshift_csv_file.rb
|
53
|
+
- redshift_csv_file.gemspec
|
54
|
+
homepage: https://github.com/bricolages/redshift_csv_file
|
55
|
+
licenses:
|
56
|
+
- MIT
|
57
|
+
metadata: {}
|
58
|
+
post_install_message:
|
59
|
+
rdoc_options: []
|
60
|
+
require_paths:
|
61
|
+
- lib
|
62
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: 2.3.0
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubygems_version: 3.0.3
|
74
|
+
signing_key:
|
75
|
+
specification_version: 4
|
76
|
+
summary: Redshift unloaded CSV file parser
|
77
|
+
test_files: []
|