hadoop-csv 0.0.3
- data/.gitignore +2 -0
- data/Gemfile +2 -0
- data/README.rdoc +14 -0
- data/Rakefile +4 -0
- data/hadoop-csv.gemspec +20 -0
- data/lib/hadoop/csv.rb +301 -0
- data/lib/hadoop/csv.rl +139 -0
- metadata +53 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,14 @@
= Hadoop-CSV

Ruby reader for the Hadoop CSV format.

== Description

This gem implements a Ragel-based parser for the Hadoop CSV format.
That format uses several complex types to represent vectors, maps
and structures.

More info: http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/record/package-summary.html

Note: the description lacks information about the %7D special sequence, which encodes a quoted
closing brace.
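A minimal usage sketch, inferred from the API that lib/hadoop/csv.rb defines below (the file name records.csv is a placeholder, not something shipped with the gem):

require 'hadoop/csv'

# Iterate over a whole file; each line is parsed into an array of Ruby values.
Hadoop::Csv.new("records.csv").each do |record|
  p record
end

# Or parse a single line by hand; per the docs below it must end with "\n".
parser = Hadoop::Csv.new
values = parser.parse(line)   # line: a String read from a Hadoop CSV file
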
data/Rakefile
ADDED
data/hadoop-csv.gemspec
ADDED
@@ -0,0 +1,20 @@
$:.unshift "lib"

Gem::Specification.new do |s|
  s.name = "hadoop-csv"
  s.version = '0.0.3'
  s.date = "#{Time.now.strftime("%Y-%m-%d")}"
  s.authors = ['Aleksander Pohl']
  s.email = ["apohllo@o2.pl"]
  #s.homepage = "http://github.com/apohllo/rod"
  s.summary = "Hadoop CSV format parser."
  s.description = "Hadoop CSV format parser."

  s.rubyforge_project = "hadoop-csv"
  #s.rdoc_options = ["--main", "README.rdoc"]

  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_path = "lib"
end
data/lib/hadoop/csv.rb
ADDED
@@ -0,0 +1,301 @@

# line 1 "lib/hadoop/csv.rl"
# vim: filetype=ruby
=begin

# line 19 "lib/hadoop/csv.rl"

=end
module Hadoop
  class Csv
    SINGLE_QUOTE_CODE = "'".codepoints.first
    DASH_CODE = "#".codepoints.first
    S_CODE = "s".codepoints.first
    V_CODE = "v".codepoints.first
    M_CODE = "m".codepoints.first
    OPENING_BRACE_CODE = "{".codepoints.first

    attr_reader :path

    # Create new Hadoop CSV parser. If +path+ is given,
    # the file will be parsed in +each+ method.
    def initialize(path=nil)
      @path = path

      # line 26 "lib/hadoop/csv.rb"
      class << self
        attr_accessor :_csv_trans_keys
        private :_csv_trans_keys, :_csv_trans_keys=
      end
      self._csv_trans_keys = [
        0, 0, 0, 125, 48, 55,
        48, 65, 53, 67, 68,
        68, 0, 0, 0
      ]

      class << self
        attr_accessor :_csv_key_spans
        private :_csv_key_spans, :_csv_key_spans=
      end
      self._csv_key_spans = [
        0, 126, 8, 18, 15, 1, 0
      ]

      class << self
        attr_accessor :_csv_index_offsets
        private :_csv_index_offsets, :_csv_index_offsets=
      end
      self._csv_index_offsets = [
        0, 0, 127, 136, 155, 171, 173
      ]

      class << self
        attr_accessor :_csv_indicies
        private :_csv_indicies, :_csv_indicies=
      end
      self._csv_indicies = [
        1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 3, 0, 4, 0, 3,
        0, 0, 0, 0, 5, 3, 0, 0,
        3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 0, 3, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 3, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0,
        0, 0, 0, 3, 0, 0, 3, 0,
        0, 0, 0, 0, 0, 5, 0, 6,
        1, 7, 1, 1, 1, 1, 8, 1,
        0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 0
      ]

      class << self
        attr_accessor :_csv_trans_targs
        private :_csv_trans_targs, :_csv_trans_targs=
      end
      self._csv_trans_targs = [
        1, 0, 6, 1, 2, 1, 3, 4,
        5
      ]

      class << self
        attr_accessor :_csv_trans_actions
        private :_csv_trans_actions, :_csv_trans_actions=
      end
      self._csv_trans_actions = [
        0, 0, 1, 2, 0, 1, 0, 0,
        0
      ]

      class << self
        attr_accessor :csv_start
      end
      self.csv_start = 1;
      class << self
        attr_accessor :csv_first_final
      end
      self.csv_first_final = 6;
      class << self
        attr_accessor :csv_error
      end
      self.csv_error = 0;

      class << self
        attr_accessor :csv_en_main
      end
      self.csv_en_main = 1;


      # line 37 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
    end

    # Opens the file given in constructor and yields
    # the parsed results.
    def each
      if block_given?
        File.open(path) do |f|
          while !f.eof? && line = f.readline
            yield parse(line)
          end
        end
      else
        enum_for(:each)
      end
    end

    # Parse single line of Hadoop CSV. The line must end with '\n'.
    def parse(line)
      # So that ragel doesn't try to get it from data.length
      pe = :ignored
      eof = :ignored

      # line 143 "lib/hadoop/csv.rb"
      begin
        p ||= 0
        pe ||= data.length
        cs = csv_start
      end

      # line 60 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
      @result = [[]]
      @position = 0
      @states = [:default]
      data = line.unpack('c*')
      p = 0
      pe = data.length

      # line 159 "lib/hadoop/csv.rb"
      begin
        testEof = false
        _slen, _trans, _keys, _inds, _acts, _nacts = nil
        _goto_level = 0
        _resume = 10
        _eof_trans = 15
        _again = 20
        _test_eof = 30
        _out = 40
        while true
          if _goto_level <= 0
            if p == pe
              _goto_level = _test_eof
              next
            end
            if cs == 0
              _goto_level = _out
              next
            end
          end
          if _goto_level <= _resume
            _keys = cs << 1
            _inds = _csv_index_offsets[cs]
            _slen = _csv_key_spans[cs]
            _trans = if ( _slen > 0 &&
                          _csv_trans_keys[_keys] <= data[p].ord &&
                          data[p].ord <= _csv_trans_keys[_keys + 1]
                        ) then
                       _csv_indicies[ _inds + data[p].ord - _csv_trans_keys[_keys] ]
                     else
                       _csv_indicies[ _inds + _slen ]
                     end
            cs = _csv_trans_targs[_trans]
            if _csv_trans_actions[_trans] != 0
              case _csv_trans_actions[_trans]
              when 2 then
                # line 6 "lib/hadoop/csv.rl"
                begin
                  register_start(p,data[p],data[p+1])
                end
              when 1 then
                # line 10 "lib/hadoop/csv.rl"
                begin
                  register_end(data[p],data,p)
                end
                # line 207 "lib/hadoop/csv.rb"
              end
            end
          end
          if _goto_level <= _again
            if cs == 0
              _goto_level = _out
              next
            end
            p += 1
            if p != pe
              _goto_level = _resume
              next
            end
          end
          if _goto_level <= _test_eof
          end
          if _goto_level <= _out
            break
          end
        end
      end

      # line 68 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
      @result[0]
    end

    protected
    def register_start(position,char_code,next_char_code)
      case @states.last
      when :default
        @position = position
        process_char(char_code,next_char_code)
      when :string
        # ignore
      when :bytes
        # ignore
      when :struct
        @position = position
        process_char(char_code,next_char_code)
      end
    end

    def process_char(char_code,next_char_code)
      case char_code
      when SINGLE_QUOTE_CODE
        @states << :string
      when DASH_CODE
        @states << :bytes
      when S_CODE, V_CODE, M_CODE
        if next_char_code == OPENING_BRACE_CODE
          @states << :struct
          @result << []
        end
      else
        @states << :other
      end
    end

    def register_end(char_code,data,position)
      # TODO there seems to be ambiguity in the CSV format:
      # unicode string/byte sequence containing the closing brace
      # TODO fix char -> char_code
      #if char == "," || char == "}" #|| (@states.last != :string && @states.last != :bytes)
      last_start = @position
      new_data = data[last_start..position-1].pack("c*")
      case new_data[0]
      when "'"
        @result.last << new_data[1..-1].gsub(/%00/,"\0").gsub(/%0A/,"\n").
          gsub(/%25/,"%").gsub(/%2C/,",").gsub(/%7D/,"}").force_encoding("utf-8")
      when "T","F"
        if new_data == "T"
          @result.last << true
        else
          @result.last << false
        end
      when "}"
        subresult = @result.pop
        @result.last << subresult
      else
        if new_data =~ /^-?\d+(\.)?/
          if $~[1].nil?
            @result.last << new_data.to_i
          else
            @result.last << new_data.to_f
          end
        else
          raise "CSV error: #{new_data}"
        end
      end
      @position = position
      @states.pop
    end
  end
end
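The gsub chain in register_end above is what handles the %7D sequence mentioned in the README note; a small illustration of that decoding step (the input string is invented for the example, not taken from the gem):

# Decode a quoted field the same way register_end does for values starting with "'".
field   = "'a%2Cb%7Dc%25d"
decoded = field[1..-1].
  gsub(/%00/, "\0").gsub(/%0A/, "\n").
  gsub(/%25/, "%").gsub(/%2C/, ",").gsub(/%7D/, "}").
  force_encoding("utf-8")
decoded   # => "a,b}c%d"
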
data/lib/hadoop/csv.rl
ADDED
@@ -0,0 +1,139 @@
# vim: filetype=ruby
=begin
%%{
  machine csv;

  action value_start {
    register_start(p,data[p],data[p+1])
  }

  action value_end {
    register_end(data[p],data,p)
  }

  start_tag = ('T' | 'F' | '-' | digit | ';' | "'" | "#" | 's{' | 'v{' | 'm{') >value_start;
  end_tag = (',' | '}') >value_end;
  normal = (any - [\0\n%,}] | '%00' | '%0A' | '%25' | '%2C' | '%7D');
  main := (start_tag | normal | end_tag) * . "\n" >value_end;

}%%
=end
module Hadoop
  class Csv
    SINGLE_QUOTE_CODE = "'".codepoints.first
    DASH_CODE = "#".codepoints.first
    S_CODE = "s".codepoints.first
    V_CODE = "v".codepoints.first
    M_CODE = "m".codepoints.first
    OPENING_BRACE_CODE = "{".codepoints.first

    attr_reader :path

    # Create new Hadoop CSV parser. If +path+ is given,
    # the file will be parsed in +each+ method.
    def initialize(path=nil)
      @path = path
      %% write data;
      # % (this fixes syntax highlighting)
    end

    # Opens the file given in constructor and yields
    # the parsed results.
    def each
      if block_given?
        File.open(path) do |f|
          while !f.eof? && line = f.readline
            yield parse(line)
          end
        end
      else
        enum_for(:each)
      end
    end

    # Parse single line of Hadoop CSV. The line must end with '\n'.
    def parse(line)
      # So that ragel doesn't try to get it from data.length
      pe = :ignored
      eof = :ignored
      %% write init;
      # % (this fixes syntax highlighting)
      @result = [[]]
      @position = 0
      @states = [:default]
      data = line.unpack('c*')
      p = 0
      pe = data.length
      %% write exec;
      # % (this fixes syntax highlighting)
      @result[0]
    end

    protected
    def register_start(position,char_code,next_char_code)
      case @states.last
      when :default
        @position = position
        process_char(char_code,next_char_code)
      when :string
        # ignore
      when :bytes
        # ignore
      when :struct
        @position = position
        process_char(char_code,next_char_code)
      end
    end

    def process_char(char_code,next_char_code)
      case char_code
      when SINGLE_QUOTE_CODE
        @states << :string
      when DASH_CODE
        @states << :bytes
      when S_CODE, V_CODE, M_CODE
        if next_char_code == OPENING_BRACE_CODE
          @states << :struct
          @result << []
        end
      else
        @states << :other
      end
    end

    def register_end(char_code,data,position)
      # TODO there seems to be ambiguity in the CSV format:
      # unicode string/byte sequence containing the closing brace
      # TODO fix char -> char_code
      #if char == "," || char == "}" #|| (@states.last != :string && @states.last != :bytes)
      last_start = @position
      new_data = data[last_start..position-1].pack("c*")
      case new_data[0]
      when "'"
        @result.last << new_data[1..-1].gsub(/%00/,"\0").gsub(/%0A/,"\n").
          gsub(/%25/,"%").gsub(/%2C/,",").gsub(/%7D/,"}").force_encoding("utf-8")
      when "T","F"
        if new_data == "T"
          @result.last << true
        else
          @result.last << false
        end
      when "}"
        subresult = @result.pop
        @result.last << subresult
      else
        if new_data =~ /^-?\d+(\.)?/
          if $~[1].nil?
            @result.last << new_data.to_i
          else
            @result.last << new_data.to_f
          end
        else
          raise "CSV error: #{new_data}"
        end
      end
      @position = position
      @states.pop
    end
  end
end
metadata
ADDED
@@ -0,0 +1,53 @@
--- !ruby/object:Gem::Specification
name: hadoop-csv
version: !ruby/object:Gem::Version
  version: 0.0.3
  prerelease:
platform: ruby
authors:
- Aleksander Pohl
autorequire:
bindir: bin
cert_chain: []
date: 2012-07-16 00:00:00.000000000 Z
dependencies: []
description: Hadoop CSV format parser.
email:
- apohllo@o2.pl
executables: []
extensions: []
extra_rdoc_files: []
files:
- .gitignore
- Gemfile
- README.rdoc
- Rakefile
- hadoop-csv.gemspec
- lib/hadoop/csv.rb
- lib/hadoop/csv.rl
homepage:
licenses: []
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  none: false
  requirements:
  - - ! '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project: hadoop-csv
rubygems_version: 1.8.24
signing_key:
specification_version: 3
summary: Hadoop CSV format parser.
test_files: []
has_rdoc: