hadoop-csv 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/Gemfile +2 -0
- data/README.rdoc +14 -0
- data/Rakefile +4 -0
- data/hadoop-csv.gemspec +20 -0
- data/lib/hadoop/csv.rb +301 -0
- data/lib/hadoop/csv.rl +139 -0
- metadata +53 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,14 @@
+= Hadoop-CSV
+
+Ruby reader for Hadoop CSV format.
+
+== Description
+
+This gem implements a Ragel-based parser for the Hadoop CSV format.
+That format uses several complex types to represent vectors, maps
+and structures.
+
+More info: http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/record/package-summary.html
+
+Note: the linked description lacks information about the %7D special sequence,
+which encodes a quoted closing brace.
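A minimal usage sketch for the API defined in lib/hadoop/csv.rb below; the file name records.csv and the sample line are illustrative only, not part of the package:

    require 'hadoop/csv'

    # Stream every record of a newline-terminated Hadoop CSV file.
    Hadoop::Csv.new("records.csv").each do |record|
      p record  # each record is an Array of strings, numbers, booleans and nested arrays
    end

    # A single line can also be parsed directly; Csv#parse expects the trailing "\n".
    Hadoop::Csv.new.parse("'hello,42,T\n")  # expected to yield ["hello", 42, true]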
data/Rakefile
ADDED
data/hadoop-csv.gemspec
ADDED
@@ -0,0 +1,20 @@
+$:.unshift "lib"
+
+Gem::Specification.new do |s|
+  s.name = "hadoop-csv"
+  s.version = '0.0.3'
+  s.date = "#{Time.now.strftime("%Y-%m-%d")}"
+  s.authors = ['Aleksander Pohl']
+  s.email = ["apohllo@o2.pl"]
+  #s.homepage = "http://github.com/apohllo/rod"
+  s.summary = "Hadoop CSV format parser."
+  s.description = "Hadoop CSV format parser."
+
+  s.rubyforge_project = "hadoop-csv"
+  #s.rdoc_options = ["--main", "README.rdoc"]
+
+  s.files = `git ls-files`.split("\n")
+  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_path = "lib"
+end
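Since the gemspec collects its file list with `git ls-files`, the gem has to be packaged from inside the git checkout; a typical (illustrative) invocation would be `gem build hadoop-csv.gemspec` followed by `gem install hadoop-csv-0.0.3.gem`.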
data/lib/hadoop/csv.rb
ADDED
@@ -0,0 +1,301 @@
+
+# line 1 "lib/hadoop/csv.rl"
+# vim: filetype=ruby
+=begin
+
+# line 19 "lib/hadoop/csv.rl"
+
+=end
+module Hadoop
+  class Csv
+    SINGLE_QUOTE_CODE = "'".codepoints.first
+    DASH_CODE = "#".codepoints.first
+    S_CODE = "s".codepoints.first
+    V_CODE = "v".codepoints.first
+    M_CODE = "m".codepoints.first
+    OPENING_BRACE_CODE = "{".codepoints.first
+
+    attr_reader :path
+
+    # Create new Hadoop CSV parser. If +path+ is given,
+    # the file will be parsed in +each+ method.
+    def initialize(path=nil)
+      @path = path
+
+# line 26 "lib/hadoop/csv.rb"
+      class << self
+        attr_accessor :_csv_trans_keys
+        private :_csv_trans_keys, :_csv_trans_keys=
+      end
+      self._csv_trans_keys = [
+        0, 0, 0, 125, 48, 55,
+        48, 65, 53, 67, 68,
+        68, 0, 0, 0
+      ]
+
+      class << self
+        attr_accessor :_csv_key_spans
+        private :_csv_key_spans, :_csv_key_spans=
+      end
+      self._csv_key_spans = [
+        0, 126, 8, 18, 15, 1, 0
+      ]
+
+      class << self
+        attr_accessor :_csv_index_offsets
+        private :_csv_index_offsets, :_csv_index_offsets=
+      end
+      self._csv_index_offsets = [
+        0, 0, 127, 136, 155, 171, 173
+      ]
+
+      class << self
+        attr_accessor :_csv_indicies
+        private :_csv_indicies, :_csv_indicies=
+      end
+      self._csv_indicies = [
+        1, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 2, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 3, 0, 4, 0, 3,
+        0, 0, 0, 0, 5, 3, 0, 0,
+        3, 3, 3, 3, 3, 3, 3, 3,
+        3, 3, 0, 3, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 3, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 3, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 3, 0, 0,
+        0, 0, 0, 3, 0, 0, 3, 0,
+        0, 0, 0, 0, 0, 5, 0, 6,
+        1, 7, 1, 1, 1, 1, 8, 1,
+        0, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1,
+        1, 0, 1, 0, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1,
+        1, 0, 1, 0, 1, 1, 0
+      ]
+
+      class << self
+        attr_accessor :_csv_trans_targs
+        private :_csv_trans_targs, :_csv_trans_targs=
+      end
+      self._csv_trans_targs = [
+        1, 0, 6, 1, 2, 1, 3, 4,
+        5
+      ]
+
+      class << self
+        attr_accessor :_csv_trans_actions
+        private :_csv_trans_actions, :_csv_trans_actions=
+      end
+      self._csv_trans_actions = [
+        0, 0, 1, 2, 0, 1, 0, 0,
+        0
+      ]
+
+      class << self
+        attr_accessor :csv_start
+      end
+      self.csv_start = 1;
+      class << self
+        attr_accessor :csv_first_final
+      end
+      self.csv_first_final = 6;
+      class << self
+        attr_accessor :csv_error
+      end
+      self.csv_error = 0;
+
+      class << self
+        attr_accessor :csv_en_main
+      end
+      self.csv_en_main = 1;
+
+
+# line 37 "lib/hadoop/csv.rl"
+      # % (this fixes syntax highlighting)
+    end
+
+    # Opens the file given in constructor and yields
+    # the parsed results.
+    def each
+      if block_given?
+        File.open(path) do |f|
+          while !f.eof? && line = f.readline
+            yield parse(line)
+          end
+        end
+      else
+        enum_for(:each)
+      end
+    end
+
+    # Parse single line of Hadoop CSV. The line must end with '\n'.
+    def parse(line)
+      # So that ragel doesn't try to get it from data.length
+      pe = :ignored
+      eof = :ignored
+
+# line 143 "lib/hadoop/csv.rb"
+      begin
+        p ||= 0
+        pe ||= data.length
+        cs = csv_start
+      end
+
+# line 60 "lib/hadoop/csv.rl"
+      # % (this fixes syntax highlighting)
+      @result = [[]]
+      @position = 0
+      @states = [:default]
+      data = line.unpack('c*')
+      p = 0
+      pe = data.length
+
+# line 159 "lib/hadoop/csv.rb"
+      begin
+        testEof = false
+        _slen, _trans, _keys, _inds, _acts, _nacts = nil
+        _goto_level = 0
+        _resume = 10
+        _eof_trans = 15
+        _again = 20
+        _test_eof = 30
+        _out = 40
+        while true
+          if _goto_level <= 0
+            if p == pe
+              _goto_level = _test_eof
+              next
+            end
+            if cs == 0
+              _goto_level = _out
+              next
+            end
+          end
+          if _goto_level <= _resume
+            _keys = cs << 1
+            _inds = _csv_index_offsets[cs]
+            _slen = _csv_key_spans[cs]
+            _trans = if ( _slen > 0 &&
+                _csv_trans_keys[_keys] <= data[p].ord &&
+                data[p].ord <= _csv_trans_keys[_keys + 1]
+                ) then
+                  _csv_indicies[ _inds + data[p].ord - _csv_trans_keys[_keys] ]
+                else
+                  _csv_indicies[ _inds + _slen ]
+                end
+            cs = _csv_trans_targs[_trans]
+            if _csv_trans_actions[_trans] != 0
+              case _csv_trans_actions[_trans]
+              when 2 then
+# line 6 "lib/hadoop/csv.rl"
+                begin
+
+                  register_start(p,data[p],data[p+1])
+                end
+              when 1 then
+# line 10 "lib/hadoop/csv.rl"
+                begin
+
+                  register_end(data[p],data,p)
+                end
+# line 207 "lib/hadoop/csv.rb"
+              end
+            end
+          end
+          if _goto_level <= _again
+            if cs == 0
+              _goto_level = _out
+              next
+            end
+            p += 1
+            if p != pe
+              _goto_level = _resume
+              next
+            end
+          end
+          if _goto_level <= _test_eof
+          end
+          if _goto_level <= _out
+            break
+          end
+        end
+      end
+
+# line 68 "lib/hadoop/csv.rl"
+      # % (this fixes syntax highlighting)
+      @result[0]
+    end
+
+    protected
+    def register_start(position,char_code,next_char_code)
+      case @states.last
+      when :default
+        @position = position
+        process_char(char_code,next_char_code)
+      when :string
+        # ignore
+      when :bytes
+        #ignore
+      when :struct
+        @position = position
+        process_char(char_code,next_char_code)
+      end
+    end
+
+    def process_char(char_code,next_char_code)
+      case char_code
+      when SINGLE_QUOTE_CODE
+        @states << :string
+      when DASH_CODE
+        @states << :bytes
+      when S_CODE, V_CODE, M_CODE
+        if next_char_code == OPENING_BRACE_CODE
+          @states << :struct
+          @result << []
+        end
+      else
+        @states << :other
+      end
+    end
+
+    def register_end(char_code,data,position)
+      # TODO there seems to be ambiguity in the CSV format:
+      # unicode string/byte sequence containing the closing brace
+      # TODO fix char -> char_code
+      #if char == "," || char == "}" #|| (@states.last != :string && @states.last != :bytes)
+      last_start = @position
+      new_data = data[last_start..position-1].pack("c*")
+      case new_data[0]
+      when "'"
+        @result.last << new_data[1..-1].gsub(/%00/,"\0").gsub(/%0A/,"\n").
+          gsub(/%25/,"%").gsub(/%2C/,",").gsub(/%7D/,"}").force_encoding("utf-8")
+      when "T","F"
+        if new_data == "T"
+          @result.last << true
+        else
+          @result.last << false
+        end
+      when "}"
+        subresult = @result.pop
+        @result.last << subresult
+      else
+        if new_data =~ /^-?\d+(\.)?/
+          if $~[1].nil?
+            @result.last << new_data.to_i
+          else
+            @result.last << new_data.to_f
+          end
+        else
+          raise "CSV error: #{new_data}"
+        end
+      end
+      @position = position
+      @states.pop
+    end
+  end
+end
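To show how the generated machine and the register_start/register_end callbacks above cooperate, here is a sketch of parsing a record that mixes a quoted string, a nested vector and a boolean; the input line is hypothetical and the result is the expected one, not output captured from the gem:

    require 'hadoop/csv'

    csv = Hadoop::Csv.new
    # 'a%2Cb is a quoted string containing an escaped comma; v{1,2} opens a nested vector.
    csv.parse("'a%2Cb,v{1,2},F\n")
    # => ["a,b", [1, 2], false]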
data/lib/hadoop/csv.rl
ADDED
@@ -0,0 +1,139 @@
+# vim: filetype=ruby
+=begin
+%%{
+  machine csv;
+
+  action value_start {
+    register_start(p,data[p],data[p+1])
+  }
+
+  action value_end {
+    register_end(data[p],data,p)
+  }
+
+  start_tag = ('T' | 'F' | '-' | digit | ';' | "'" | "#" | 's{' | 'v{' | 'm{') >value_start;
+  end_tag = (',' | '}') >value_end;
+  normal = (any - [\0\n%,}] | '%00' | '%0A' | '%25' | '%2C' | '%7D');
+  main := (start_tag | normal | end_tag) * . "\n" >value_end;
+
+}%%
+=end
+module Hadoop
+  class Csv
+    SINGLE_QUOTE_CODE = "'".codepoints.first
+    DASH_CODE = "#".codepoints.first
+    S_CODE = "s".codepoints.first
+    V_CODE = "v".codepoints.first
+    M_CODE = "m".codepoints.first
+    OPENING_BRACE_CODE = "{".codepoints.first
+
+    attr_reader :path
+
+    # Create new Hadoop CSV parser. If +path+ is given,
+    # the file will be parsed in +each+ method.
+    def initialize(path=nil)
+      @path = path
+      %% write data;
+      # % (this fixes syntax highlighting)
+    end
+
+    # Opens the file given in constructor and yields
+    # the parsed results.
+    def each
+      if block_given?
+        File.open(path) do |f|
+          while !f.eof? && line = f.readline
+            yield parse(line)
+          end
+        end
+      else
+        enum_for(:each)
+      end
+    end
+
+    # Parse single line of Hadoop CSV. The line must end with '\n'.
+    def parse(line)
+      # So that ragel doesn't try to get it from data.length
+      pe = :ignored
+      eof = :ignored
+      %% write init;
+      # % (this fixes syntax highlighting)
+      @result = [[]]
+      @position = 0
+      @states = [:default]
+      data = line.unpack('c*')
+      p = 0
+      pe = data.length
+      %% write exec;
+      # % (this fixes syntax highlighting)
+      @result[0]
+    end
+
+    protected
+    def register_start(position,char_code,next_char_code)
+      case @states.last
+      when :default
+        @position = position
+        process_char(char_code,next_char_code)
+      when :string
+        # ignore
+      when :bytes
+        #ignore
+      when :struct
+        @position = position
+        process_char(char_code,next_char_code)
+      end
+    end
+
+    def process_char(char_code,next_char_code)
+      case char_code
+      when SINGLE_QUOTE_CODE
+        @states << :string
+      when DASH_CODE
+        @states << :bytes
+      when S_CODE, V_CODE, M_CODE
+        if next_char_code == OPENING_BRACE_CODE
+          @states << :struct
+          @result << []
+        end
+      else
+        @states << :other
+      end
+    end
+
+    def register_end(char_code,data,position)
+      # TODO there seems to be ambiguity in the CSV format:
+      # unicode string/byte sequence containing the closing brace
+      # TODO fix char -> char_code
+      #if char == "," || char == "}" #|| (@states.last != :string && @states.last != :bytes)
+      last_start = @position
+      new_data = data[last_start..position-1].pack("c*")
+      case new_data[0]
+      when "'"
+        @result.last << new_data[1..-1].gsub(/%00/,"\0").gsub(/%0A/,"\n").
+          gsub(/%25/,"%").gsub(/%2C/,",").gsub(/%7D/,"}").force_encoding("utf-8")
+      when "T","F"
+        if new_data == "T"
+          @result.last << true
+        else
+          @result.last << false
+        end
+      when "}"
+        subresult = @result.pop
+        @result.last << subresult
+      else
+        if new_data =~ /^-?\d+(\.)?/
+          if $~[1].nil?
+            @result.last << new_data.to_i
+          else
+            @result.last << new_data.to_f
+          end
+        else
+          raise "CSV error: #{new_data}"
+        end
+      end
+      @position = position
+      @states.pop
+    end
+  end
+end
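In short, the grammar above only marks boundaries: `'`, `#`, `T`/`F`, `-`, `;`, digits and the `s{`/`v{`/`m{` digraphs start a value (>value_start), while `,`, `}` and the terminating newline end one (>value_end); deciding whether the delimited slice is a string, byte sequence, boolean, number or nested structure is left to register_start/register_end in the host Ruby code.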
metadata
ADDED
@@ -0,0 +1,53 @@
+--- !ruby/object:Gem::Specification
+name: hadoop-csv
+version: !ruby/object:Gem::Version
+  version: 0.0.3
+prerelease:
+platform: ruby
+authors:
+- Aleksander Pohl
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-07-16 00:00:00.000000000 Z
+dependencies: []
+description: Hadoop CSV format parser.
+email:
+- apohllo@o2.pl
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- README.rdoc
+- Rakefile
+- hadoop-csv.gemspec
+- lib/hadoop/csv.rb
+- lib/hadoop/csv.rl
+homepage:
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: hadoop-csv
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Hadoop CSV format parser.
+test_files: []
+has_rdoc: