hadoop-csv 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ .*.sw?
2
+ work
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems over HTTPS so that downloads cannot be altered in transit.
source "https://rubygems.org"

# Runtime and development dependencies are taken from the gemspec.
gemspec
@@ -0,0 +1,14 @@
1
+ = Hadoop-CSV
2
+
3
+ Ruby reader for Hadoop CSV format.
4
+
5
+ == Description
6
+
7
+ This gem implements a ragel based parser for Hadoop CSV format.
8
+ That format uses several complex types to represent vectors, maps
9
+ and structures.
10
+
11
+ More info: http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/record/package-summary.html
12
+
13
+ Note: the description lacks information about the %7D special sequence, which encodes a quoted
14
+ closing brace.
@@ -0,0 +1,4 @@
1
# Rake task that runs the ragel command on the grammar in lib/hadoop/csv.rl.
# The checked-in lib/hadoop/csv.rb carries "# line N lib/hadoop/csv.rl"
# markers, i.e. it is the output of this step and should not be edited by hand.
+ desc "Generate the Ruby state machine"
2
+ task :generate do
3
# NOTE(review): -R presumably selects the Ruby backend and -T1 / -F1 the
# table code style; two style flags are passed at once -- confirm against
# the Ragel manual which one takes effect.
+ sh "ragel -T1 -F1 -R lib/hadoop/csv.rl"
4
+ end
@@ -0,0 +1,20 @@
1
# Gem specification for hadoop-csv.
# Puts ./lib at the front of the load path before the specification is built.
+ $:.unshift "lib"
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "hadoop-csv"
5
+ s.version = '0.0.3'
6
# The date is computed when the gemspec is evaluated, so it is the build day.
+ s.date = "#{Time.now.strftime("%Y-%m-%d")}"
7
+ s.authors = ['Aleksander Pohl']
8
+ s.email = ["apohllo@o2.pl"]
9
+ #s.homepage = "http://github.com/apohllo/rod"
10
+ s.summary = "Hadoop CSV format parser."
11
+ s.description = "Hadoop CSV format parser."
12
+
13
+ s.rubyforge_project = "hadoop-csv"
14
+ #s.rdoc_options = ["--main", "README.rdoc"]
15
+
16
# The file lists below are produced by shelling out to git, so they contain
# whatever git tracks in the directory the gemspec is evaluated in.
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
# Executables are listed by base name only (bin/foo becomes foo).
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
19
+ s.require_path = "lib"
20
+ end
@@ -0,0 +1,301 @@
1
+
2
+ # line 1 "lib/hadoop/csv.rl"
3
+ # vim: filetype=ruby
4
+ =begin
5
+
6
+ # line 19 "lib/hadoop/csv.rl"
7
+
8
+ =end
9
+ module Hadoop
10
+ class Csv
11
+ SINGLE_QUOTE_CODE = "'".codepoints.first
12
+ DASH_CODE = "#".codepoints.first
13
+ S_CODE = "s".codepoints.first
14
+ V_CODE = "v".codepoints.first
15
+ M_CODE = "m".codepoints.first
16
+ OPENING_BRACE_CODE = "{".codepoints.first
17
+
18
+ attr_reader :path
19
+
20
+ # Create new Hadoop CSV parser. If +path+ is given,
21
+ # the file will be parsed in +each+ method.
22
+ def initialize(path=nil)
23
+ @path = path
24
+
25
+ # line 26 "lib/hadoop/csv.rb"
26
+ class << self
27
+ attr_accessor :_csv_trans_keys
28
+ private :_csv_trans_keys, :_csv_trans_keys=
29
+ end
30
+ self._csv_trans_keys = [
31
+ 0, 0, 0, 125, 48, 55,
32
+ 48, 65, 53, 67, 68,
33
+ 68, 0, 0, 0
34
+ ]
35
+
36
+ class << self
37
+ attr_accessor :_csv_key_spans
38
+ private :_csv_key_spans, :_csv_key_spans=
39
+ end
40
+ self._csv_key_spans = [
41
+ 0, 126, 8, 18, 15, 1, 0
42
+ ]
43
+
44
+ class << self
45
+ attr_accessor :_csv_index_offsets
46
+ private :_csv_index_offsets, :_csv_index_offsets=
47
+ end
48
+ self._csv_index_offsets = [
49
+ 0, 0, 127, 136, 155, 171, 173
50
+ ]
51
+
52
+ class << self
53
+ attr_accessor :_csv_indicies
54
+ private :_csv_indicies, :_csv_indicies=
55
+ end
56
+ self._csv_indicies = [
57
+ 1, 0, 0, 0, 0, 0, 0, 0,
58
+ 0, 0, 2, 0, 0, 0, 0, 0,
59
+ 0, 0, 0, 0, 0, 0, 0, 0,
60
+ 0, 0, 0, 0, 0, 0, 0, 0,
61
+ 0, 0, 0, 3, 0, 4, 0, 3,
62
+ 0, 0, 0, 0, 5, 3, 0, 0,
63
+ 3, 3, 3, 3, 3, 3, 3, 3,
64
+ 3, 3, 0, 3, 0, 0, 0, 0,
65
+ 0, 0, 0, 0, 0, 0, 3, 0,
66
+ 0, 0, 0, 0, 0, 0, 0, 0,
67
+ 0, 0, 0, 0, 3, 0, 0, 0,
68
+ 0, 0, 0, 0, 0, 0, 0, 0,
69
+ 0, 0, 0, 0, 0, 0, 0, 0,
70
+ 0, 0, 0, 0, 0, 3, 0, 0,
71
+ 0, 0, 0, 3, 0, 0, 3, 0,
72
+ 0, 0, 0, 0, 0, 5, 0, 6,
73
+ 1, 7, 1, 1, 1, 1, 8, 1,
74
+ 0, 1, 1, 1, 1, 1, 1, 1,
75
+ 1, 1, 1, 1, 1, 1, 1, 1,
76
+ 1, 0, 1, 0, 1, 1, 1, 1,
77
+ 1, 1, 1, 1, 1, 1, 1, 1,
78
+ 1, 0, 1, 0, 1, 1, 0
79
+ ]
80
+
81
+ class << self
82
+ attr_accessor :_csv_trans_targs
83
+ private :_csv_trans_targs, :_csv_trans_targs=
84
+ end
85
+ self._csv_trans_targs = [
86
+ 1, 0, 6, 1, 2, 1, 3, 4,
87
+ 5
88
+ ]
89
+
90
+ class << self
91
+ attr_accessor :_csv_trans_actions
92
+ private :_csv_trans_actions, :_csv_trans_actions=
93
+ end
94
+ self._csv_trans_actions = [
95
+ 0, 0, 1, 2, 0, 1, 0, 0,
96
+ 0
97
+ ]
98
+
99
+ class << self
100
+ attr_accessor :csv_start
101
+ end
102
+ self.csv_start = 1;
103
+ class << self
104
+ attr_accessor :csv_first_final
105
+ end
106
+ self.csv_first_final = 6;
107
+ class << self
108
+ attr_accessor :csv_error
109
+ end
110
+ self.csv_error = 0;
111
+
112
+ class << self
113
+ attr_accessor :csv_en_main
114
+ end
115
+ self.csv_en_main = 1;
116
+
117
+
118
+ # line 37 "lib/hadoop/csv.rl"
119
+ # % (this fixes syntax highlighting)
120
+ end
121
+
122
+ # Opens the file given in constructor and yields
123
+ # the parsed results.
124
# Opens the file given in the constructor and yields the parsed result of
# every line. Called without a block, returns an Enumerator over the same
# rows instead.
#
# +parse+ requires each line to end with "\n" (the last value of a line is
# only registered when the newline is seen). A final line that lacks the
# terminator therefore gets one appended here, instead of silently losing
# its last field.
#
# Returns nil when a block is given.
def each
  return enum_for(:each) unless block_given?

  File.open(path) do |f|
    f.each_line do |line|
      line += "\n" unless line.end_with?("\n")
      yield parse(line)
    end
  end
  nil
end
135
+
136
+ # Parse single line of Hadoop CSV. The line must end with '\n'.
137
+ def parse(line)
138
+ # So that ragel doesn't try to get it from data.length
139
+ pe = :ignored
140
+ eof = :ignored
141
+
142
+ # line 143 "lib/hadoop/csv.rb"
143
+ begin
144
+ p ||= 0
145
+ pe ||= data.length
146
+ cs = csv_start
147
+ end
148
+
149
+ # line 60 "lib/hadoop/csv.rl"
150
+ # % (this fixes syntax highlighting)
151
+ @result = [[]]
152
+ @position = 0
153
+ @states = [:default]
154
+ data = line.unpack('c*')
155
+ p = 0
156
+ pe = data.length
157
+
158
+ # line 159 "lib/hadoop/csv.rb"
159
+ begin
160
+ testEof = false
161
+ _slen, _trans, _keys, _inds, _acts, _nacts = nil
162
+ _goto_level = 0
163
+ _resume = 10
164
+ _eof_trans = 15
165
+ _again = 20
166
+ _test_eof = 30
167
+ _out = 40
168
+ while true
169
+ if _goto_level <= 0
170
+ if p == pe
171
+ _goto_level = _test_eof
172
+ next
173
+ end
174
+ if cs == 0
175
+ _goto_level = _out
176
+ next
177
+ end
178
+ end
179
+ if _goto_level <= _resume
180
+ _keys = cs << 1
181
+ _inds = _csv_index_offsets[cs]
182
+ _slen = _csv_key_spans[cs]
183
+ _trans = if ( _slen > 0 &&
184
+ _csv_trans_keys[_keys] <= data[p].ord &&
185
+ data[p].ord <= _csv_trans_keys[_keys + 1]
186
+ ) then
187
+ _csv_indicies[ _inds + data[p].ord - _csv_trans_keys[_keys] ]
188
+ else
189
+ _csv_indicies[ _inds + _slen ]
190
+ end
191
+ cs = _csv_trans_targs[_trans]
192
+ if _csv_trans_actions[_trans] != 0
193
+ case _csv_trans_actions[_trans]
194
+ when 2 then
195
+ # line 6 "lib/hadoop/csv.rl"
196
+ begin
197
+
198
+ register_start(p,data[p],data[p+1])
199
+ end
200
+ when 1 then
201
+ # line 10 "lib/hadoop/csv.rl"
202
+ begin
203
+
204
+ register_end(data[p],data,p)
205
+ end
206
+ # line 207 "lib/hadoop/csv.rb"
207
+ end
208
+ end
209
+ end
210
+ if _goto_level <= _again
211
+ if cs == 0
212
+ _goto_level = _out
213
+ next
214
+ end
215
+ p += 1
216
+ if p != pe
217
+ _goto_level = _resume
218
+ next
219
+ end
220
+ end
221
+ if _goto_level <= _test_eof
222
+ end
223
+ if _goto_level <= _out
224
+ break
225
+ end
226
+ end
227
+ end
228
+
229
+ # line 68 "lib/hadoop/csv.rl"
230
+ # % (this fixes syntax highlighting)
231
+ @result[0]
232
+ end
233
+
234
+ protected
235
# Marks +position+ as the beginning of a new value, but only when the parser
# is at the top level (:default) or directly inside a container (:struct).
# In every other state (:string, :bytes, ...) the character belongs to the
# value already being read and the call is a no-op returning nil.
#
# position::       index of the character in the line
# char_code::      byte code of the character at +position+
# next_char_code:: byte code of the following character
def register_start(position,char_code,next_char_code)
  return unless [:default, :struct].include?(@states.last)

  @position = position
  process_char(char_code, next_char_code)
end
249
+
250
# Pushes the parser state that matches the first character of a value:
#
# * SINGLE_QUOTE_CODE          -> :string
# * DASH_CODE (the "#" code)   -> :bytes
# * S_CODE / V_CODE / M_CODE   -> :struct plus a fresh, empty collection on
#   @result, but only when the next character is the opening brace;
#   otherwise nothing is pushed and nil is returned
# * anything else              -> :other
#
# char_code::      byte code of the first character of the value
# next_char_code:: byte code of the character that follows it
def process_char(char_code,next_char_code)
  if char_code == SINGLE_QUOTE_CODE
    @states << :string
  elsif char_code == DASH_CODE
    @states << :bytes
  elsif [S_CODE, V_CODE, M_CODE].include?(char_code)
    return unless next_char_code == OPENING_BRACE_CODE

    @states << :struct
    @result << []
  else
    @states << :other
  end
end
265
+
266
# Closes the value that started at @position and ends right before
# +position+, converts it to a Ruby object and appends it to the collection
# currently being built (the last element of @result).
#
# char_code:: byte code of the terminator that triggered the call
# data::      the whole line as an array of byte codes
# position::  index of the terminator within +data+
#
# Conversions, chosen by the first character of the value:
# * "'"        -> String with the %00, %0A, %25, %2C and %7D escapes decoded,
#   tagged as UTF-8
# * "T" / "F"  -> true only for exactly "T", false otherwise
# * "}"        -> the finished nested collection is popped from @result and
#   appended to its parent
# * digits with an optional leading "-" -> Float when a "." follows the
#   leading digits, Integer otherwise
#
# Raises RuntimeError ("CSV error: ...") for any other value, which includes
# values that start with "#".
def register_end(char_code,data,position)
  new_data = data[@position..position-1].pack("c*")

  # An empty container ("s{}", "v{}", "m{}") has no element whose start would
  # have moved @position past the opening tag, so the bare tag shows up here.
  # Nothing is appended and no element state is popped; the container itself
  # is closed by the next call, which sees the lone closing brace.
  if char_code == "}".ord && new_data =~ /\A[svm]\{\z/
    @position = position
    return
  end

  case new_data[0]
  when "'"
    # Decode every escape in a single pass. Decoding them one after another
    # would turn an encoded literal such as "%252C" into "," instead of "%2C".
    escapes = { "%00" => "\0", "%0A" => "\n", "%25" => "%", "%2C" => ",", "%7D" => "}" }
    decoded = new_data[1..-1].gsub(/%(?:00|0A|25|2C|7D)/) { |sequence| escapes[sequence] }
    @result.last << decoded.force_encoding("utf-8")
  when "T", "F"
    @result.last << (new_data == "T")
  when "}"
    subresult = @result.pop
    @result.last << subresult
  else
    number = /\A-?\d+(\.)?/.match(new_data)
    raise "CSV error: #{new_data}" unless number

    @result.last << (number[1] ? new_data.to_f : new_data.to_i)
  end
  @position = position
  @states.pop
end
300
+ end
301
+ end
@@ -0,0 +1,139 @@
1
+ # vim: filetype=ruby
2
+ =begin
3
+ %%{
4
+ machine csv;
5
+
6
# Host-code action: hands the current index, the current byte and the
# following byte to register_start.
+ action value_start {
7
+ register_start(p,data[p],data[p+1])
8
+ }
9
+
10
# Host-code action: hands the current byte, the whole byte array and the
# current index to register_end.
+ action value_end {
11
+ register_end(data[p],data,p)
12
+ }
13
+
14
# Characters that may open a value; value_start is attached with the
# entering-action operator, so it runs on the first character of each
# alternative.
+ start_tag = ('T' | 'F' | '-' | digit | ';' | "'" | "#" | 's{' | 'v{' | 'm{') >value_start;
15
# A comma or a closing brace finishes the value read so far.
+ end_tag = (',' | '}') >value_end;
16
# Any byte other than NUL, newline, percent, comma and closing brace, or one
# of the five percent escapes listed here. Other percent sequences are not
# accepted by the machine.
+ normal = (any - [\0\n%,}] | '%00' | '%0A' | '%25' | '%2C' | '%7D');
17
# A line is any mix of the above followed by a newline; the newline also
# runs value_end, which is what registers the last value of the line.
+ main := (start_tag | normal | end_tag) * . "\n" >value_end;
18
+
19
+ }%%
20
+ =end
21
+ module Hadoop
22
+ class Csv
23
+ SINGLE_QUOTE_CODE = "'".codepoints.first
24
+ DASH_CODE = "#".codepoints.first
25
+ S_CODE = "s".codepoints.first
26
+ V_CODE = "v".codepoints.first
27
+ M_CODE = "m".codepoints.first
28
+ OPENING_BRACE_CODE = "{".codepoints.first
29
+
30
+ attr_reader :path
31
+
32
+ # Create new Hadoop CSV parser. If +path+ is given,
33
+ # the file will be parsed in +each+ method.
34
+ def initialize(path=nil)
35
+ @path = path
36
# The write-data directive on the next line is expanded by ragel. In the
# generated lib/hadoop/csv.rb the expansion sits inside this method, so the
# state tables become singleton accessors defined on each new instance.
+ %% write data;
37
+ # % (this fixes syntax highlighting)
38
+ end
39
+
40
+ # Opens the file given in constructor and yields
41
+ # the parsed results.
42
# Opens the file given in the constructor and yields the parsed result of
# every line. Called without a block, returns an Enumerator over the same
# rows instead.
#
# +parse+ requires each line to end with "\n" (the last value of a line is
# only registered when the newline is seen). A final line that lacks the
# terminator therefore gets one appended here, instead of silently losing
# its last field.
#
# Returns nil when a block is given.
def each
  return enum_for(:each) unless block_given?

  File.open(path) do |f|
    f.each_line do |line|
      line += "\n" unless line.end_with?("\n")
      yield parse(line)
    end
  end
  nil
end
53
+
54
+ # Parse single line of Hadoop CSV. The line must end with '\n'.
55
+ def parse(line)
56
+ # So that ragel doesn't try to get it from data.length
# (the generated init code reads "pe ||= data.length" and data is only
# assigned further down; a truthy placeholder skips that expression).
57
+ pe = :ignored
58
+ eof = :ignored
59
+ %% write init;
60
+ # % (this fixes syntax highlighting)
# @result is a stack of collections under construction: index 0 is the row
# being returned, nested containers are pushed on top while they are read.
61
+ @result = [[]]
# Index in data at which the value currently being read starts.
62
+ @position = 0
# Stack of parser states; register_start / register_end push and pop it.
63
+ @states = [:default]
# The machine works on byte codes (signed 8-bit integers), not characters.
64
+ data = line.unpack('c*')
65
+ p = 0
66
+ pe = data.length
# Runs the state machine over data; its actions call register_start and
# register_end.
# NOTE(review): in the generated code the loop simply stops when the machine
# reaches its error state, so a rejected line appears to return whatever was
# collected so far without raising -- confirm this is intended.
67
+ %% write exec;
68
+ # % (this fixes syntax highlighting)
69
+ @result[0]
70
+ end
71
+
72
+ protected
73
# Marks +position+ as the beginning of a new value, but only when the parser
# is at the top level (:default) or directly inside a container (:struct).
# In every other state (:string, :bytes, ...) the character belongs to the
# value already being read and the call is a no-op returning nil.
#
# position::       index of the character in the line
# char_code::      byte code of the character at +position+
# next_char_code:: byte code of the following character
def register_start(position,char_code,next_char_code)
  return unless [:default, :struct].include?(@states.last)

  @position = position
  process_char(char_code, next_char_code)
end
87
+
88
# Pushes the parser state that matches the first character of a value:
#
# * SINGLE_QUOTE_CODE          -> :string
# * DASH_CODE (the "#" code)   -> :bytes
# * S_CODE / V_CODE / M_CODE   -> :struct plus a fresh, empty collection on
#   @result, but only when the next character is the opening brace;
#   otherwise nothing is pushed and nil is returned
# * anything else              -> :other
#
# char_code::      byte code of the first character of the value
# next_char_code:: byte code of the character that follows it
def process_char(char_code,next_char_code)
  if char_code == SINGLE_QUOTE_CODE
    @states << :string
  elsif char_code == DASH_CODE
    @states << :bytes
  elsif [S_CODE, V_CODE, M_CODE].include?(char_code)
    return unless next_char_code == OPENING_BRACE_CODE

    @states << :struct
    @result << []
  else
    @states << :other
  end
end
103
+
104
# Closes the value that started at @position and ends right before
# +position+, converts it to a Ruby object and appends it to the collection
# currently being built (the last element of @result).
#
# char_code:: byte code of the terminator that triggered the call
# data::      the whole line as an array of byte codes
# position::  index of the terminator within +data+
#
# Conversions, chosen by the first character of the value:
# * "'"        -> String with the %00, %0A, %25, %2C and %7D escapes decoded,
#   tagged as UTF-8
# * "T" / "F"  -> true only for exactly "T", false otherwise
# * "}"        -> the finished nested collection is popped from @result and
#   appended to its parent
# * digits with an optional leading "-" -> Float when a "." follows the
#   leading digits, Integer otherwise
#
# Raises RuntimeError ("CSV error: ...") for any other value, which includes
# values that start with "#".
def register_end(char_code,data,position)
  new_data = data[@position..position-1].pack("c*")

  # An empty container ("s{}", "v{}", "m{}") has no element whose start would
  # have moved @position past the opening tag, so the bare tag shows up here.
  # Nothing is appended and no element state is popped; the container itself
  # is closed by the next call, which sees the lone closing brace.
  if char_code == "}".ord && new_data =~ /\A[svm]\{\z/
    @position = position
    return
  end

  case new_data[0]
  when "'"
    # Decode every escape in a single pass. Decoding them one after another
    # would turn an encoded literal such as "%252C" into "," instead of "%2C".
    escapes = { "%00" => "\0", "%0A" => "\n", "%25" => "%", "%2C" => ",", "%7D" => "}" }
    decoded = new_data[1..-1].gsub(/%(?:00|0A|25|2C|7D)/) { |sequence| escapes[sequence] }
    @result.last << decoded.force_encoding("utf-8")
  when "T", "F"
    @result.last << (new_data == "T")
  when "}"
    subresult = @result.pop
    @result.last << subresult
  else
    number = /\A-?\d+(\.)?/.match(new_data)
    raise "CSV error: #{new_data}" unless number

    @result.last << (number[1] ? new_data.to_f : new_data.to_i)
  end
  @position = position
  @states.pop
end
138
+ end
139
+ end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hadoop-csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-16 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Hadoop CSV format parser.
15
+ email:
16
+ - apohllo@o2.pl
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - README.rdoc
24
+ - Rakefile
25
+ - hadoop-csv.gemspec
26
+ - lib/hadoop/csv.rb
27
+ - lib/hadoop/csv.rl
28
+ homepage:
29
+ licenses: []
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ required_rubygems_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubyforge_project: hadoop-csv
48
+ rubygems_version: 1.8.24
49
+ signing_key:
50
+ specification_version: 3
51
+ summary: Hadoop CSV format parser.
52
+ test_files: []
53
+ has_rdoc: