hadoop-csv 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ .*.sw?
2
+ work
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Take the dependency list from the gemspec found next to this Gemfile.
gemspec
@@ -0,0 +1,14 @@
1
+ = Hadoop-CSV
2
+
3
+ Ruby reader for Hadoop CSV format.
4
+
5
+ == Description
6
+
7
+ This gem implements a Ragel-based parser for the Hadoop CSV format.
8
+ That format uses several complex types to represent vectors, maps
9
+ and structures.
10
+
11
+ More info: http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/record/package-summary.html
12
+
13
+ Note: the linked description does not mention the special sequence %7D,
14
+ which encodes an escaped closing brace.
@@ -0,0 +1,4 @@
1
# Runs Ragel over lib/hadoop/csv.rl; the generated parser is lib/hadoop/csv.rb.
desc "Generate the Ruby state machine"
task(:generate) { sh "ragel -T1 -F1 -R lib/hadoop/csv.rl" }
@@ -0,0 +1,20 @@
1
$:.unshift "lib"

# Gem specification for hadoop-csv. The file lists are taken from git,
# so the gem has to be built from inside a git checkout.
Gem::Specification.new do |spec|
  spec.name = "hadoop-csv"
  spec.version = '0.0.3'
  # Build date, formatted as YYYY-MM-DD.
  spec.date = Time.now.strftime("%Y-%m-%d")
  spec.authors = ['Aleksander Pohl']
  spec.email = ["apohllo@o2.pl"]
  #spec.homepage = "http://github.com/apohllo/rod"
  spec.summary = "Hadoop CSV format parser."
  spec.description = "Hadoop CSV format parser."

  spec.rubyforge_project = "hadoop-csv"
  #spec.rdoc_options = ["--main", "README.rdoc"]

  spec.files = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are referenced by base name only.
  spec.executables = `git ls-files -- bin/*`.split("\n").map { |file| File.basename(file) }
  spec.require_path = "lib"
end
@@ -0,0 +1,301 @@
1
+
2
+ # line 1 "lib/hadoop/csv.rl"
3
+ # vim: filetype=ruby
4
+ =begin
5
+
6
+ # line 19 "lib/hadoop/csv.rl"
7
+
8
+ =end
9
module Hadoop
  # Reader for the Hadoop CSV record format.
  #
  # A line is turned into a Ruby array: values starting with a single quote
  # become UTF-8 strings, +T+/+F+ become booleans, numeric text becomes
  # Integer or Float, and <tt>s{</tt>, <tt>v{</tt> and <tt>m{</tt> groups
  # become nested arrays. Values starting with <tt>#</tt> are recognised by
  # the state machine, but +register_end+ has no branch for them, so they
  # currently raise a "CSV error".
  #
  # The state machine in +initialize+ and +parse+ is generated by Ragel from
  # lib/hadoop/csv.rl; change the grammar there, not here.
  class Csv
    # Byte codes of the characters that open a value.
    SINGLE_QUOTE_CODE = "'".codepoints.first
    # NOTE: despite its name this is the code of "#" (the hash sign).
    DASH_CODE = "#".codepoints.first
    S_CODE = "s".codepoints.first
    V_CODE = "v".codepoints.first
    M_CODE = "m".codepoints.first
    OPENING_BRACE_CODE = "{".codepoints.first

    attr_reader :path

    # Create new Hadoop CSV parser. If +path+ is given,
    # the file will be parsed in +each+ method.
    #
    # The Ragel transition tables are installed as (mostly private)
    # accessors on the singleton class of the new instance.
    def initialize(path=nil)
      @path = path

      # line 26 "lib/hadoop/csv.rb"
      class << self
        attr_accessor :_csv_trans_keys
        private :_csv_trans_keys, :_csv_trans_keys=
      end
      self._csv_trans_keys = [
        0, 0, 0, 125, 48, 55,
        48, 65, 53, 67, 68,
        68, 0, 0, 0
      ]

      class << self
        attr_accessor :_csv_key_spans
        private :_csv_key_spans, :_csv_key_spans=
      end
      self._csv_key_spans = [
        0, 126, 8, 18, 15, 1, 0
      ]

      class << self
        attr_accessor :_csv_index_offsets
        private :_csv_index_offsets, :_csv_index_offsets=
      end
      self._csv_index_offsets = [
        0, 0, 127, 136, 155, 171, 173
      ]

      class << self
        attr_accessor :_csv_indicies
        private :_csv_indicies, :_csv_indicies=
      end
      self._csv_indicies = [
        1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 3, 0, 4, 0, 3,
        0, 0, 0, 0, 5, 3, 0, 0,
        3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 0, 3, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 3, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 3, 0, 0,
        0, 0, 0, 3, 0, 0, 3, 0,
        0, 0, 0, 0, 0, 5, 0, 6,
        1, 7, 1, 1, 1, 1, 8, 1,
        0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 0, 1, 1, 0
      ]

      class << self
        attr_accessor :_csv_trans_targs
        private :_csv_trans_targs, :_csv_trans_targs=
      end
      self._csv_trans_targs = [
        1, 0, 6, 1, 2, 1, 3, 4,
        5
      ]

      class << self
        attr_accessor :_csv_trans_actions
        private :_csv_trans_actions, :_csv_trans_actions=
      end
      self._csv_trans_actions = [
        0, 0, 1, 2, 0, 1, 0, 0,
        0
      ]

      class << self
        attr_accessor :csv_start
      end
      self.csv_start = 1;
      class << self
        attr_accessor :csv_first_final
      end
      self.csv_first_final = 6;
      class << self
        attr_accessor :csv_error
      end
      self.csv_error = 0;

      class << self
        attr_accessor :csv_en_main
      end
      self.csv_en_main = 1;


      # line 37 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
    end

    # Opens the file given in constructor and yields
    # the parsed results, one array per line.
    # Without a block an enumerator over the same rows is returned.
    def each
      return enum_for(:each) unless block_given?

      File.open(path) do |f|
        yield parse(f.readline) until f.eof?
      end
    end

    # Parse single line of Hadoop CSV and return the array of its values.
    #
    # The state machine only emits the final value when it sees "\n", so a
    # non-empty line lacking the terminator (e.g. the last line of a file
    # with no trailing newline) gets one appended instead of silently
    # losing its last field. An empty string still yields an empty array.
    #
    # Raises RuntimeError ("CSV error: ...") for a value it cannot convert.
    def parse(line)
      # So that ragel doesn't try to get it from data.length
      pe = :ignored
      eof = :ignored

      # line 143 "lib/hadoop/csv.rb"
      begin
        p ||= 0
        pe ||= data.length
        cs = csv_start
      end

      # line 60 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
      @result = [[]]
      @position = 0
      @states = [:default]
      # Work on a copy so the caller's string is never modified.
      line += "\n" unless line.empty? || line.end_with?("\n")
      data = line.unpack('c*')
      p = 0
      pe = data.length

      # line 159 "lib/hadoop/csv.rb"
      begin
        testEof = false
        _slen, _trans, _keys, _inds, _acts, _nacts = nil
        _goto_level = 0
        _resume = 10
        _eof_trans = 15
        _again = 20
        _test_eof = 30
        _out = 40
        while true
          if _goto_level <= 0
            if p == pe
              _goto_level = _test_eof
              next
            end
            if cs == 0
              _goto_level = _out
              next
            end
          end
          if _goto_level <= _resume
            _keys = cs << 1
            _inds = _csv_index_offsets[cs]
            _slen = _csv_key_spans[cs]
            _trans = if ( _slen > 0 &&
                          _csv_trans_keys[_keys] <= data[p].ord &&
                          data[p].ord <= _csv_trans_keys[_keys + 1]
                        ) then
                       _csv_indicies[ _inds + data[p].ord - _csv_trans_keys[_keys] ]
                     else
                       _csv_indicies[ _inds + _slen ]
                     end
            cs = _csv_trans_targs[_trans]
            if _csv_trans_actions[_trans] != 0
              case _csv_trans_actions[_trans]
              when 2 then
                # line 6 "lib/hadoop/csv.rl"
                begin

                  register_start(p,data[p],data[p+1])
                end
              when 1 then
                # line 10 "lib/hadoop/csv.rl"
                begin

                  register_end(data[p],data,p)
                end
                # line 207 "lib/hadoop/csv.rb"
              end
            end
          end
          if _goto_level <= _again
            if cs == 0
              _goto_level = _out
              next
            end
            p += 1
            if p != pe
              _goto_level = _resume
              next
            end
          end
          if _goto_level <= _test_eof
          end
          if _goto_level <= _out
            break
          end
        end
      end

      # line 68 "lib/hadoop/csv.rl"
      # % (this fixes syntax highlighting)
      @result[0]
    end

    protected

    # Called by the state machine on a character that may open a value.
    # Only acts at the top level or directly inside a structure; inside a
    # string, a byte sequence or any other value the character is payload.
    def register_start(position,char_code,next_char_code)
      case @states.last
      when :default, :struct
        @position = position
        process_char(char_code,next_char_code)
      end
    end

    # Pushes the parser state that corresponds to the opening character.
    # "s", "v" and "m" open a nested collection only when followed by "{";
    # otherwise nothing is pushed for them.
    def process_char(char_code,next_char_code)
      case char_code
      when SINGLE_QUOTE_CODE
        @states << :string
      when DASH_CODE
        @states << :bytes
      when S_CODE, V_CODE, M_CODE
        if next_char_code == OPENING_BRACE_CODE
          @states << :struct
          @result << []
        end
      else
        @states << :other
      end
    end

    # Called by the state machine on ",", "}" and "\n". Converts the bytes
    # between the last recorded start and +position+ (exclusive) and appends
    # the value to the collection currently being built.
    #
    # Raises RuntimeError when the text is not a string, boolean, closing
    # brace or number.
    def register_end(char_code,data,position)
      # TODO there seems to be ambiguity in the CSV format:
      # unicode string/byte sequence containing the closing brace
      # TODO fix char -> char_code
      last_start = @position
      new_data = data[last_start..position-1].pack("c*")
      case new_data[0]
      when "'"
        # Decode every escape in a single pass. Chained gsub calls would
        # decode twice: "%252C" must yield the text "%2C", not ",".
        unescape = { "%00" => "\0", "%0A" => "\n", "%25" => "%", "%2C" => ",", "%7D" => "}" }
        @result.last << new_data[1..-1].gsub(/%(?:00|0A|25|2C|7D)/, unescape).force_encoding("utf-8")
      when "T","F"
        @result.last << (new_data == "T")
      when "}"
        # The slice starts at the brace that closed the last element, so
        # the innermost collection is complete: attach it to its parent.
        subresult = @result.pop
        @result.last << subresult
      else
        number = /\A-?\d+(\.)?/.match(new_data)
        raise "CSV error: #{new_data}" unless number
        # A decimal point right after the leading digits marks a float.
        @result.last << (number[1] ? new_data.to_f : new_data.to_i)
      end
      @position = position
      @states.pop
    end
  end
end
@@ -0,0 +1,139 @@
1
+ # vim: filetype=ruby
2
+ =begin
3
+ %%{
4
+ machine csv;
5
+
6
+ action value_start {
7
+ register_start(p,data[p],data[p+1])
8
+ }
9
+
10
+ action value_end {
11
+ register_end(data[p],data,p)
12
+ }
13
+
14
+ start_tag = ('T' | 'F' | '-' | digit | ';' | "'" | "#" | 's{' | 'v{' | 'm{') >value_start;
15
+ end_tag = (',' | '}') >value_end;
16
+ normal = (any - [\0\n%,}] | '%00' | '%0A' | '%25' | '%2C' | '%7D');
17
+ main := (start_tag | normal | end_tag) * . "\n" >value_end;
18
+
19
+ }%%
20
+ =end
21
+ module Hadoop
22
+ class Csv
23
+ SINGLE_QUOTE_CODE = "'".codepoints.first
24
+ DASH_CODE = "#".codepoints.first
25
+ S_CODE = "s".codepoints.first
26
+ V_CODE = "v".codepoints.first
27
+ M_CODE = "m".codepoints.first
28
+ OPENING_BRACE_CODE = "{".codepoints.first
29
+
30
+ attr_reader :path
31
+
32
+ # Create new Hadoop CSV parser. If +path+ is given,
33
+ # the file will be parsed in +each+ method.
34
+ def initialize(path=nil)
35
+ @path = path
36
+ %% write data;
37
+ # % (this fixes syntax highlighting)
38
+ end
39
+
40
# Reads the file named in the constructor and yields one parsed
# array per line. Without a block an enumerator is returned instead.
def each
  return enum_for(:each) unless block_given?

  File.open(path) do |f|
    yield parse(f.readline) until f.eof?
  end
end
53
+
54
+ # Parse single line of Hadoop CSV. The line must end with '\n'.
55
+ def parse(line)
56
+ # So that ragel doesn't try to get it from data.length
57
+ pe = :ignored
58
+ eof = :ignored
59
+ %% write init;
60
+ # % (this fixes syntax highlighting)
61
+ @result = [[]]
62
+ @position = 0
63
+ @states = [:default]
64
+ data = line.unpack('c*')
65
+ p = 0
66
+ pe = data.length
67
+ %% write exec;
68
+ # % (this fixes syntax highlighting)
69
+ @result[0]
70
+ end
71
+
72
+ protected
73
# Called by the state machine on a character that may open a value.
# Only acts at the top level or directly inside a structure; inside a
# string, a byte sequence or any other value the character is payload.
def register_start(position, char_code, next_char_code)
  case @states.last
  when :default, :struct
    @position = position
    process_char(char_code, next_char_code)
  end
end
87
+
88
# Pushes the parser state that corresponds to the opening character.
# "s", "v" and "m" open a nested collection only when followed by "{";
# otherwise nothing is pushed for them.
def process_char(char_code, next_char_code)
  case char_code
  when SINGLE_QUOTE_CODE then @states << :string
  when DASH_CODE then @states << :bytes
  when S_CODE, V_CODE, M_CODE
    return unless next_char_code == OPENING_BRACE_CODE

    @states << :struct
    @result << []
  else
    @states << :other
  end
end
103
+
104
# Called by the state machine on ",", "}" and "\n". Converts the bytes
# between the last recorded start and +position+ (exclusive) and appends
# the value to the collection currently being built.
#
# Raises RuntimeError when the text is not a string, boolean, closing
# brace or number.
def register_end(char_code, data, position)
  # TODO there seems to be ambiguity in the CSV format:
  # unicode string/byte sequence containing the closing brace
  # TODO fix char -> char_code
  last_start = @position
  new_data = data[last_start..position-1].pack("c*")
  case new_data[0]
  when "'"
    # Decode every escape in a single pass. Chained gsub calls would
    # decode twice: "%252C" must yield the text "%2C", not ",".
    unescape = { "%00" => "\0", "%0A" => "\n", "%25" => "%", "%2C" => ",", "%7D" => "}" }
    @result.last << new_data[1..-1].gsub(/%(?:00|0A|25|2C|7D)/, unescape).force_encoding("utf-8")
  when "T", "F"
    @result.last << (new_data == "T")
  when "}"
    # The slice starts at the brace that closed the last element, so
    # the innermost collection is complete: attach it to its parent.
    subresult = @result.pop
    @result.last << subresult
  else
    number = /\A-?\d+(\.)?/.match(new_data)
    raise "CSV error: #{new_data}" unless number
    # A decimal point right after the leading digits marks a float.
    @result.last << (number[1] ? new_data.to_f : new_data.to_i)
  end
  @position = position
  @states.pop
end
138
+ end
139
+ end
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hadoop-csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-16 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: Hadoop CSV format parser.
15
+ email:
16
+ - apohllo@o2.pl
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - Gemfile
23
+ - README.rdoc
24
+ - Rakefile
25
+ - hadoop-csv.gemspec
26
+ - lib/hadoop/csv.rb
27
+ - lib/hadoop/csv.rl
28
+ homepage:
29
+ licenses: []
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ none: false
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ required_rubygems_version: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ requirements: []
47
+ rubyforge_project: hadoop-csv
48
+ rubygems_version: 1.8.24
49
+ signing_key:
50
+ specification_version: 3
51
+ summary: Hadoop CSV format parser.
52
+ test_files: []
53
+ has_rdoc: