wukong 2.0.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -98,8 +98,8 @@ FORMAT_GUESSING_LINES = 500
98
98
  # widest column to set
99
99
  MAX_MAX_WIDTH = 100
100
100
 
101
- INT_RE = /\A\d+\z/
102
- FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
101
+ INT_RE = /\A[\d,]+\z/
102
+ FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
103
103
 
104
104
  def get_type val
105
105
  case
@@ -161,7 +161,7 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
161
161
  case type
162
162
  when :mixed, nil then lambda{|s| "%-#{width}s" % s }
163
163
  when :str then lambda{|s| "%-#{width}s" % s }
164
- when :int then lambda{|s| "%#{width}d" % s.to_i }
164
+ when :int then lambda{|s| "%#{width}d" % s.gsub(/\D+/, "").to_i }
165
165
  when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
166
166
  else raise "oops type #{type}" end
167
167
  end
@@ -10,6 +10,7 @@ module Wukong
10
10
  autoload :Streamer, 'wukong/streamer'
11
11
  autoload :Store, 'wukong/store'
12
12
  autoload :FilenamePattern, 'wukong/filename_pattern'
13
+ autoload :Decorator, 'wukong/decorator'
13
14
 
14
15
  def self.run mapper, reducer=nil, options={}
15
16
  Wukong::Script.new(mapper, reducer, options).run
@@ -0,0 +1,28 @@
1
+ require 'java'
2
+
3
+ java_import 'com.cloudera.flume.core.Event'
4
+ java_import 'com.cloudera.flume.core.EventImpl'
5
+ java_import 'com.cloudera.flume.core.EventSinkDecorator'
6
+
7
+ module Wukong
8
+ class Decorator < EventSinkDecorator
9
+
10
+ def initialize(mapper, reducer=nil, options={})
11
+ super(nil)
12
+ @mapper = mapper.new
13
+ end
14
+
15
+ def append(e)
16
+ line = String.from_java_bytes(e.getBody)
17
+ record = @mapper.recordize(line.chomp)
18
+ @mapper.process(*record) do |output|
19
+ processed = output.to_flat.join("\t")
20
+ event = EventImpl.new(processed.to_java_bytes, e.getTimestamp, e.getPriority, e.getNanos, e.getHost, e.getAttrs)
21
+ super event
22
+ end
23
+ end
24
+
25
+ def run() self ; end
26
+
27
+ end
28
+ end
@@ -68,7 +68,7 @@ module Wukong
68
68
  #
69
69
  def self.decode_str str, strategy=:xml
70
70
  case strategy
71
- when :xml then HTMLEntities.decode_entities(str)
71
+ when :xml then self.html_encoder.decode(str)
72
72
  when :url then Addressable::URI.unencode_component(str)
73
73
  else raise "Don't know how to decode with strategy #{strategy}"
74
74
  end
@@ -4,21 +4,14 @@
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
- s.name = %q{wukong}
8
- s.version = "2.0.1"
7
+ s.name = "wukong"
8
+ s.version = "2.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2011-07-01}
13
- s.description = %q{ Treat your dataset like a:
14
-
15
- * stream of lines when it's efficient to process by lines
16
- * stream of field arrays when it's efficient to deal directly with fields
17
- * stream of lightweight objects when it's efficient to deal with objects
18
-
19
- Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
20
- }
21
- s.email = %q{flip@infochimps.org}
12
+ s.date = "2011-11-04"
13
+ s.description = " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
14
+ s.email = "flip@infochimps.org"
22
15
  s.executables = ["hdp-du", "hdp-sync", "hdp-wc", "wu-lign", "wu-sum", "hdp-parts_to_keys.rb"]
23
16
  s.extra_rdoc_files = [
24
17
  "LICENSE.textile",
@@ -180,6 +173,7 @@ Gem::Specification.new do |s|
180
173
  "lib/wukong/datatypes.rb",
181
174
  "lib/wukong/datatypes/enum.rb",
182
175
  "lib/wukong/datatypes/fake_types.rb",
176
+ "lib/wukong/decorator.rb",
183
177
  "lib/wukong/encoding.rb",
184
178
  "lib/wukong/encoding/asciize.rb",
185
179
  "lib/wukong/extensions.rb",
@@ -254,46 +248,10 @@ Gem::Specification.new do |s|
254
248
  "spec/wukong/script_spec.rb",
255
249
  "wukong.gemspec"
256
250
  ]
257
- s.homepage = %q{http://mrflip.github.com/wukong}
251
+ s.homepage = "http://mrflip.github.com/wukong"
258
252
  s.require_paths = ["lib"]
259
- s.rubygems_version = %q{1.5.0}
260
- s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
261
- s.test_files = [
262
- "examples/contrib/jeans/normalize.rb",
263
- "examples/contrib/jeans/sizes.rb",
264
- "examples/corpus/bucket_counter.rb",
265
- "examples/corpus/dbpedia_abstract_to_sentences.rb",
266
- "examples/corpus/sentence_bigrams.rb",
267
- "examples/corpus/sentence_coocurrence.rb",
268
- "examples/corpus/words_to_bigrams.rb",
269
- "examples/emr/elastic_mapreduce_example.rb",
270
- "examples/ignore_me/counting.rb",
271
- "examples/ignore_me/grouper.rb",
272
- "examples/network_graph/adjacency_list.rb",
273
- "examples/network_graph/breadth_first_search.rb",
274
- "examples/network_graph/gen_2paths.rb",
275
- "examples/network_graph/gen_multi_edge.rb",
276
- "examples/network_graph/gen_symmetric_links.rb",
277
- "examples/pagerank/pagerank.rb",
278
- "examples/pagerank/pagerank_initialize.rb",
279
- "examples/sample_records.rb",
280
- "examples/server_logs/apache_log_parser.rb",
281
- "examples/server_logs/breadcrumbs.rb",
282
- "examples/server_logs/logline.rb",
283
- "examples/server_logs/nook.rb",
284
- "examples/server_logs/nook/faraday_dummy_adapter.rb",
285
- "examples/server_logs/user_agent.rb",
286
- "examples/simple_word_count.rb",
287
- "examples/size.rb",
288
- "examples/stats/avg_value_frequency.rb",
289
- "examples/stats/binning_percentile_estimator.rb",
290
- "examples/stats/rank_and_bin.rb",
291
- "examples/stupidly_simple_filter.rb",
292
- "examples/word_count.rb",
293
- "spec/spec_helper.rb",
294
- "spec/wukong/encoding_spec.rb",
295
- "spec/wukong/script_spec.rb"
296
- ]
253
+ s.rubygems_version = "1.8.10"
254
+ s.summary = "Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease."
297
255
 
298
256
  if s.respond_to? :specification_version then
299
257
  s.specification_version = 3
metadata CHANGED
@@ -1,87 +1,89 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.2
4
5
  prerelease:
5
- version: 2.0.1
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Philip (flip) Kromer
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-07-01 00:00:00 -05:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2011-11-04 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: rspec
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: &2152107000 !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
24
21
  version: 1.2.9
25
22
  type: :development
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: yard
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: *2152107000
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ requirement: &2152105540 !ruby/object:Gem::Requirement
31
28
  none: false
32
- requirements:
33
- - - ">="
34
- - !ruby/object:Gem::Version
35
- version: "0"
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
36
33
  type: :development
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: addressable
40
34
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
35
+ version_requirements: *2152105540
36
+ - !ruby/object:Gem::Dependency
37
+ name: addressable
38
+ requirement: &2152104640 !ruby/object:Gem::Requirement
42
39
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: "0"
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
47
44
  type: :runtime
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: extlib
51
45
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
46
+ version_requirements: *2152104640
47
+ - !ruby/object:Gem::Dependency
48
+ name: extlib
49
+ requirement: &2152103820 !ruby/object:Gem::Requirement
53
50
  none: false
54
- requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: "0"
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
58
55
  type: :runtime
59
- version_requirements: *id004
60
- - !ruby/object:Gem::Dependency
61
- name: htmlentities
62
56
  prerelease: false
63
- requirement: &id005 !ruby/object:Gem::Requirement
57
+ version_requirements: *2152103820
58
+ - !ruby/object:Gem::Dependency
59
+ name: htmlentities
60
+ requirement: &2152102940 !ruby/object:Gem::Requirement
64
61
  none: false
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: "0"
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
69
66
  type: :runtime
70
- version_requirements: *id005
71
- - !ruby/object:Gem::Dependency
72
- name: configliere
73
67
  prerelease: false
74
- requirement: &id006 !ruby/object:Gem::Requirement
68
+ version_requirements: *2152102940
69
+ - !ruby/object:Gem::Dependency
70
+ name: configliere
71
+ requirement: &2152101420 !ruby/object:Gem::Requirement
75
72
  none: false
76
- requirements:
77
- - - ">="
78
- - !ruby/object:Gem::Version
79
- version: "0"
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
80
77
  type: :runtime
81
- version_requirements: *id006
82
- description: " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
78
+ prerelease: false
79
+ version_requirements: *2152101420
80
+ description: ! " Treat your dataset like a:\n\n * stream of lines when it's
81
+ efficient to process by lines\n * stream of field arrays when it's efficient
82
+ to deal directly with fields\n * stream of lightweight objects when it's efficient
83
+ to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
84
+ language, and the cat on your command line.\n"
83
85
  email: flip@infochimps.org
84
- executables:
86
+ executables:
85
87
  - hdp-du
86
88
  - hdp-sync
87
89
  - hdp-wc
@@ -89,11 +91,10 @@ executables:
89
91
  - wu-sum
90
92
  - hdp-parts_to_keys.rb
91
93
  extensions: []
92
-
93
- extra_rdoc_files:
94
+ extra_rdoc_files:
94
95
  - LICENSE.textile
95
96
  - README.textile
96
- files:
97
+ files:
97
98
  - CHANGELOG.textile
98
99
  - INSTALL.textile
99
100
  - LICENSE.textile
@@ -249,6 +250,7 @@ files:
249
250
  - lib/wukong/datatypes.rb
250
251
  - lib/wukong/datatypes/enum.rb
251
252
  - lib/wukong/datatypes/fake_types.rb
253
+ - lib/wukong/decorator.rb
252
254
  - lib/wukong/encoding.rb
253
255
  - lib/wukong/encoding/asciize.rb
254
256
  - lib/wukong/extensions.rb
@@ -322,66 +324,29 @@ files:
322
324
  - spec/wukong/encoding_spec.rb
323
325
  - spec/wukong/script_spec.rb
324
326
  - wukong.gemspec
325
- has_rdoc: true
326
327
  homepage: http://mrflip.github.com/wukong
327
328
  licenses: []
328
-
329
329
  post_install_message:
330
330
  rdoc_options: []
331
-
332
- require_paths:
331
+ require_paths:
333
332
  - lib
334
- required_ruby_version: !ruby/object:Gem::Requirement
333
+ required_ruby_version: !ruby/object:Gem::Requirement
335
334
  none: false
336
- requirements:
337
- - - ">="
338
- - !ruby/object:Gem::Version
339
- version: "0"
340
- required_rubygems_version: !ruby/object:Gem::Requirement
335
+ requirements:
336
+ - - ! '>='
337
+ - !ruby/object:Gem::Version
338
+ version: '0'
339
+ required_rubygems_version: !ruby/object:Gem::Requirement
341
340
  none: false
342
- requirements:
343
- - - ">="
344
- - !ruby/object:Gem::Version
345
- version: "0"
341
+ requirements:
342
+ - - ! '>='
343
+ - !ruby/object:Gem::Version
344
+ version: '0'
346
345
  requirements: []
347
-
348
346
  rubyforge_project:
349
- rubygems_version: 1.5.0
347
+ rubygems_version: 1.8.10
350
348
  signing_key:
351
349
  specification_version: 3
352
- summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
353
- test_files:
354
- - examples/contrib/jeans/normalize.rb
355
- - examples/contrib/jeans/sizes.rb
356
- - examples/corpus/bucket_counter.rb
357
- - examples/corpus/dbpedia_abstract_to_sentences.rb
358
- - examples/corpus/sentence_bigrams.rb
359
- - examples/corpus/sentence_coocurrence.rb
360
- - examples/corpus/words_to_bigrams.rb
361
- - examples/emr/elastic_mapreduce_example.rb
362
- - examples/ignore_me/counting.rb
363
- - examples/ignore_me/grouper.rb
364
- - examples/network_graph/adjacency_list.rb
365
- - examples/network_graph/breadth_first_search.rb
366
- - examples/network_graph/gen_2paths.rb
367
- - examples/network_graph/gen_multi_edge.rb
368
- - examples/network_graph/gen_symmetric_links.rb
369
- - examples/pagerank/pagerank.rb
370
- - examples/pagerank/pagerank_initialize.rb
371
- - examples/sample_records.rb
372
- - examples/server_logs/apache_log_parser.rb
373
- - examples/server_logs/breadcrumbs.rb
374
- - examples/server_logs/logline.rb
375
- - examples/server_logs/nook.rb
376
- - examples/server_logs/nook/faraday_dummy_adapter.rb
377
- - examples/server_logs/user_agent.rb
378
- - examples/simple_word_count.rb
379
- - examples/size.rb
380
- - examples/stats/avg_value_frequency.rb
381
- - examples/stats/binning_percentile_estimator.rb
382
- - examples/stats/rank_and_bin.rb
383
- - examples/stupidly_simple_filter.rb
384
- - examples/word_count.rb
385
- - spec/spec_helper.rb
386
- - spec/wukong/encoding_spec.rb
387
- - spec/wukong/script_spec.rb
350
+ summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
351
+ it, yet handles terabyte-scale computation with ease.
352
+ test_files: []