wukong 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -98,8 +98,8 @@ FORMAT_GUESSING_LINES = 500
98
98
  # widest column to set
99
99
  MAX_MAX_WIDTH = 100
100
100
 
101
- INT_RE = /\A\d+\z/
102
- FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
101
+ INT_RE = /\A[\d,]+\z/
102
+ FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
103
103
 
104
104
  def get_type val
105
105
  case
@@ -161,7 +161,7 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
161
161
  case type
162
162
  when :mixed, nil then lambda{|s| "%-#{width}s" % s }
163
163
  when :str then lambda{|s| "%-#{width}s" % s }
164
- when :int then lambda{|s| "%#{width}d" % s.to_i }
164
+ when :int then lambda{|s| "%#{width}d" % s.gsub(/\D+/, "").to_i }
165
165
  when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
166
166
  else raise "oops type #{type}" end
167
167
  end
@@ -10,6 +10,7 @@ module Wukong
10
10
  autoload :Streamer, 'wukong/streamer'
11
11
  autoload :Store, 'wukong/store'
12
12
  autoload :FilenamePattern, 'wukong/filename_pattern'
13
+ autoload :Decorator, 'wukong/decorator'
13
14
 
14
15
  def self.run mapper, reducer=nil, options={}
15
16
  Wukong::Script.new(mapper, reducer, options).run
@@ -0,0 +1,28 @@
1
+ require 'java'
2
+
3
+ java_import 'com.cloudera.flume.core.Event'
4
+ java_import 'com.cloudera.flume.core.EventImpl'
5
+ java_import 'com.cloudera.flume.core.EventSinkDecorator'
6
+
7
+ module Wukong
8
+ class Decorator < EventSinkDecorator
9
+
10
+ def initialize(mapper, reducer=nil, options={})
11
+ super(nil)
12
+ @mapper = mapper.new
13
+ end
14
+
15
+ def append(e)
16
+ line = String.from_java_bytes(e.getBody)
17
+ record = @mapper.recordize(line.chomp)
18
+ @mapper.process(*record) do |output|
19
+ processed = output.to_flat.join("\t")
20
+ event = EventImpl.new(processed.to_java_bytes, e.getTimestamp, e.getPriority, e.getNanos, e.getHost, e.getAttrs)
21
+ super event
22
+ end
23
+ end
24
+
25
+ def run() self ; end
26
+
27
+ end
28
+ end
@@ -68,7 +68,7 @@ module Wukong
68
68
  #
69
69
  def self.decode_str str, strategy=:xml
70
70
  case strategy
71
- when :xml then HTMLEntities.decode_entities(str)
71
+ when :xml then self.html_encoder.decode(str)
72
72
  when :url then Addressable::URI.unencode_component(str)
73
73
  else raise "Don't know how to decode with strategy #{strategy}"
74
74
  end
@@ -4,21 +4,14 @@
4
4
  # -*- encoding: utf-8 -*-
5
5
 
6
6
  Gem::Specification.new do |s|
7
- s.name = %q{wukong}
8
- s.version = "2.0.1"
7
+ s.name = "wukong"
8
+ s.version = "2.0.2"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2011-07-01}
13
- s.description = %q{ Treat your dataset like a:
14
-
15
- * stream of lines when it's efficient to process by lines
16
- * stream of field arrays when it's efficient to deal directly with fields
17
- * stream of lightweight objects when it's efficient to deal with objects
18
-
19
- Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
20
- }
21
- s.email = %q{flip@infochimps.org}
12
+ s.date = "2011-11-04"
13
+ s.description = " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
14
+ s.email = "flip@infochimps.org"
22
15
  s.executables = ["hdp-du", "hdp-sync", "hdp-wc", "wu-lign", "wu-sum", "hdp-parts_to_keys.rb"]
23
16
  s.extra_rdoc_files = [
24
17
  "LICENSE.textile",
@@ -180,6 +173,7 @@ Gem::Specification.new do |s|
180
173
  "lib/wukong/datatypes.rb",
181
174
  "lib/wukong/datatypes/enum.rb",
182
175
  "lib/wukong/datatypes/fake_types.rb",
176
+ "lib/wukong/decorator.rb",
183
177
  "lib/wukong/encoding.rb",
184
178
  "lib/wukong/encoding/asciize.rb",
185
179
  "lib/wukong/extensions.rb",
@@ -254,46 +248,10 @@ Gem::Specification.new do |s|
254
248
  "spec/wukong/script_spec.rb",
255
249
  "wukong.gemspec"
256
250
  ]
257
- s.homepage = %q{http://mrflip.github.com/wukong}
251
+ s.homepage = "http://mrflip.github.com/wukong"
258
252
  s.require_paths = ["lib"]
259
- s.rubygems_version = %q{1.5.0}
260
- s.summary = %q{Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.}
261
- s.test_files = [
262
- "examples/contrib/jeans/normalize.rb",
263
- "examples/contrib/jeans/sizes.rb",
264
- "examples/corpus/bucket_counter.rb",
265
- "examples/corpus/dbpedia_abstract_to_sentences.rb",
266
- "examples/corpus/sentence_bigrams.rb",
267
- "examples/corpus/sentence_coocurrence.rb",
268
- "examples/corpus/words_to_bigrams.rb",
269
- "examples/emr/elastic_mapreduce_example.rb",
270
- "examples/ignore_me/counting.rb",
271
- "examples/ignore_me/grouper.rb",
272
- "examples/network_graph/adjacency_list.rb",
273
- "examples/network_graph/breadth_first_search.rb",
274
- "examples/network_graph/gen_2paths.rb",
275
- "examples/network_graph/gen_multi_edge.rb",
276
- "examples/network_graph/gen_symmetric_links.rb",
277
- "examples/pagerank/pagerank.rb",
278
- "examples/pagerank/pagerank_initialize.rb",
279
- "examples/sample_records.rb",
280
- "examples/server_logs/apache_log_parser.rb",
281
- "examples/server_logs/breadcrumbs.rb",
282
- "examples/server_logs/logline.rb",
283
- "examples/server_logs/nook.rb",
284
- "examples/server_logs/nook/faraday_dummy_adapter.rb",
285
- "examples/server_logs/user_agent.rb",
286
- "examples/simple_word_count.rb",
287
- "examples/size.rb",
288
- "examples/stats/avg_value_frequency.rb",
289
- "examples/stats/binning_percentile_estimator.rb",
290
- "examples/stats/rank_and_bin.rb",
291
- "examples/stupidly_simple_filter.rb",
292
- "examples/word_count.rb",
293
- "spec/spec_helper.rb",
294
- "spec/wukong/encoding_spec.rb",
295
- "spec/wukong/script_spec.rb"
296
- ]
253
+ s.rubygems_version = "1.8.10"
254
+ s.summary = "Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease."
297
255
 
298
256
  if s.respond_to? :specification_version then
299
257
  s.specification_version = 3
metadata CHANGED
@@ -1,87 +1,89 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 2.0.2
4
5
  prerelease:
5
- version: 2.0.1
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Philip (flip) Kromer
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-07-01 00:00:00 -05:00
14
- default_executable:
15
- dependencies:
16
- - !ruby/object:Gem::Dependency
12
+ date: 2011-11-04 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
17
15
  name: rspec
18
- prerelease: false
19
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: &2152107000 !ruby/object:Gem::Requirement
20
17
  none: false
21
- requirements:
22
- - - ">="
23
- - !ruby/object:Gem::Version
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
24
21
  version: 1.2.9
25
22
  type: :development
26
- version_requirements: *id001
27
- - !ruby/object:Gem::Dependency
28
- name: yard
29
23
  prerelease: false
30
- requirement: &id002 !ruby/object:Gem::Requirement
24
+ version_requirements: *2152107000
25
+ - !ruby/object:Gem::Dependency
26
+ name: yard
27
+ requirement: &2152105540 !ruby/object:Gem::Requirement
31
28
  none: false
32
- requirements:
33
- - - ">="
34
- - !ruby/object:Gem::Version
35
- version: "0"
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
36
33
  type: :development
37
- version_requirements: *id002
38
- - !ruby/object:Gem::Dependency
39
- name: addressable
40
34
  prerelease: false
41
- requirement: &id003 !ruby/object:Gem::Requirement
35
+ version_requirements: *2152105540
36
+ - !ruby/object:Gem::Dependency
37
+ name: addressable
38
+ requirement: &2152104640 !ruby/object:Gem::Requirement
42
39
  none: false
43
- requirements:
44
- - - ">="
45
- - !ruby/object:Gem::Version
46
- version: "0"
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
47
44
  type: :runtime
48
- version_requirements: *id003
49
- - !ruby/object:Gem::Dependency
50
- name: extlib
51
45
  prerelease: false
52
- requirement: &id004 !ruby/object:Gem::Requirement
46
+ version_requirements: *2152104640
47
+ - !ruby/object:Gem::Dependency
48
+ name: extlib
49
+ requirement: &2152103820 !ruby/object:Gem::Requirement
53
50
  none: false
54
- requirements:
55
- - - ">="
56
- - !ruby/object:Gem::Version
57
- version: "0"
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
58
55
  type: :runtime
59
- version_requirements: *id004
60
- - !ruby/object:Gem::Dependency
61
- name: htmlentities
62
56
  prerelease: false
63
- requirement: &id005 !ruby/object:Gem::Requirement
57
+ version_requirements: *2152103820
58
+ - !ruby/object:Gem::Dependency
59
+ name: htmlentities
60
+ requirement: &2152102940 !ruby/object:Gem::Requirement
64
61
  none: false
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: "0"
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
69
66
  type: :runtime
70
- version_requirements: *id005
71
- - !ruby/object:Gem::Dependency
72
- name: configliere
73
67
  prerelease: false
74
- requirement: &id006 !ruby/object:Gem::Requirement
68
+ version_requirements: *2152102940
69
+ - !ruby/object:Gem::Dependency
70
+ name: configliere
71
+ requirement: &2152101420 !ruby/object:Gem::Requirement
75
72
  none: false
76
- requirements:
77
- - - ">="
78
- - !ruby/object:Gem::Version
79
- version: "0"
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
80
77
  type: :runtime
81
- version_requirements: *id006
82
- description: " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
78
+ prerelease: false
79
+ version_requirements: *2152101420
80
+ description: ! " Treat your dataset like a:\n\n * stream of lines when it's
81
+ efficient to process by lines\n * stream of field arrays when it's efficient
82
+ to deal directly with fields\n * stream of lightweight objects when it's efficient
83
+ to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
84
+ language, and the cat on your command line.\n"
83
85
  email: flip@infochimps.org
84
- executables:
86
+ executables:
85
87
  - hdp-du
86
88
  - hdp-sync
87
89
  - hdp-wc
@@ -89,11 +91,10 @@ executables:
89
91
  - wu-sum
90
92
  - hdp-parts_to_keys.rb
91
93
  extensions: []
92
-
93
- extra_rdoc_files:
94
+ extra_rdoc_files:
94
95
  - LICENSE.textile
95
96
  - README.textile
96
- files:
97
+ files:
97
98
  - CHANGELOG.textile
98
99
  - INSTALL.textile
99
100
  - LICENSE.textile
@@ -249,6 +250,7 @@ files:
249
250
  - lib/wukong/datatypes.rb
250
251
  - lib/wukong/datatypes/enum.rb
251
252
  - lib/wukong/datatypes/fake_types.rb
253
+ - lib/wukong/decorator.rb
252
254
  - lib/wukong/encoding.rb
253
255
  - lib/wukong/encoding/asciize.rb
254
256
  - lib/wukong/extensions.rb
@@ -322,66 +324,29 @@ files:
322
324
  - spec/wukong/encoding_spec.rb
323
325
  - spec/wukong/script_spec.rb
324
326
  - wukong.gemspec
325
- has_rdoc: true
326
327
  homepage: http://mrflip.github.com/wukong
327
328
  licenses: []
328
-
329
329
  post_install_message:
330
330
  rdoc_options: []
331
-
332
- require_paths:
331
+ require_paths:
333
332
  - lib
334
- required_ruby_version: !ruby/object:Gem::Requirement
333
+ required_ruby_version: !ruby/object:Gem::Requirement
335
334
  none: false
336
- requirements:
337
- - - ">="
338
- - !ruby/object:Gem::Version
339
- version: "0"
340
- required_rubygems_version: !ruby/object:Gem::Requirement
335
+ requirements:
336
+ - - ! '>='
337
+ - !ruby/object:Gem::Version
338
+ version: '0'
339
+ required_rubygems_version: !ruby/object:Gem::Requirement
341
340
  none: false
342
- requirements:
343
- - - ">="
344
- - !ruby/object:Gem::Version
345
- version: "0"
341
+ requirements:
342
+ - - ! '>='
343
+ - !ruby/object:Gem::Version
344
+ version: '0'
346
345
  requirements: []
347
-
348
346
  rubyforge_project:
349
- rubygems_version: 1.5.0
347
+ rubygems_version: 1.8.10
350
348
  signing_key:
351
349
  specification_version: 3
352
- summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.
353
- test_files:
354
- - examples/contrib/jeans/normalize.rb
355
- - examples/contrib/jeans/sizes.rb
356
- - examples/corpus/bucket_counter.rb
357
- - examples/corpus/dbpedia_abstract_to_sentences.rb
358
- - examples/corpus/sentence_bigrams.rb
359
- - examples/corpus/sentence_coocurrence.rb
360
- - examples/corpus/words_to_bigrams.rb
361
- - examples/emr/elastic_mapreduce_example.rb
362
- - examples/ignore_me/counting.rb
363
- - examples/ignore_me/grouper.rb
364
- - examples/network_graph/adjacency_list.rb
365
- - examples/network_graph/breadth_first_search.rb
366
- - examples/network_graph/gen_2paths.rb
367
- - examples/network_graph/gen_multi_edge.rb
368
- - examples/network_graph/gen_symmetric_links.rb
369
- - examples/pagerank/pagerank.rb
370
- - examples/pagerank/pagerank_initialize.rb
371
- - examples/sample_records.rb
372
- - examples/server_logs/apache_log_parser.rb
373
- - examples/server_logs/breadcrumbs.rb
374
- - examples/server_logs/logline.rb
375
- - examples/server_logs/nook.rb
376
- - examples/server_logs/nook/faraday_dummy_adapter.rb
377
- - examples/server_logs/user_agent.rb
378
- - examples/simple_word_count.rb
379
- - examples/size.rb
380
- - examples/stats/avg_value_frequency.rb
381
- - examples/stats/binning_percentile_estimator.rb
382
- - examples/stats/rank_and_bin.rb
383
- - examples/stupidly_simple_filter.rb
384
- - examples/word_count.rb
385
- - spec/spec_helper.rb
386
- - spec/wukong/encoding_spec.rb
387
- - spec/wukong/script_spec.rb
350
+ summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
351
+ it, yet handles terabyte-scale computation with ease.
352
+ test_files: []