wukong 2.0.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/wu-lign +3 -3
- data/lib/wukong.rb +1 -0
- data/lib/wukong/decorator.rb +28 -0
- data/lib/wukong/encoding.rb +1 -1
- data/wukong.gemspec +9 -51
- metadata +77 -112
data/bin/wu-lign
CHANGED
@@ -98,8 +98,8 @@ FORMAT_GUESSING_LINES = 500
|
|
98
98
|
# widest column to set
|
99
99
|
MAX_MAX_WIDTH = 100
|
100
100
|
|
101
|
-
INT_RE = /\A\d+\z/
|
102
|
-
FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
|
101
|
+
INT_RE = /\A[\d,]+\z/
|
102
|
+
FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
|
103
103
|
|
104
104
|
def get_type val
|
105
105
|
case
|
@@ -161,7 +161,7 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
|
|
161
161
|
case type
|
162
162
|
when :mixed, nil then lambda{|s| "%-#{width}s" % s }
|
163
163
|
when :str then lambda{|s| "%-#{width}s" % s }
|
164
|
-
when :int then lambda{|s| "%#{width}d" % s.to_i }
|
164
|
+
when :int then lambda{|s| "%#{width}d" % s.gsub(/\D+/, "").to_i }
|
165
165
|
when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
|
166
166
|
else raise "oops type #{type}" end
|
167
167
|
end
|
data/lib/wukong.rb
CHANGED
@@ -10,6 +10,7 @@ module Wukong
|
|
10
10
|
autoload :Streamer, 'wukong/streamer'
|
11
11
|
autoload :Store, 'wukong/store'
|
12
12
|
autoload :FilenamePattern, 'wukong/filename_pattern'
|
13
|
+
autoload :Decorator, 'wukong/decorator'
|
13
14
|
|
14
15
|
def self.run mapper, reducer=nil, options={}
|
15
16
|
Wukong::Script.new(mapper, reducer, options).run
|
data/lib/wukong/decorator.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
java_import 'com.cloudera.flume.core.Event'
|
4
|
+
java_import 'com.cloudera.flume.core.EventImpl'
|
5
|
+
java_import 'com.cloudera.flume.core.EventSinkDecorator'
|
6
|
+
|
7
|
+
module Wukong
|
8
|
+
class Decorator < EventSinkDecorator
|
9
|
+
|
10
|
+
def initialize(mapper, reducer=nil, options={})
|
11
|
+
super(nil)
|
12
|
+
@mapper = mapper.new
|
13
|
+
end
|
14
|
+
|
15
|
+
def append(e)
|
16
|
+
line = String.from_java_bytes(e.getBody)
|
17
|
+
record = @mapper.recordize(line.chomp)
|
18
|
+
@mapper.process(*record) do |output|
|
19
|
+
processed = output.to_flat.join("\t")
|
20
|
+
event = EventImpl.new(processed.to_java_bytes, e.getTimestamp, e.getPriority, e.getNanos, e.getHost, e.getAttrs)
|
21
|
+
super event
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def run() self ; end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
data/lib/wukong/encoding.rb
CHANGED
@@ -68,7 +68,7 @@ module Wukong
|
|
68
68
|
#
|
69
69
|
def self.decode_str str, strategy=:xml
|
70
70
|
case strategy
|
71
|
-
when :xml then
|
71
|
+
when :xml then self.html_encoder.decode(str)
|
72
72
|
when :url then Addressable::URI.unencode_component(str)
|
73
73
|
else raise "Don't know how to decode with strategy #{strategy}"
|
74
74
|
end
|
data/wukong.gemspec
CHANGED
@@ -4,21 +4,14 @@
|
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
|
-
s.name = %q{wukong}
|
8
|
-
s.version = "2.0.1"
|
7
|
+
s.name = "wukong"
|
8
|
+
s.version = "2.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date =
|
13
|
-
s.description =
|
14
|
-
|
15
|
-
* stream of lines when it's efficient to process by lines
|
16
|
-
* stream of field arrays when it's efficient to deal directly with fields
|
17
|
-
* stream of lightweight objects when it's efficient to deal with objects
|
18
|
-
|
19
|
-
Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
|
20
|
-
}
|
21
|
-
s.email = %q{flip@infochimps.org}
|
12
|
+
s.date = "2011-11-04"
|
13
|
+
s.description = " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
|
14
|
+
s.email = "flip@infochimps.org"
|
22
15
|
s.executables = ["hdp-du", "hdp-sync", "hdp-wc", "wu-lign", "wu-sum", "hdp-parts_to_keys.rb"]
|
23
16
|
s.extra_rdoc_files = [
|
24
17
|
"LICENSE.textile",
|
@@ -180,6 +173,7 @@ Gem::Specification.new do |s|
|
|
180
173
|
"lib/wukong/datatypes.rb",
|
181
174
|
"lib/wukong/datatypes/enum.rb",
|
182
175
|
"lib/wukong/datatypes/fake_types.rb",
|
176
|
+
"lib/wukong/decorator.rb",
|
183
177
|
"lib/wukong/encoding.rb",
|
184
178
|
"lib/wukong/encoding/asciize.rb",
|
185
179
|
"lib/wukong/extensions.rb",
|
@@ -254,46 +248,10 @@ Gem::Specification.new do |s|
|
|
254
248
|
"spec/wukong/script_spec.rb",
|
255
249
|
"wukong.gemspec"
|
256
250
|
]
|
257
|
-
s.homepage = %q{http://mrflip.github.com/wukong}
|
251
|
+
s.homepage = "http://mrflip.github.com/wukong"
|
258
252
|
s.require_paths = ["lib"]
|
259
|
-
s.rubygems_version =
|
260
|
-
s.summary =
|
261
|
-
s.test_files = [
|
262
|
-
"examples/contrib/jeans/normalize.rb",
|
263
|
-
"examples/contrib/jeans/sizes.rb",
|
264
|
-
"examples/corpus/bucket_counter.rb",
|
265
|
-
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
266
|
-
"examples/corpus/sentence_bigrams.rb",
|
267
|
-
"examples/corpus/sentence_coocurrence.rb",
|
268
|
-
"examples/corpus/words_to_bigrams.rb",
|
269
|
-
"examples/emr/elastic_mapreduce_example.rb",
|
270
|
-
"examples/ignore_me/counting.rb",
|
271
|
-
"examples/ignore_me/grouper.rb",
|
272
|
-
"examples/network_graph/adjacency_list.rb",
|
273
|
-
"examples/network_graph/breadth_first_search.rb",
|
274
|
-
"examples/network_graph/gen_2paths.rb",
|
275
|
-
"examples/network_graph/gen_multi_edge.rb",
|
276
|
-
"examples/network_graph/gen_symmetric_links.rb",
|
277
|
-
"examples/pagerank/pagerank.rb",
|
278
|
-
"examples/pagerank/pagerank_initialize.rb",
|
279
|
-
"examples/sample_records.rb",
|
280
|
-
"examples/server_logs/apache_log_parser.rb",
|
281
|
-
"examples/server_logs/breadcrumbs.rb",
|
282
|
-
"examples/server_logs/logline.rb",
|
283
|
-
"examples/server_logs/nook.rb",
|
284
|
-
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
285
|
-
"examples/server_logs/user_agent.rb",
|
286
|
-
"examples/simple_word_count.rb",
|
287
|
-
"examples/size.rb",
|
288
|
-
"examples/stats/avg_value_frequency.rb",
|
289
|
-
"examples/stats/binning_percentile_estimator.rb",
|
290
|
-
"examples/stats/rank_and_bin.rb",
|
291
|
-
"examples/stupidly_simple_filter.rb",
|
292
|
-
"examples/word_count.rb",
|
293
|
-
"spec/spec_helper.rb",
|
294
|
-
"spec/wukong/encoding_spec.rb",
|
295
|
-
"spec/wukong/script_spec.rb"
|
296
|
-
]
|
253
|
+
s.rubygems_version = "1.8.10"
|
254
|
+
s.summary = "Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease."
|
297
255
|
|
298
256
|
if s.respond_to? :specification_version then
|
299
257
|
s.specification_version = 3
|
metadata
CHANGED
@@ -1,87 +1,89 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.2
|
4
5
|
prerelease:
|
5
|
-
version: 2.0.1
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Philip (flip) Kromer
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
dependencies:
|
16
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2011-11-04 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
17
15
|
name: rspec
|
18
|
-
|
19
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &2152107000 !ruby/object:Gem::Requirement
|
20
17
|
none: false
|
21
|
-
requirements:
|
22
|
-
- -
|
23
|
-
- !ruby/object:Gem::Version
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
24
21
|
version: 1.2.9
|
25
22
|
type: :development
|
26
|
-
version_requirements: *id001
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: yard
|
29
23
|
prerelease: false
|
30
|
-
|
24
|
+
version_requirements: *2152107000
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
requirement: &2152105540 !ruby/object:Gem::Requirement
|
31
28
|
none: false
|
32
|
-
requirements:
|
33
|
-
- -
|
34
|
-
- !ruby/object:Gem::Version
|
35
|
-
version:
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
36
33
|
type: :development
|
37
|
-
version_requirements: *id002
|
38
|
-
- !ruby/object:Gem::Dependency
|
39
|
-
name: addressable
|
40
34
|
prerelease: false
|
41
|
-
|
35
|
+
version_requirements: *2152105540
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: addressable
|
38
|
+
requirement: &2152104640 !ruby/object:Gem::Requirement
|
42
39
|
none: false
|
43
|
-
requirements:
|
44
|
-
- -
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version:
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
47
44
|
type: :runtime
|
48
|
-
version_requirements: *id003
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: extlib
|
51
45
|
prerelease: false
|
52
|
-
|
46
|
+
version_requirements: *2152104640
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: extlib
|
49
|
+
requirement: &2152103820 !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
|
-
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
57
|
-
version:
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
58
55
|
type: :runtime
|
59
|
-
version_requirements: *id004
|
60
|
-
- !ruby/object:Gem::Dependency
|
61
|
-
name: htmlentities
|
62
56
|
prerelease: false
|
63
|
-
|
57
|
+
version_requirements: *2152103820
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: htmlentities
|
60
|
+
requirement: &2152102940 !ruby/object:Gem::Requirement
|
64
61
|
none: false
|
65
|
-
requirements:
|
66
|
-
- -
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version:
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
69
66
|
type: :runtime
|
70
|
-
version_requirements: *id005
|
71
|
-
- !ruby/object:Gem::Dependency
|
72
|
-
name: configliere
|
73
67
|
prerelease: false
|
74
|
-
|
68
|
+
version_requirements: *2152102940
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: configliere
|
71
|
+
requirement: &2152101420 !ruby/object:Gem::Requirement
|
75
72
|
none: false
|
76
|
-
requirements:
|
77
|
-
- -
|
78
|
-
- !ruby/object:Gem::Version
|
79
|
-
version:
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
80
77
|
type: :runtime
|
81
|
-
|
82
|
-
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *2152101420
|
80
|
+
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
81
|
+
efficient to process by lines\n * stream of field arrays when it's efficient
|
82
|
+
to deal directly with fields\n * stream of lightweight objects when it's efficient
|
83
|
+
to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
|
84
|
+
language, and the cat on your command line.\n"
|
83
85
|
email: flip@infochimps.org
|
84
|
-
executables:
|
86
|
+
executables:
|
85
87
|
- hdp-du
|
86
88
|
- hdp-sync
|
87
89
|
- hdp-wc
|
@@ -89,11 +91,10 @@ executables:
|
|
89
91
|
- wu-sum
|
90
92
|
- hdp-parts_to_keys.rb
|
91
93
|
extensions: []
|
92
|
-
|
93
|
-
extra_rdoc_files:
|
94
|
+
extra_rdoc_files:
|
94
95
|
- LICENSE.textile
|
95
96
|
- README.textile
|
96
|
-
files:
|
97
|
+
files:
|
97
98
|
- CHANGELOG.textile
|
98
99
|
- INSTALL.textile
|
99
100
|
- LICENSE.textile
|
@@ -249,6 +250,7 @@ files:
|
|
249
250
|
- lib/wukong/datatypes.rb
|
250
251
|
- lib/wukong/datatypes/enum.rb
|
251
252
|
- lib/wukong/datatypes/fake_types.rb
|
253
|
+
- lib/wukong/decorator.rb
|
252
254
|
- lib/wukong/encoding.rb
|
253
255
|
- lib/wukong/encoding/asciize.rb
|
254
256
|
- lib/wukong/extensions.rb
|
@@ -322,66 +324,29 @@ files:
|
|
322
324
|
- spec/wukong/encoding_spec.rb
|
323
325
|
- spec/wukong/script_spec.rb
|
324
326
|
- wukong.gemspec
|
325
|
-
has_rdoc: true
|
326
327
|
homepage: http://mrflip.github.com/wukong
|
327
328
|
licenses: []
|
328
|
-
|
329
329
|
post_install_message:
|
330
330
|
rdoc_options: []
|
331
|
-
|
332
|
-
require_paths:
|
331
|
+
require_paths:
|
333
332
|
- lib
|
334
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
333
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
335
334
|
none: false
|
336
|
-
requirements:
|
337
|
-
- -
|
338
|
-
- !ruby/object:Gem::Version
|
339
|
-
version:
|
340
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - ! '>='
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: '0'
|
339
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
341
340
|
none: false
|
342
|
-
requirements:
|
343
|
-
- -
|
344
|
-
- !ruby/object:Gem::Version
|
345
|
-
version:
|
341
|
+
requirements:
|
342
|
+
- - ! '>='
|
343
|
+
- !ruby/object:Gem::Version
|
344
|
+
version: '0'
|
346
345
|
requirements: []
|
347
|
-
|
348
346
|
rubyforge_project:
|
349
|
-
rubygems_version: 1.
|
347
|
+
rubygems_version: 1.8.10
|
350
348
|
signing_key:
|
351
349
|
specification_version: 3
|
352
|
-
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|
353
|
-
|
354
|
-
|
355
|
-
- examples/contrib/jeans/sizes.rb
|
356
|
-
- examples/corpus/bucket_counter.rb
|
357
|
-
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
358
|
-
- examples/corpus/sentence_bigrams.rb
|
359
|
-
- examples/corpus/sentence_coocurrence.rb
|
360
|
-
- examples/corpus/words_to_bigrams.rb
|
361
|
-
- examples/emr/elastic_mapreduce_example.rb
|
362
|
-
- examples/ignore_me/counting.rb
|
363
|
-
- examples/ignore_me/grouper.rb
|
364
|
-
- examples/network_graph/adjacency_list.rb
|
365
|
-
- examples/network_graph/breadth_first_search.rb
|
366
|
-
- examples/network_graph/gen_2paths.rb
|
367
|
-
- examples/network_graph/gen_multi_edge.rb
|
368
|
-
- examples/network_graph/gen_symmetric_links.rb
|
369
|
-
- examples/pagerank/pagerank.rb
|
370
|
-
- examples/pagerank/pagerank_initialize.rb
|
371
|
-
- examples/sample_records.rb
|
372
|
-
- examples/server_logs/apache_log_parser.rb
|
373
|
-
- examples/server_logs/breadcrumbs.rb
|
374
|
-
- examples/server_logs/logline.rb
|
375
|
-
- examples/server_logs/nook.rb
|
376
|
-
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
377
|
-
- examples/server_logs/user_agent.rb
|
378
|
-
- examples/simple_word_count.rb
|
379
|
-
- examples/size.rb
|
380
|
-
- examples/stats/avg_value_frequency.rb
|
381
|
-
- examples/stats/binning_percentile_estimator.rb
|
382
|
-
- examples/stats/rank_and_bin.rb
|
383
|
-
- examples/stupidly_simple_filter.rb
|
384
|
-
- examples/word_count.rb
|
385
|
-
- spec/spec_helper.rb
|
386
|
-
- spec/wukong/encoding_spec.rb
|
387
|
-
- spec/wukong/script_spec.rb
|
350
|
+
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|
351
|
+
it, yet handles terabyte-scale computation with ease.
|
352
|
+
test_files: []
|