wukong 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/wu-lign +3 -3
- data/lib/wukong.rb +1 -0
- data/lib/wukong/decorator.rb +28 -0
- data/lib/wukong/encoding.rb +1 -1
- data/wukong.gemspec +9 -51
- metadata +77 -112
data/bin/wu-lign
CHANGED
@@ -98,8 +98,8 @@ FORMAT_GUESSING_LINES = 500
|
|
98
98
|
# widest column to set
|
99
99
|
MAX_MAX_WIDTH = 100
|
100
100
|
|
101
|
-
INT_RE = /\A\d+\z/
|
102
|
-
FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
|
101
|
+
INT_RE = /\A[\d,]+\z/
|
102
|
+
FLOAT_RE = /\A([\d,]+)(?:\.(\d+))?(?:e-?\d+)?\z/
|
103
103
|
|
104
104
|
def get_type val
|
105
105
|
case
|
@@ -161,7 +161,7 @@ format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type,
|
|
161
161
|
case type
|
162
162
|
when :mixed, nil then lambda{|s| "%-#{width}s" % s }
|
163
163
|
when :str then lambda{|s| "%-#{width}s" % s }
|
164
|
-
when :int then lambda{|s| "%#{width}d" % s.to_i }
|
164
|
+
when :int then lambda{|s| "%#{width}d" % s.gsub(/\D+/, "").to_i }
|
165
165
|
when :float then lambda{|s| "%#{maxmag+minmag+2}.#{minmag}f" % s.to_f }
|
166
166
|
else raise "oops type #{type}" end
|
167
167
|
end
|
data/lib/wukong.rb
CHANGED
@@ -10,6 +10,7 @@ module Wukong
|
|
10
10
|
autoload :Streamer, 'wukong/streamer'
|
11
11
|
autoload :Store, 'wukong/store'
|
12
12
|
autoload :FilenamePattern, 'wukong/filename_pattern'
|
13
|
+
autoload :Decorator, 'wukong/decorator'
|
13
14
|
|
14
15
|
def self.run mapper, reducer=nil, options={}
|
15
16
|
Wukong::Script.new(mapper, reducer, options).run
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'java'
|
2
|
+
|
3
|
+
java_import 'com.cloudera.flume.core.Event'
|
4
|
+
java_import 'com.cloudera.flume.core.EventImpl'
|
5
|
+
java_import 'com.cloudera.flume.core.EventSinkDecorator'
|
6
|
+
|
7
|
+
module Wukong
|
8
|
+
class Decorator < EventSinkDecorator
|
9
|
+
|
10
|
+
def initialize(mapper, reducer=nil, options={})
|
11
|
+
super(nil)
|
12
|
+
@mapper = mapper.new
|
13
|
+
end
|
14
|
+
|
15
|
+
def append(e)
|
16
|
+
line = String.from_java_bytes(e.getBody)
|
17
|
+
record = @mapper.recordize(line.chomp)
|
18
|
+
@mapper.process(*record) do |output|
|
19
|
+
processed = output.to_flat.join("\t")
|
20
|
+
event = EventImpl.new(processed.to_java_bytes, e.getTimestamp, e.getPriority, e.getNanos, e.getHost, e.getAttrs)
|
21
|
+
super event
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def run() self ; end
|
26
|
+
|
27
|
+
end
|
28
|
+
end
|
data/lib/wukong/encoding.rb
CHANGED
@@ -68,7 +68,7 @@ module Wukong
|
|
68
68
|
#
|
69
69
|
def self.decode_str str, strategy=:xml
|
70
70
|
case strategy
|
71
|
-
when :xml then
|
71
|
+
when :xml then self.html_encoder.decode(str)
|
72
72
|
when :url then Addressable::URI.unencode_component(str)
|
73
73
|
else raise "Don't know how to decode with strategy #{strategy}"
|
74
74
|
end
|
data/wukong.gemspec
CHANGED
@@ -4,21 +4,14 @@
|
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
|
-
s.name =
|
8
|
-
s.version = "2.0.
|
7
|
+
s.name = "wukong"
|
8
|
+
s.version = "2.0.2"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date =
|
13
|
-
s.description =
|
14
|
-
|
15
|
-
* stream of lines when it's efficient to process by lines
|
16
|
-
* stream of field arrays when it's efficient to deal directly with fields
|
17
|
-
* stream of lightweight objects when it's efficient to deal with objects
|
18
|
-
|
19
|
-
Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
|
20
|
-
}
|
21
|
-
s.email = %q{flip@infochimps.org}
|
12
|
+
s.date = "2011-11-04"
|
13
|
+
s.description = " Treat your dataset like a:\n\n * stream of lines when it's efficient to process by lines\n * stream of field arrays when it's efficient to deal directly with fields\n * stream of lightweight objects when it's efficient to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.\n"
|
14
|
+
s.email = "flip@infochimps.org"
|
22
15
|
s.executables = ["hdp-du", "hdp-sync", "hdp-wc", "wu-lign", "wu-sum", "hdp-parts_to_keys.rb"]
|
23
16
|
s.extra_rdoc_files = [
|
24
17
|
"LICENSE.textile",
|
@@ -180,6 +173,7 @@ Gem::Specification.new do |s|
|
|
180
173
|
"lib/wukong/datatypes.rb",
|
181
174
|
"lib/wukong/datatypes/enum.rb",
|
182
175
|
"lib/wukong/datatypes/fake_types.rb",
|
176
|
+
"lib/wukong/decorator.rb",
|
183
177
|
"lib/wukong/encoding.rb",
|
184
178
|
"lib/wukong/encoding/asciize.rb",
|
185
179
|
"lib/wukong/extensions.rb",
|
@@ -254,46 +248,10 @@ Gem::Specification.new do |s|
|
|
254
248
|
"spec/wukong/script_spec.rb",
|
255
249
|
"wukong.gemspec"
|
256
250
|
]
|
257
|
-
s.homepage =
|
251
|
+
s.homepage = "http://mrflip.github.com/wukong"
|
258
252
|
s.require_paths = ["lib"]
|
259
|
-
s.rubygems_version =
|
260
|
-
s.summary =
|
261
|
-
s.test_files = [
|
262
|
-
"examples/contrib/jeans/normalize.rb",
|
263
|
-
"examples/contrib/jeans/sizes.rb",
|
264
|
-
"examples/corpus/bucket_counter.rb",
|
265
|
-
"examples/corpus/dbpedia_abstract_to_sentences.rb",
|
266
|
-
"examples/corpus/sentence_bigrams.rb",
|
267
|
-
"examples/corpus/sentence_coocurrence.rb",
|
268
|
-
"examples/corpus/words_to_bigrams.rb",
|
269
|
-
"examples/emr/elastic_mapreduce_example.rb",
|
270
|
-
"examples/ignore_me/counting.rb",
|
271
|
-
"examples/ignore_me/grouper.rb",
|
272
|
-
"examples/network_graph/adjacency_list.rb",
|
273
|
-
"examples/network_graph/breadth_first_search.rb",
|
274
|
-
"examples/network_graph/gen_2paths.rb",
|
275
|
-
"examples/network_graph/gen_multi_edge.rb",
|
276
|
-
"examples/network_graph/gen_symmetric_links.rb",
|
277
|
-
"examples/pagerank/pagerank.rb",
|
278
|
-
"examples/pagerank/pagerank_initialize.rb",
|
279
|
-
"examples/sample_records.rb",
|
280
|
-
"examples/server_logs/apache_log_parser.rb",
|
281
|
-
"examples/server_logs/breadcrumbs.rb",
|
282
|
-
"examples/server_logs/logline.rb",
|
283
|
-
"examples/server_logs/nook.rb",
|
284
|
-
"examples/server_logs/nook/faraday_dummy_adapter.rb",
|
285
|
-
"examples/server_logs/user_agent.rb",
|
286
|
-
"examples/simple_word_count.rb",
|
287
|
-
"examples/size.rb",
|
288
|
-
"examples/stats/avg_value_frequency.rb",
|
289
|
-
"examples/stats/binning_percentile_estimator.rb",
|
290
|
-
"examples/stats/rank_and_bin.rb",
|
291
|
-
"examples/stupidly_simple_filter.rb",
|
292
|
-
"examples/word_count.rb",
|
293
|
-
"spec/spec_helper.rb",
|
294
|
-
"spec/wukong/encoding_spec.rb",
|
295
|
-
"spec/wukong/script_spec.rb"
|
296
|
-
]
|
253
|
+
s.rubygems_version = "1.8.10"
|
254
|
+
s.summary = "Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease."
|
297
255
|
|
298
256
|
if s.respond_to? :specification_version then
|
299
257
|
s.specification_version = 3
|
metadata
CHANGED
@@ -1,87 +1,89 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 2.0.2
|
4
5
|
prerelease:
|
5
|
-
version: 2.0.1
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Philip (flip) Kromer
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
dependencies:
|
16
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2011-11-04 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
17
15
|
name: rspec
|
18
|
-
|
19
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &2152107000 !ruby/object:Gem::Requirement
|
20
17
|
none: false
|
21
|
-
requirements:
|
22
|
-
- -
|
23
|
-
- !ruby/object:Gem::Version
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
24
21
|
version: 1.2.9
|
25
22
|
type: :development
|
26
|
-
version_requirements: *id001
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: yard
|
29
23
|
prerelease: false
|
30
|
-
|
24
|
+
version_requirements: *2152107000
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
requirement: &2152105540 !ruby/object:Gem::Requirement
|
31
28
|
none: false
|
32
|
-
requirements:
|
33
|
-
- -
|
34
|
-
- !ruby/object:Gem::Version
|
35
|
-
version:
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
36
33
|
type: :development
|
37
|
-
version_requirements: *id002
|
38
|
-
- !ruby/object:Gem::Dependency
|
39
|
-
name: addressable
|
40
34
|
prerelease: false
|
41
|
-
|
35
|
+
version_requirements: *2152105540
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: addressable
|
38
|
+
requirement: &2152104640 !ruby/object:Gem::Requirement
|
42
39
|
none: false
|
43
|
-
requirements:
|
44
|
-
- -
|
45
|
-
- !ruby/object:Gem::Version
|
46
|
-
version:
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
47
44
|
type: :runtime
|
48
|
-
version_requirements: *id003
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: extlib
|
51
45
|
prerelease: false
|
52
|
-
|
46
|
+
version_requirements: *2152104640
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: extlib
|
49
|
+
requirement: &2152103820 !ruby/object:Gem::Requirement
|
53
50
|
none: false
|
54
|
-
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
57
|
-
version:
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
58
55
|
type: :runtime
|
59
|
-
version_requirements: *id004
|
60
|
-
- !ruby/object:Gem::Dependency
|
61
|
-
name: htmlentities
|
62
56
|
prerelease: false
|
63
|
-
|
57
|
+
version_requirements: *2152103820
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: htmlentities
|
60
|
+
requirement: &2152102940 !ruby/object:Gem::Requirement
|
64
61
|
none: false
|
65
|
-
requirements:
|
66
|
-
- -
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version:
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
69
66
|
type: :runtime
|
70
|
-
version_requirements: *id005
|
71
|
-
- !ruby/object:Gem::Dependency
|
72
|
-
name: configliere
|
73
67
|
prerelease: false
|
74
|
-
|
68
|
+
version_requirements: *2152102940
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: configliere
|
71
|
+
requirement: &2152101420 !ruby/object:Gem::Requirement
|
75
72
|
none: false
|
76
|
-
requirements:
|
77
|
-
- -
|
78
|
-
- !ruby/object:Gem::Version
|
79
|
-
version:
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
80
77
|
type: :runtime
|
81
|
-
|
82
|
-
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *2152101420
|
80
|
+
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
81
|
+
efficient to process by lines\n * stream of field arrays when it's efficient
|
82
|
+
to deal directly with fields\n * stream of lightweight objects when it's efficient
|
83
|
+
to deal with objects\n\n Wukong is friends with Hadoop the elephant, Pig the query
|
84
|
+
language, and the cat on your command line.\n"
|
83
85
|
email: flip@infochimps.org
|
84
|
-
executables:
|
86
|
+
executables:
|
85
87
|
- hdp-du
|
86
88
|
- hdp-sync
|
87
89
|
- hdp-wc
|
@@ -89,11 +91,10 @@ executables:
|
|
89
91
|
- wu-sum
|
90
92
|
- hdp-parts_to_keys.rb
|
91
93
|
extensions: []
|
92
|
-
|
93
|
-
extra_rdoc_files:
|
94
|
+
extra_rdoc_files:
|
94
95
|
- LICENSE.textile
|
95
96
|
- README.textile
|
96
|
-
files:
|
97
|
+
files:
|
97
98
|
- CHANGELOG.textile
|
98
99
|
- INSTALL.textile
|
99
100
|
- LICENSE.textile
|
@@ -249,6 +250,7 @@ files:
|
|
249
250
|
- lib/wukong/datatypes.rb
|
250
251
|
- lib/wukong/datatypes/enum.rb
|
251
252
|
- lib/wukong/datatypes/fake_types.rb
|
253
|
+
- lib/wukong/decorator.rb
|
252
254
|
- lib/wukong/encoding.rb
|
253
255
|
- lib/wukong/encoding/asciize.rb
|
254
256
|
- lib/wukong/extensions.rb
|
@@ -322,66 +324,29 @@ files:
|
|
322
324
|
- spec/wukong/encoding_spec.rb
|
323
325
|
- spec/wukong/script_spec.rb
|
324
326
|
- wukong.gemspec
|
325
|
-
has_rdoc: true
|
326
327
|
homepage: http://mrflip.github.com/wukong
|
327
328
|
licenses: []
|
328
|
-
|
329
329
|
post_install_message:
|
330
330
|
rdoc_options: []
|
331
|
-
|
332
|
-
require_paths:
|
331
|
+
require_paths:
|
333
332
|
- lib
|
334
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
333
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
335
334
|
none: false
|
336
|
-
requirements:
|
337
|
-
- -
|
338
|
-
- !ruby/object:Gem::Version
|
339
|
-
version:
|
340
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
335
|
+
requirements:
|
336
|
+
- - ! '>='
|
337
|
+
- !ruby/object:Gem::Version
|
338
|
+
version: '0'
|
339
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
341
340
|
none: false
|
342
|
-
requirements:
|
343
|
-
- -
|
344
|
-
- !ruby/object:Gem::Version
|
345
|
-
version:
|
341
|
+
requirements:
|
342
|
+
- - ! '>='
|
343
|
+
- !ruby/object:Gem::Version
|
344
|
+
version: '0'
|
346
345
|
requirements: []
|
347
|
-
|
348
346
|
rubyforge_project:
|
349
|
-
rubygems_version: 1.
|
347
|
+
rubygems_version: 1.8.10
|
350
348
|
signing_key:
|
351
349
|
specification_version: 3
|
352
|
-
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|
353
|
-
|
354
|
-
|
355
|
-
- examples/contrib/jeans/sizes.rb
|
356
|
-
- examples/corpus/bucket_counter.rb
|
357
|
-
- examples/corpus/dbpedia_abstract_to_sentences.rb
|
358
|
-
- examples/corpus/sentence_bigrams.rb
|
359
|
-
- examples/corpus/sentence_coocurrence.rb
|
360
|
-
- examples/corpus/words_to_bigrams.rb
|
361
|
-
- examples/emr/elastic_mapreduce_example.rb
|
362
|
-
- examples/ignore_me/counting.rb
|
363
|
-
- examples/ignore_me/grouper.rb
|
364
|
-
- examples/network_graph/adjacency_list.rb
|
365
|
-
- examples/network_graph/breadth_first_search.rb
|
366
|
-
- examples/network_graph/gen_2paths.rb
|
367
|
-
- examples/network_graph/gen_multi_edge.rb
|
368
|
-
- examples/network_graph/gen_symmetric_links.rb
|
369
|
-
- examples/pagerank/pagerank.rb
|
370
|
-
- examples/pagerank/pagerank_initialize.rb
|
371
|
-
- examples/sample_records.rb
|
372
|
-
- examples/server_logs/apache_log_parser.rb
|
373
|
-
- examples/server_logs/breadcrumbs.rb
|
374
|
-
- examples/server_logs/logline.rb
|
375
|
-
- examples/server_logs/nook.rb
|
376
|
-
- examples/server_logs/nook/faraday_dummy_adapter.rb
|
377
|
-
- examples/server_logs/user_agent.rb
|
378
|
-
- examples/simple_word_count.rb
|
379
|
-
- examples/size.rb
|
380
|
-
- examples/stats/avg_value_frequency.rb
|
381
|
-
- examples/stats/binning_percentile_estimator.rb
|
382
|
-
- examples/stats/rank_and_bin.rb
|
383
|
-
- examples/stupidly_simple_filter.rb
|
384
|
-
- examples/word_count.rb
|
385
|
-
- spec/spec_helper.rb
|
386
|
-
- spec/wukong/encoding_spec.rb
|
387
|
-
- spec/wukong/script_spec.rb
|
350
|
+
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|
351
|
+
it, yet handles terabyte-scale computation with ease.
|
352
|
+
test_files: []
|