wukong 1.5.2 → 1.5.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +10 -0
- data/README.textile +1 -3
- data/bin/hdp-cp +3 -0
- data/bin/wu-lign +31 -36
- data/docpages/README-wulign.textile +6 -6
- data/examples/emr/elastic_mapreduce_example.rb +9 -0
- data/examples/emr/emr.yaml +52 -0
- data/lib/wukong.rb +5 -3
- data/lib/wukong/filename_pattern.rb +73 -0
- data/lib/wukong/keystore/tyrant_db.rb +11 -11
- data/lib/wukong/monitor/periodic_monitor.rb +62 -64
- data/lib/wukong/script/hadoop_command.rb +7 -6
- data/lib/wukong/store.rb +12 -11
- data/lib/wukong/store/base.rb +5 -7
- data/lib/wukong/store/chh_chunked_flat_file_store.rb +37 -0
- data/lib/wukong/store/chunked_flat_file_store.rb +23 -16
- data/lib/wukong/store/flat_file_store.rb +9 -10
- data/lib/wukong/streamer/em_streamer.rb +7 -0
- data/wukong.gemspec +7 -2
- metadata +9 -4
data/CHANGELOG.textile
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
h2. Wukong v1.5.3
|
2
|
+
|
3
|
+
* A couple of bugfixes. Sorry about that.
|
4
|
+
* Documentation fixes
|
5
|
+
|
1
6
|
h2. Wukong v1.5.0
|
2
7
|
|
3
8
|
h4. Elastic Map-Reduce
|
@@ -16,6 +21,11 @@ Incompatible changes to option handling and script launching:
|
|
16
21
|
* Script doesn't use extra_options any more. You should relocate them to the initializer or to configliere.
|
17
22
|
* there is no more default_mapper or default_reducer
|
18
23
|
|
24
|
+
h2. Wukong v1.4.12 2010-08-31
|
25
|
+
|
26
|
+
* Improvements to the pig conversion methods
|
27
|
+
* @hdp-rm@ respects the -skipTrash option
|
28
|
+
|
19
29
|
|
20
30
|
h2. Wukong v1.4.11 2010-07-30
|
21
31
|
|
data/README.textile
CHANGED
@@ -30,9 +30,7 @@ I'm pushing to release "Wukong 3.0 the actual 1.0 release".
|
|
30
30
|
* Standardize the notion that wukong classes have a "key"; by default, it will be to_a.first for Structs/TypedStructs. This shouldn't break anything.
|
31
31
|
* May make some things that are derived classes into mixin'ed modules
|
32
32
|
* Will probably change the name of AccumulatingReducer into just Accumulator, and have all Accumulator-derived classes include Accumulator; I'll make sure the old names continue to work though.
|
33
|
-
|
34
|
-
|
35
|
-
*
|
33
|
+
|
36
34
|
|
37
35
|
h2. Help!
|
38
36
|
|
data/bin/hdp-cp
ADDED
data/bin/wu-lign
CHANGED
@@ -7,7 +7,7 @@ USAGE= %Q{
|
|
7
7
|
# space aligned file that is still suitable for further processing. For example,
|
8
8
|
# given the log-file input
|
9
9
|
#
|
10
|
-
#
|
10
|
+
# # cat tag_usage.tsv
|
11
11
|
# 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
|
12
12
|
# 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
|
13
13
|
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
|
@@ -15,11 +15,10 @@ USAGE= %Q{
|
|
15
15
|
# 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
|
16
16
|
# 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
|
17
17
|
# 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
|
18
|
-
# </code></pre>
|
19
18
|
#
|
20
19
|
# wulign will reformat it to read
|
21
20
|
#
|
22
|
-
#
|
21
|
+
# # cat tag_usage.tsv | wu-lign
|
23
22
|
# 2009-07-21T21:39:40 day 65536 3.154791234 68750 1171316
|
24
23
|
# 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
|
25
24
|
# 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
|
@@ -27,65 +26,61 @@ USAGE= %Q{
|
|
27
26
|
# 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
|
28
27
|
# 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
|
29
28
|
# 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
|
30
|
-
# </code></pre>
|
31
29
|
#
|
32
30
|
# The fields are still tab-delimited by exactly one tab -- only spaces are used to
|
33
31
|
# pad out fields. You can still use cuttab and friends to manipulate columns.
|
34
32
|
#
|
35
|
-
# wulign isn't intended to be smart, or correct, or reliable -- only to be
|
36
|
-
# useful for previewing and organizing tab-formatted files. In general
|
37
|
-
# @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
|
38
|
-
# equivalent to its input. (That is, the only changes should be insertion of
|
39
|
-
# spaces and re-formatting of numerics.) But still -- reserve its use for human
|
40
|
-
# inspection only.
|
41
|
-
#
|
42
|
-
# (Note: tab characters in this source code file have been converted to spaces;
|
43
|
-
# replace whitespace with tab in the first example if you'd like to play along at
|
44
|
-
# home.)
|
45
|
-
#
|
46
|
-
# h2. How it works
|
47
|
-
#
|
48
|
-
# Wulign takes the first 1000 lines, splits by TAB characters into fields, and
|
49
|
-
# tries to guess the format -- int, float, or string -- for each. It builds a
|
50
|
-
# consensus of the width and type for corresponding columns in the chunk. If a
|
51
|
-
# column has mixed numeric and string formats it degrades to :mixed, which is
|
52
|
-
# basically treated as :string. If a column has mixed :float and :int elements all
|
53
|
-
# of them are formatted as float.
|
54
|
-
#
|
55
33
|
# h2. Command-line arguments
|
56
34
|
#
|
57
35
|
# You can give sprintf-style positional arguments on the command line that will be
|
58
36
|
# applied to the corresponding columns. (Blank args are used for placeholding and
|
59
37
|
# auto-formatting is still applied). So with the example above,
|
60
38
|
#
|
61
|
-
#
|
39
|
+
# cat foo | wulign '' '' '' '%8.4e'
|
62
40
|
#
|
63
41
|
# will format the fourth column with "%8.4e", while the first three columns and
|
64
42
|
# fifth-and-higher columns are formatted as usual.
|
65
43
|
#
|
66
|
-
# <pre><code>
|
67
44
|
# ...
|
68
45
|
# 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
|
69
46
|
# 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
|
70
47
|
# 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
|
71
48
|
# ....
|
72
|
-
#
|
49
|
+
#
|
50
|
+
# h2. How it works
|
51
|
+
#
|
52
|
+
# Wu-lign takes the first 500ish lines, splits into fields on TAB characters,
|
53
|
+
# and tries to guess the format (int, float, or string) for each. It builds a
|
54
|
+
# consensus of the width and type for corresponding columns in the chunk. If a
|
55
|
+
# column has mixed numeric and string formats it degrades to :mixed, which is
|
56
|
+
# basically treated as :string. If a column has mixed :float and :int elements all
|
57
|
+
# of them are formatted as float.
|
73
58
|
#
|
74
59
|
# h2. Notes
|
75
60
|
#
|
76
|
-
# *
|
61
|
+
# * Header rows: the first line is used for width alignment but not for type detection.
|
62
|
+
# This means that an initial row of text headers will inform column spacing
|
63
|
+
# but still allow a column of floats (say) to be properly aligned as floats.
|
77
64
|
#
|
78
|
-
# * It
|
79
|
-
#
|
65
|
+
# * It requires a unanimous vote. One screwy line can coerce the whole mess to
|
66
|
+
# :mixed; width formatting will still be applied, though.
|
80
67
|
#
|
81
|
-
# * It won't set columns wider than
|
68
|
+
# * It won't set columns wider than 100 chars -- this allows for the occasional
|
82
69
|
# super-wide column without completely breaking your screen.
|
83
70
|
#
|
84
71
|
# * For :float values, wulign tries to guess at the right number of significant
|
85
72
|
# digits to the left and right of the decimal point.
|
86
73
|
#
|
87
|
-
# * wulign
|
88
|
-
#
|
74
|
+
# * wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab
|
75
|
+
# delimits a field, every newline a record.
|
76
|
+
#
|
77
|
+
# wulign isn't intended to be smart, or correct, or reliable -- only to be
|
78
|
+
# useful for previewing and organizing tab-formatted files. In general
|
79
|
+
# wulign(foo).split("\t").map(&:strip) *should* give output semantically
|
80
|
+
# equivalent to its input. (That is, the only changes should be insertion of
|
81
|
+
# spaces and re-formatting of numerics.) But still -- reserve its use for human
|
82
|
+
# inspection only.
|
83
|
+
#
|
89
84
|
}
|
90
85
|
|
91
86
|
if ARGV[0] == '--help'
|
@@ -111,7 +106,7 @@ def get_type val
|
|
111
106
|
when val == '' then type = nil
|
112
107
|
when val =~ INT_RE then type = :int
|
113
108
|
when val =~ FLOAT_RE then type = :float
|
114
|
-
else type = :str end
|
109
|
+
else type = :str end
|
115
110
|
end
|
116
111
|
|
117
112
|
def consensus_type val, alltype, is_first
|
@@ -149,7 +144,7 @@ FORMAT_GUESSING_LINES.times do
|
|
149
144
|
row.each_with_index{|col,i|
|
150
145
|
next if skip_col[i]
|
151
146
|
# Let the first row be text (headers)
|
152
|
-
col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
|
147
|
+
col_types[i] = consensus_type(col, col_types[i], rows.length == 1)
|
153
148
|
if col_types[i] == :float
|
154
149
|
mantissa, radix = f_width(col)
|
155
150
|
col_minmag[i] = [radix, col_minmag[i], 1].compact.max
|
@@ -175,7 +170,7 @@ def dump_row row, format
|
|
175
170
|
puts row.zip(format).map{|c,f| f.call(c) rescue c }.join("\t")
|
176
171
|
end
|
177
172
|
def dump_header row, maxw
|
178
|
-
puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
|
173
|
+
puts row.zip(maxw).map{|col, width| "%-#{width}s" % col.to_s }.join("\t")
|
179
174
|
end
|
180
175
|
|
181
176
|
pad = [''] * maxw.length
|
@@ -38,7 +38,7 @@ wu-lign isn't intended to be smart, or correct, or reliable -- only to be useful
|
|
38
38
|
|
39
39
|
h2. How it works
|
40
40
|
|
41
|
-
Wu-Lign takes the first
|
41
|
+
Wu-Lign takes the first 500ish lines, splits into fields on TAB characters, and tries to guess the format (int, float, or string) for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
|
42
42
|
|
43
43
|
h2. Command-line arguments
|
44
44
|
|
@@ -58,8 +58,8 @@ will format the fourth column with "%8.4e", while the first three columns and fi
|
|
58
58
|
|
59
59
|
h2. Notes
|
60
60
|
|
61
|
-
*
|
62
|
-
* It
|
63
|
-
* It won't set columns wider than
|
64
|
-
* For :float values,
|
65
|
-
*
|
61
|
+
* Header rows: the first line is used for width alignment but not for type detection. This means that an initial row of text headers will inform column spacing but still allow a column of floats (say) to be properly aligned as floats.
|
62
|
+
* It requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
|
63
|
+
* It won't set columns wider than 100 chars -- this allows for the occasional super-wide column without completely breaking your screen.
|
64
|
+
* For :float values, wulign tries to guess at the right number of significant digits to the left and right of the decimal point.
|
65
|
+
* wulign parses only plain-jane 'TSV files': no quoting or escaping; every tab delimits a field, every newline a record.
|
@@ -3,6 +3,15 @@ Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
|
|
3
3
|
require 'rubygems'
|
4
4
|
require 'wukong'
|
5
5
|
|
6
|
+
#
|
7
|
+
# * Copy the emr.yaml from here into ~/.wukong/emr.yaml
|
8
|
+
# and edit it to suit.
|
9
|
+
# * Download the Amazon elastic-mapreduce runner. Get a copy from
|
10
|
+
# http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
11
|
+
# * Find out what breaks, fix it or ask us for help (coders@infochimps.org) and
|
12
|
+
# submit a patch
|
13
|
+
#
|
14
|
+
|
6
15
|
class FooStreamer < Wukong::Streamer::LineStreamer
|
7
16
|
def initialize *args
|
8
17
|
super *args
|
@@ -0,0 +1,52 @@
|
|
1
|
+
#
|
2
|
+
# Elastic MapReduce config in wukong
|
3
|
+
#
|
4
|
+
|
5
|
+
#
|
6
|
+
# Infrastructure options
|
7
|
+
#
|
8
|
+
|
9
|
+
# == Fill all your information into yet another file with your amazon key. Sorry
|
10
|
+
# that it needs to be in so many stupid places, nobody can agree on a
|
11
|
+
# filename or format.
|
12
|
+
:emr_credentials_file: ~/.wukong/credentials.json
|
13
|
+
#
|
14
|
+
# == Set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env vars, or enter them here:
|
15
|
+
# :access_key: ASDFAHKHASDF
|
16
|
+
# :secret_access_key: ADSGHASDFJASDFASDF
|
17
|
+
#
|
18
|
+
# == Path to your keypair file.
|
19
|
+
:key_pair_file: ~/.wukong/keypairs/gibbon.pem
|
20
|
+
# == Keypair will be named after your file, or force the name:
|
21
|
+
# :key_pair: ~
|
22
|
+
|
23
|
+
# == Path to the Amazon elastic-mapreduce runner. Get a copy from
|
24
|
+
# http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
|
25
|
+
:emr_runner: ~/ics/hadoop/elastic-mapreduce/elastic-mapreduce
|
26
|
+
|
27
|
+
#
|
28
|
+
# Cluster Config
|
29
|
+
#
|
30
|
+
:num_instances: 1
|
31
|
+
:instance_type: m2.xlarge
|
32
|
+
:master_instance_type: ~
|
33
|
+
:hadoop_version: '0.20'
|
34
|
+
:availability_zone: us-east-1b
|
35
|
+
|
36
|
+
#
|
37
|
+
# Running and reporting options
|
38
|
+
#
|
39
|
+
:alive: false
|
40
|
+
:enable_debugging: true
|
41
|
+
:emr_runner_verbose: true
|
42
|
+
:emr_runner_debug: ~
|
43
|
+
:step_action: CANCEL_AND_WAIT # CANCEL_AND_WAIT, TERMINATE_JOB_FLOW or CONTINUE
|
44
|
+
|
45
|
+
#
|
46
|
+
# Remote Paths
|
47
|
+
#
|
48
|
+
# Wukong is opinionated about the paths and locations of scripts and
|
49
|
+
# everything. Make an S3 bucket and let the wookiee win -- or hack
|
50
|
+
# lib/wukong/script/emr_command.rb to be more flexible and send us back a patch.
|
51
|
+
#
|
52
|
+
:emr_root: s3n://emr.infinitemonkeys.info
|
data/lib/wukong.rb
CHANGED
@@ -5,7 +5,9 @@ require 'wukong/bad_record'
|
|
5
5
|
autoload :TypedStruct, 'wukong/typed_struct'
|
6
6
|
require 'configliere'; Configliere.use :define
|
7
7
|
module Wukong
|
8
|
-
autoload :Dfs,
|
9
|
-
autoload :Script,
|
10
|
-
autoload :Streamer,
|
8
|
+
autoload :Dfs, 'wukong/dfs'
|
9
|
+
autoload :Script, 'wukong/script'
|
10
|
+
autoload :Streamer, 'wukong/streamer'
|
11
|
+
autoload :Store, 'wukong/store'
|
12
|
+
autoload :FilenamePattern, 'wukong/filename_pattern'
|
11
13
|
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Wukong
|
2
|
+
class FilenamePattern
|
3
|
+
# the filename pattern, e.g. 'ripd/:handle/:date/:handle+:timestamp-:pid-:hostname.tsv'
|
4
|
+
attr_accessor :pattern
|
5
|
+
# custom token replacements
|
6
|
+
attr_accessor :token_val_defaults
|
7
|
+
|
8
|
+
DEFAULT_PATTERN_STR = ":dest_dir/:handle_prefix/:handle/:date/:handle:timestamp-:pid-:hostname.tsv"
|
9
|
+
|
10
|
+
def initialize pattern, token_val_defaults={}
|
11
|
+
self.pattern = pattern
|
12
|
+
self.token_val_defaults = token_val_defaults
|
13
|
+
end
|
14
|
+
|
15
|
+
#
|
16
|
+
# walk through pattern, replacing tokens (eg :time or :pid) with the
|
17
|
+
# corresponding value.
|
18
|
+
#
|
19
|
+
def make token_vals={}
|
20
|
+
token_vals = token_val_defaults.merge token_vals
|
21
|
+
token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
|
22
|
+
# CHH_NOTE: The following is broken for patterns that need a ":" or
|
23
|
+
# patterns that need text following a token with no special chars in
|
24
|
+
# between.
|
25
|
+
val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
|
26
|
+
val
|
27
|
+
end
|
28
|
+
|
29
|
+
def to_s token_vals={}
|
30
|
+
make token_vals
|
31
|
+
end
|
32
|
+
|
33
|
+
#
|
34
|
+
# substitute for token
|
35
|
+
#
|
36
|
+
def replace token, token_vals
|
37
|
+
token = token.to_sym
|
38
|
+
return token_vals[token] if token_vals.include? token
|
39
|
+
case token
|
40
|
+
when :pid then pid
|
41
|
+
when :hostname then hostname
|
42
|
+
when :handle then token_vals[:handle]
|
43
|
+
when :handle_prefix then token_vals[:handle].to_s[0..5]
|
44
|
+
when :timestamp then token_vals[:timestamp]
|
45
|
+
when :date then token_vals[:timestamp][ 0..7]
|
46
|
+
when :time then token_vals[:timestamp][ 8..13]
|
47
|
+
when :hour then token_vals[:timestamp][ 8..9]
|
48
|
+
when :h4 then "%0.2d" % (( token_vals[:timestamp][8..9].to_i / 4 ) * 4)
|
49
|
+
when :min then token_vals[:timestamp][10..11]
|
50
|
+
when :sec then token_vals[:timestamp][12..13]
|
51
|
+
when :s10 then "%0.2d" % (( token_vals[:timestamp][12..13].to_i / 10 ) * 10)
|
52
|
+
else
|
53
|
+
raise "Don't know how to encode token #{token} #{token_vals[token]}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
# Memoized: the hostname for the machine running this script.
|
58
|
+
def hostname
|
59
|
+
@hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
|
60
|
+
end
|
61
|
+
# Memoized: the Process ID for this invocation.
|
62
|
+
def pid
|
63
|
+
@pid ||= Process.pid
|
64
|
+
end
|
65
|
+
|
66
|
+
# Characters deemed safe in a filename;
|
67
|
+
SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/\;'
|
68
|
+
def self.sanitize str
|
69
|
+
str.gsub(%r{[^#{SAFE_CHARS}]+}, '-')
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
end
|
@@ -66,17 +66,17 @@ module TokyoDbConnection
|
|
66
66
|
].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)
|
67
67
|
|
68
68
|
DB_PORTS = {
|
69
|
-
:
|
70
|
-
:
|
69
|
+
:tw_screen_names => 12002,
|
70
|
+
:tw_search_ids => 12003,
|
71
71
|
#
|
72
|
-
:tw_user_info
|
73
|
-
:tw_wordbag
|
74
|
-
:tw_influence
|
75
|
-
:tw_trstrank
|
76
|
-
:tw_conversation
|
72
|
+
:tw_user_info => 14000,
|
73
|
+
:tw_wordbag => 14101,
|
74
|
+
:tw_influence => 14102,
|
75
|
+
:tw_trstrank => 14103,
|
76
|
+
:tw_conversation => 14104,
|
77
77
|
#
|
78
|
-
:
|
79
|
-
:
|
78
|
+
:tw_screen_names2 => 12004,
|
79
|
+
:tw_search_ids2 => 12005,
|
80
80
|
#
|
81
81
|
:tw_user_info2 => 14200,
|
82
82
|
:tw_wordbag2 => 14201,
|
@@ -84,7 +84,7 @@ module TokyoDbConnection
|
|
84
84
|
:tw_trstrank2 => 14203,
|
85
85
|
:tw_conversation2 => 14204,
|
86
86
|
:tw_strong_links2 => 14205,
|
87
|
-
:tw_word_stats2 =>
|
87
|
+
:tw_word_stats2 => 14210,
|
88
88
|
#
|
89
89
|
:ip_geo_census => 14400,
|
90
90
|
} unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)
|
@@ -123,7 +123,7 @@ module TokyoDbConnection
|
|
123
123
|
end
|
124
124
|
|
125
125
|
def handle_error action, e
|
126
|
-
warn "#{action} failed: #{e} #{e.backtrace.join("\t")}" ;
|
126
|
+
Log.warn "#{action} failed: #{e} #{e.backtrace.join("\t")}" ;
|
127
127
|
invalidate!
|
128
128
|
end
|
129
129
|
|
@@ -1,72 +1,70 @@
|
|
1
|
-
module
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
attr_accessor :last_time, :current_iter, :iter, :started_at
|
1
|
+
module Wukong::Monitor
|
2
|
+
#
|
3
|
+
# Accepts a lightweight call every iteration.
|
4
|
+
#
|
5
|
+
# Once either a time or an iteration criterion is met, executes the block
|
6
|
+
# and resets the timer until next execution.
|
7
|
+
#
|
8
|
+
# Note that the +time_interval+ is measured *execution to execution* and not
|
9
|
+
# in multiples of iter_interval. Say I set a time_interval of 300s, and
|
10
|
+
# happen to iterate at 297s and 310s after start. Then the monitor will
|
11
|
+
# execute at 310s, and the next execution will happen on or after 610s.
|
12
|
+
#
|
13
|
+
# Also note that when *either* criterion is met, *both* criteria are
|
14
|
+
# reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
|
15
|
+
# and that at 250s I reach iteration 10_000. Then the monitor will execute
|
16
|
+
# on or after 20_000 iterations or 550s, whichever happens first.
|
17
|
+
#
|
18
|
+
class PeriodicMonitor
|
19
|
+
attr_accessor :time_interval, :iter_interval
|
20
|
+
attr_accessor :last_time, :current_iter, :iter, :started_at
|
22
21
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
22
|
+
def initialize options={}
|
23
|
+
self.started_at = Time.now.utc.to_f
|
24
|
+
self.last_time = started_at
|
25
|
+
self.iter = 0
|
26
|
+
self.current_iter = 0
|
27
|
+
self.time_interval = options[:time]
|
28
|
+
self.iter_interval = options[:iters]
|
29
|
+
end
|
31
30
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
31
|
+
# True when the total iteration count is an exact multiple of +iter_interval+ (nil when no +iter_interval+ is set).
|
32
|
+
def enough_iterations?
|
33
|
+
iter % iter_interval == 0 if iter_interval
|
34
|
+
end
|
36
35
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
36
|
+
# True if more than +time_interval+ has elapsed since last execution.
|
37
|
+
def enough_time? now
|
38
|
+
(now - last_time) > time_interval if time_interval
|
39
|
+
end
|
41
40
|
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
41
|
+
# Time since monitor was created
|
42
|
+
def since
|
43
|
+
Time.now.utc.to_f - started_at
|
44
|
+
end
|
45
|
+
# Overall iterations per second
|
46
|
+
def rate
|
47
|
+
iter.to_f / since.to_f
|
48
|
+
end
|
49
|
+
# "Instantaneous" iterations per second
|
50
|
+
def inst_rate now
|
51
|
+
current_iter.to_f / (now-last_time).to_f
|
52
|
+
end
|
54
53
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
68
|
-
end
|
54
|
+
#
|
55
|
+
# if the interval conditions are met, executes block; otherwise just does
|
56
|
+
# bookkeeping and returns.
|
57
|
+
#
|
58
|
+
def periodically &block
|
59
|
+
self.iter += 1
|
60
|
+
self.current_iter += 1
|
61
|
+
now = Time.now.utc.to_f
|
62
|
+
if enough_iterations? || enough_time?(now)
|
63
|
+
block.call(iter, (now-last_time))
|
64
|
+
self.last_time = now
|
65
|
+
self.current_iter = 0
|
69
66
|
end
|
70
|
-
|
71
67
|
end
|
72
68
|
end
|
69
|
+
|
70
|
+
end
|
@@ -32,6 +32,7 @@ module Wukong
|
|
32
32
|
Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
|
33
33
|
Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
|
34
34
|
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
35
|
+
Settings.define :min_input_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
|
35
36
|
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
36
37
|
|
37
38
|
#
|
@@ -48,14 +49,14 @@ module Wukong
|
|
48
49
|
hadoop_commandline = [
|
49
50
|
hadoop_runner,
|
50
51
|
"jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
|
51
|
-
|
52
|
-
"-
|
53
|
-
"-mapper '#{map_commandline}'",
|
54
|
-
"-reducer '#{reduce_commandline}'",
|
52
|
+
"-mapper '#{mapper_commandline}'",
|
53
|
+
"-reducer '#{reducer_commandline}'",
|
55
54
|
"-input '#{input_paths}'",
|
56
55
|
"-output '#{output_path}'",
|
56
|
+
hadoop_jobconf_options,
|
57
|
+
"-jobconf mapred.job.name='#{job_name}'",
|
57
58
|
hadoop_recycle_env,
|
58
|
-
hadoop_other_args
|
59
|
+
hadoop_other_args,
|
59
60
|
].flatten.compact.join(" \t\\\n ")
|
60
61
|
Log.info " Launching hadoop!"
|
61
62
|
execute_command!(hadoop_commandline)
|
@@ -94,7 +95,7 @@ module Wukong
|
|
94
95
|
# if not, the resulting nil will be elided later
|
95
96
|
def jobconf option
|
96
97
|
if options[option]
|
97
|
-
"-
|
98
|
+
"-jobconf %s=%s" % [options.description_for(option), options[option]]
|
98
99
|
end
|
99
100
|
end
|
100
101
|
|
data/lib/wukong/store.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
|
-
module
|
1
|
+
module Wukong
|
2
2
|
module Store
|
3
|
-
extend FactoryModule
|
4
|
-
autoload :Base,
|
5
|
-
autoload :FlatFileStore,
|
6
|
-
autoload :ConditionalStore,
|
7
|
-
autoload :ChunkedFlatFileStore,
|
8
|
-
autoload :
|
9
|
-
autoload :
|
10
|
-
autoload :
|
11
|
-
autoload :
|
12
|
-
autoload :
|
3
|
+
# extend FactoryModule
|
4
|
+
autoload :Base, 'wukong/store/base'
|
5
|
+
autoload :FlatFileStore, 'wukong/store/flat_file_store'
|
6
|
+
# autoload :ConditionalStore, 'monkeyshines/store/conditional_store'
|
7
|
+
autoload :ChunkedFlatFileStore, 'wukong/store/chunked_flat_file_store'
|
8
|
+
autoload :ChhChunkedFlatFileStore, 'wukong/store/chh_chunked_flat_file_store'
|
9
|
+
# autoload :KeyStore, 'monkeyshines/store/key_store'
|
10
|
+
# autoload :TokyoTdbKeyStore, 'monkeyshines/store/tokyo_tdb_key_store'
|
11
|
+
# autoload :TyrantTdbKeyStore, 'monkeyshines/store/tyrant_tdb_key_store'
|
12
|
+
# autoload :TyrantRdbKeyStore, 'monkeyshines/store/tyrant_rdb_key_store'
|
13
|
+
# autoload :ReadThruStore, 'monkeyshines/store/read_thru_store'
|
13
14
|
end
|
14
15
|
end
|
data/lib/wukong/store/base.rb
CHANGED
@@ -1,18 +1,16 @@
|
|
1
|
-
module
|
1
|
+
module Wukong
|
2
2
|
module Store
|
3
3
|
class Base
|
4
|
-
|
5
|
-
|
6
|
-
self.options = _options
|
7
|
-
Log.info "Creating #{self.class}"
|
4
|
+
def initialize options={}
|
5
|
+
Log.info "Creating #{self.class} with #{options.inspect}"
|
8
6
|
end
|
9
7
|
|
10
|
-
#
|
8
|
+
# Iterate through each object, casting it as a new object of klass.
|
11
9
|
def each_as klass, &block
|
12
10
|
self.each do |*args|
|
13
11
|
begin
|
14
12
|
item = klass.new *args[1..-1]
|
15
|
-
rescue
|
13
|
+
rescue StandardError => e
|
16
14
|
Log.info [args, e.to_s, self].join("\t")
|
17
15
|
raise e
|
18
16
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Wukong
|
2
|
+
module Store
|
3
|
+
class ChhChunkedFlatFileStore < Wukong::Store::FlatFileStore
|
4
|
+
attr_accessor :filename_pattern, :handle, :rootdir
|
5
|
+
|
6
|
+
# Move to configliere
|
7
|
+
Settings.define :chunk_file_pattern, :default => ":rootdir/:date/:handle:timestamp-:pid.tsv",:description => "The pattern for chunked files."
|
8
|
+
Settings.define :chunk_file_rootdir, :default => nil, :description => "The root directory for the chunked files."
|
9
|
+
|
10
|
+
#Note that filemode is inherited from flat_file
|
11
|
+
|
12
|
+
def initialize options={}
|
13
|
+
# super wants a :filename in the options or it will fail. We need to get the initial filename
|
14
|
+
# set up before we call super, so we need all of the parts of the pattern set up.
|
15
|
+
self.rootdir = options[:rootdir] || Settings[:chunk_file_rootdir]
|
16
|
+
self.handle = options[:handle]
|
17
|
+
pattern = options[:pattern] || Settings[:chunk_file_pattern]
|
18
|
+
self.filename_pattern = FilenamePattern.new(pattern, :handle => handle, :rootdir => self.rootdir)
|
19
|
+
options[:filename] = filename_pattern.make()
|
20
|
+
|
21
|
+
super options
|
22
|
+
|
23
|
+
self.mkdir!
|
24
|
+
end
|
25
|
+
|
26
|
+
def new_chunk
|
27
|
+
new_filename = filename_pattern.make()
|
28
|
+
Log.info "Rotating chunked file #{filename} into #{new_filename}"
|
29
|
+
self.flush
|
30
|
+
self.close
|
31
|
+
@filename = new_filename
|
32
|
+
self.mkdir!
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -1,22 +1,29 @@
|
|
1
|
-
module
|
1
|
+
module Wukong
|
2
2
|
module Store
|
3
|
-
class ChunkedFlatFileStore <
|
4
|
-
attr_accessor :filename_pattern, :chunk_monitor, :handle
|
3
|
+
class ChunkedFlatFileStore < Wukong::Store::FlatFileStore
|
4
|
+
attr_accessor :filename_pattern, :chunk_monitor, :handle, :chunktime, :rootdir
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
# Move to configliere
|
7
|
+
Settings.define :chunk_file_pattern, :default => ":rootdir/:date/:handle:timestamp-:pid.tsv",:description => "The pattern for chunked files."
|
8
|
+
Settings.define :chunk_file_chunktime, :default => 4*60*60,:description => "The time interval to keep a chunk file open."
|
9
|
+
Settings.define :chunk_file_rootdir, :default => nil, :description => "The root directory for the chunked files."
|
10
|
+
|
11
|
+
#Note that filemode is inherited from flat_file
|
12
12
|
|
13
|
-
def initialize
|
14
|
-
|
15
|
-
|
16
|
-
self.
|
17
|
-
self.
|
18
|
-
self.
|
19
|
-
|
13
|
+
def initialize options={}
|
14
|
+
# super wants a :filename in the options or it will fail. We need to get the initial filename
|
15
|
+
# set up before we call super, so we need all of the parts of the pattern set up.
|
16
|
+
self.chunktime = options[:chunktime] || Settings[:chunk_file_chunktime]
|
17
|
+
self.rootdir = options[:rootdir] || Settings[:chunk_file_rootdir]
|
18
|
+
self.handle = options[:handle]
|
19
|
+
pattern = options[:pattern] || Settings[:chunk_file_pattern]
|
20
|
+
self.filename_pattern = FilenamePattern.new(pattern, :handle => handle, :rootdir => self.rootdir)
|
21
|
+
options[:filename] = filename_pattern.make()
|
22
|
+
|
23
|
+
super options
|
24
|
+
|
25
|
+
Log.warn "You don't really want a chunk time this small: #{self.chunktime}" unless self.chunktime > 600
|
26
|
+
self.chunk_monitor = Wukong::PeriodicMonitor.new( :time => self.chunktime )
|
20
27
|
self.mkdir!
|
21
28
|
end
|
22
29
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'fileutils'; include FileUtils
|
2
2
|
|
3
|
-
module
|
3
|
+
module Wukong
|
4
4
|
module Store
|
5
5
|
#
|
6
6
|
class FlatFileStore < Store::Base
|
@@ -10,7 +10,7 @@ module Monkeyshines
|
|
10
10
|
# +filename_root+ : first part of name for files
|
11
11
|
#
|
12
12
|
def initialize options={}
|
13
|
-
|
13
|
+
super options
|
14
14
|
self.filename = options[:filename] or raise "Missing filename in #{self.class}"
|
15
15
|
self.filemode = options[:filemode] || 'r'
|
16
16
|
skip!(options[:skip]) if options[:skip]
|
@@ -21,7 +21,6 @@ module Monkeyshines
|
|
21
21
|
#
|
22
22
|
def each &block
|
23
23
|
file.each do |line|
|
24
|
-
next if line[0..0] == '#'
|
25
24
|
attrs = line.chomp.split("\t")
|
26
25
|
next if attrs.blank?
|
27
26
|
yield *attrs
|
@@ -54,6 +53,10 @@ module Monkeyshines
|
|
54
53
|
@file = nil
|
55
54
|
end
|
56
55
|
|
56
|
+
def flush
|
57
|
+
@file.flush if @file
|
58
|
+
end
|
59
|
+
|
57
60
|
# Ensure the file's directory exists
|
58
61
|
def mkdir!
|
59
62
|
dir = File.dirname(filename)
|
@@ -64,7 +67,7 @@ module Monkeyshines
|
|
64
67
|
|
65
68
|
# write to the file
|
66
69
|
def save obj
|
67
|
-
file
|
70
|
+
file.puts obj
|
68
71
|
obj
|
69
72
|
end
|
70
73
|
|
@@ -74,14 +77,10 @@ module Monkeyshines
|
|
74
77
|
File.size(filename)
|
75
78
|
end
|
76
79
|
|
77
|
-
|
78
|
-
tok, obj = block.call
|
79
|
-
save obj
|
80
|
-
end
|
81
|
-
|
82
|
-
# delegates to +#save+ -- writes the object to the file
|
80
|
+
# delegates to +#save+ -- writes the object to the file. Returns self for chaining on the stream.
|
83
81
|
def <<(obj)
|
84
82
|
save obj
|
83
|
+
self
|
85
84
|
end
|
86
85
|
|
87
86
|
end
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "1.5.
|
8
|
+
s.version = "1.5.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2010-08-
|
12
|
+
s.date = %q{2010-08-19}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
@@ -37,6 +37,7 @@ Gem::Specification.new do |s|
|
|
37
37
|
"bin/hdp-bzip",
|
38
38
|
"bin/hdp-cat",
|
39
39
|
"bin/hdp-catd",
|
40
|
+
"bin/hdp-cp",
|
40
41
|
"bin/hdp-du",
|
41
42
|
"bin/hdp-get",
|
42
43
|
"bin/hdp-kill",
|
@@ -138,6 +139,7 @@ Gem::Specification.new do |s|
|
|
138
139
|
"examples/count_keys.rb",
|
139
140
|
"examples/count_keys_at_mapper.rb",
|
140
141
|
"examples/emr/elastic_mapreduce_example.rb",
|
142
|
+
"examples/emr/emr.yaml",
|
141
143
|
"examples/keystore/cassandra_batch_test.rb",
|
142
144
|
"examples/keystore/conditional_outputter_example.rb",
|
143
145
|
"examples/network_graph/adjacency_list.rb",
|
@@ -185,6 +187,7 @@ Gem::Specification.new do |s|
|
|
185
187
|
"lib/wukong/extensions/string.rb",
|
186
188
|
"lib/wukong/extensions/struct.rb",
|
187
189
|
"lib/wukong/extensions/symbol.rb",
|
190
|
+
"lib/wukong/filename_pattern.rb",
|
188
191
|
"lib/wukong/keystore/cassandra_conditional_outputter.rb",
|
189
192
|
"lib/wukong/keystore/redis_db.rb",
|
190
193
|
"lib/wukong/keystore/tyrant_db.rb",
|
@@ -205,6 +208,7 @@ Gem::Specification.new do |s|
|
|
205
208
|
"lib/wukong/script/local_command.rb",
|
206
209
|
"lib/wukong/store.rb",
|
207
210
|
"lib/wukong/store/base.rb",
|
211
|
+
"lib/wukong/store/chh_chunked_flat_file_store.rb",
|
208
212
|
"lib/wukong/store/chunked_flat_file_store.rb",
|
209
213
|
"lib/wukong/store/conditional_store.rb",
|
210
214
|
"lib/wukong/store/factory.rb",
|
@@ -222,6 +226,7 @@ Gem::Specification.new do |s|
|
|
222
226
|
"lib/wukong/streamer/count_keys.rb",
|
223
227
|
"lib/wukong/streamer/count_lines.rb",
|
224
228
|
"lib/wukong/streamer/counting_reducer.rb",
|
229
|
+
"lib/wukong/streamer/em_streamer.rb",
|
225
230
|
"lib/wukong/streamer/filter.rb",
|
226
231
|
"lib/wukong/streamer/line_streamer.rb",
|
227
232
|
"lib/wukong/streamer/list_reducer.rb",
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 5
|
9
|
-
-
|
10
|
-
version: 1.5.
|
9
|
+
- 3
|
10
|
+
version: 1.5.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Philip (flip) Kromer
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-19 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -131,6 +131,7 @@ files:
|
|
131
131
|
- bin/hdp-bzip
|
132
132
|
- bin/hdp-cat
|
133
133
|
- bin/hdp-catd
|
134
|
+
- bin/hdp-cp
|
134
135
|
- bin/hdp-du
|
135
136
|
- bin/hdp-get
|
136
137
|
- bin/hdp-kill
|
@@ -232,6 +233,7 @@ files:
|
|
232
233
|
- examples/count_keys.rb
|
233
234
|
- examples/count_keys_at_mapper.rb
|
234
235
|
- examples/emr/elastic_mapreduce_example.rb
|
236
|
+
- examples/emr/emr.yaml
|
235
237
|
- examples/keystore/cassandra_batch_test.rb
|
236
238
|
- examples/keystore/conditional_outputter_example.rb
|
237
239
|
- examples/network_graph/adjacency_list.rb
|
@@ -279,6 +281,7 @@ files:
|
|
279
281
|
- lib/wukong/extensions/string.rb
|
280
282
|
- lib/wukong/extensions/struct.rb
|
281
283
|
- lib/wukong/extensions/symbol.rb
|
284
|
+
- lib/wukong/filename_pattern.rb
|
282
285
|
- lib/wukong/keystore/cassandra_conditional_outputter.rb
|
283
286
|
- lib/wukong/keystore/redis_db.rb
|
284
287
|
- lib/wukong/keystore/tyrant_db.rb
|
@@ -299,6 +302,7 @@ files:
|
|
299
302
|
- lib/wukong/script/local_command.rb
|
300
303
|
- lib/wukong/store.rb
|
301
304
|
- lib/wukong/store/base.rb
|
305
|
+
- lib/wukong/store/chh_chunked_flat_file_store.rb
|
302
306
|
- lib/wukong/store/chunked_flat_file_store.rb
|
303
307
|
- lib/wukong/store/conditional_store.rb
|
304
308
|
- lib/wukong/store/factory.rb
|
@@ -316,6 +320,7 @@ files:
|
|
316
320
|
- lib/wukong/streamer/count_keys.rb
|
317
321
|
- lib/wukong/streamer/count_lines.rb
|
318
322
|
- lib/wukong/streamer/counting_reducer.rb
|
323
|
+
- lib/wukong/streamer/em_streamer.rb
|
319
324
|
- lib/wukong/streamer/filter.rb
|
320
325
|
- lib/wukong/streamer/line_streamer.rb
|
321
326
|
- lib/wukong/streamer/list_reducer.rb
|