mrflip-wukong 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.txt +202 -0
- data/README-tutorial.textile +163 -0
- data/README.textile +165 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/UsingWukong-part1.textile +2 -0
- data/doc/UsingWukong-part2.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-setup.textile +21 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +65 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +112 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +40 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +39 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +20 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +173 -0
- metadata +208 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
# == Load/StoreFunctions ==
|
2
|
+
# BinaryDeserializer
|
3
|
+
# BinarySerializer
|
4
|
+
# BinStorage
|
5
|
+
# PigStorage
|
6
|
+
# PigDump
|
7
|
+
# TextLoader
|
8
|
+
|
9
|
+
module Wukong
  module AndPig
    class PigVar
      #
      # The "LOAD" pig expression:
      #
      #   MyRelation = LOAD 'my_relation.tsv' AS (attr_a: int, attr_b: chararray) ;
      #
      # A new PigVar holding the LOAD command is registered under +rel+ via
      # PigVar.set. The AS type spec comes from klass.typify.
      #
      # rel     -- name to register the loaded relation under
      # klass   -- class whose typify supplies the AS clause
      # options -- :filename  file to load; defaults to default_filename(rel)
      #            :has_rsrc  handed to klass.typify; when truthy, a generate
      #                       over the relation's own fields is issued afterwards
      #
      # Returns +rel+.
      #
      def self.pig_load(rel, klass, options = {})
        source    = options[:filename] || default_filename(rel)
        with_rsrc = options[:has_rsrc]
        load_cmd  = "LOAD '#{source}' AS #{klass.typify(with_rsrc)}"
        set(rel, new(klass, rel, load_cmd))
        return rel unless with_rsrc

        loaded = self[rel]
        loaded.generate(loaded, *loaded.fields)
        rel
      end

      #
      # The "STORE" pig imperative:
      #
      #   STORE Relation INTO 'filename'
      #
      # When +filename+ is omitted, the relation's default_filename is used.
      # Returns self.
      #
      def store(filename = nil)
        target = filename || default_filename
        self.class.emit(format("STORE %-19s INTO '%s'", relation, target))
        self
      end

      #
      # Store the relation after clearing the way for it: rmf! is called on the
      # target, mkdir on the target's parent directory, and then #store.
      #
      def store!(filename = nil)
        target = filename || default_filename
        rmf!(target)
        mkdir(File.dirname(target))
        store(target)
      end

      #
      # Force a store to disk, then load (so all calculations proceed from there).
      # The :filename option falls back to default_filename; the merged options
      # are passed on to PigVar.pig_load.
      #
      def checkpoint!(options = {})
        merged = options.reverse_merge(:filename => default_filename)
        store!(merged[:filename])
        self.class.pig_load(name, klass, merged)
      end

      # Default file for this relation (see PigVar.default_filename)
      def default_filename
        self.class.default_filename(name)
      end

      # Default file for a relation called +name+: working_dir joined with the name
      def self.default_filename(name)
        File.join(working_dir, name.to_s)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# == DiagnosticOperators
|
2
|
+
# describe
|
3
|
+
# dump
|
4
|
+
# explain
|
5
|
+
# illustrate
|
6
|
+
# == UDFStatements
|
7
|
+
# define
|
8
|
+
# register
|
9
|
+
|
10
|
+
module Wukong
  module AndPig
    class PigVar
      # DESCRIBE pig imperative, applied to this relation
      def describe
        self.class.describe(self)
      end

      #
      # DESCRIBE pig imperative for +rel+. A pig comment holding the type spec
      # predicted from rel.klass.typify is emitted first, then the DESCRIBE
      # declaration itself. Returns +rel+.
      #
      def self.describe(rel)
        predicted = rel.klass.typify
        emit " -- PREDICTED: #{predicted} "
        simple_declaration(:describe, rel.relationize)
        rel
      end

      # DUMP pig imperative
      def dump
        simple_operation(:dump)
      end

      # EXPLAIN pig imperative
      def explain
        simple_operation(:explain)
      end

      # ILLUSTRATE pig imperative
      def illustrate
        simple_operation(:illustrate)
      end

      # DEFINE UDF statement; any extra arguments are handed on as one array
      def self.define(pig_alias, *args)
        emit_imperative(:DEFINE, pig_alias, args)
      end

      # REGISTER UDF statement for the jar at +path_to_jar+
      def self.register(path_to_jar)
        emit_imperative(:REGISTER, path_to_jar)
      end
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
# == RelationalOperators
|
3
|
+
#
|
4
|
+
# GROUP, COGROUP, JOIN see groupies.rb
|
5
|
+
# CROSS is defined below
|
6
|
+
|
7
|
+
# distinct
|
8
|
+
# filter
|
9
|
+
# limit
|
10
|
+
# order
|
11
|
+
# split
|
12
|
+
# union
|
13
|
+
|
14
|
+
#
|
15
|
+
# stream
|
16
|
+
# load
|
17
|
+
# store
|
18
|
+
#
|
19
|
+
module Wukong
  module AndPig
    class PigVar

      # ===========================================================================
      #
      # Options
      #
      # Appends " PARALLEL n" to +str+ IN PLACE when options[:parallel] is set.
      # Returns the mutated string, or nil when there is no :parallel option.
      #
      def self.parallelize! str, options
        str << " PARALLEL #{options[:parallel]}" if options[:parallel]
      end

      # ===========================================================================
      #
      # DISTINCT
      #
      # lval    -- name for the resulting relation
      # options -- :parallel => n adds a PARALLEL clause
      #
      def distinct lval, options={}
        self.class.distinct lval, self, options
      end

      def self.distinct lval, rel, options={ }
        # work on a copy so the PARALLEL clause is never appended to a string
        # owned by +rel+
        cmd_str = rel.relationize.dup
        parallelize! cmd_str, options
        simple_operation lval, rel, :distinct, cmd_str
      end

      # ===========================================================================
      #
      # FILTER
      #
      # The instance form registers the result under a fresh anonymous name
      # (see #anon); the class form takes the name explicitly as +lval+.
      #
      def filter by_str
        new_in_chain anon, klass, "FILTER #{relation} BY #{by_str}"
      end
      def self.filter lval, rel, by_str
        simple_operation lval, rel, "FILTER", "#{rel.relation} BY #{by_str}"
      end

      # ===========================================================================
      #
      # LIMIT
      #
      # Keeps +n+ rows; the result is registered under a fresh anonymous name.
      #
      def limit n
        new_in_chain anon, klass, "LIMIT #{relation} #{n}"
      end

      # ===========================================================================
      #
      # ORDER
      #
      #   alias = ORDER alias BY { * [ASC|DESC] |
      #             field_alias [ASC|DESC] [, field_alias [ASC|DESC] ...]
      #           } [PARALLEL n];
      #
      # cmd_str -- the BY clause, e.g. "attr_a ASC, attr_b DESC"
      # options -- :parallel => n adds a PARALLEL clause
      #
      def order cmd_str, options={}
        # Build the whole statement, PARALLEL included, before new_in_chain
        # registers it; parallelize! lives on the class, not the instance.
        order_cmd = "ORDER #{relation} BY #{cmd_str}"
        self.class.parallelize! order_cmd, options
        new_in_chain anon, klass, order_cmd
      end

      # ===========================================================================
      #
      # SPLIT
      #
      #   SPLIT alias INTO alias IF expression, alias IF expression [, alias IF expression ...];
      #
      # relation_tests -- hash of { output relation => test expression }
      #
      # NOTE(review): the SPLIT statement shown above has no left-hand alias,
      # yet this goes through new_in_chain like the assigning operators --
      # confirm the statement that set/emit_setter produces is what pig expects.
      #
      def split relation_tests={}
        split_str = relation_tests.map do |out_rel, test|
          "#{out_rel} IF #{test}"
        end.join(", ")
        new_in_chain anon, klass, "SPLIT #{relation} INTO #{split_str}"
      end

      # ===========================================================================
      #
      # CROSS
      #
      # Crosses this relation with each of +relations+. A trailing options hash
      # may carry :parallel => n. The result takes the klass of the first
      # relation given and is registered under a fresh anonymous name.
      #
      # Raises CrossArgumentError when no other relation is given.
      #
      def cross *relations
        options = relations.extract_options!
        raise CrossArgumentError unless relations.length >= 1
        relations_str = [self, *relations].map(&:relation).join(", ")
        cross_cmd = "CROSS #{relations_str}"
        self.class.parallelize! cross_cmd, options
        new_in_chain anon, relations.first.klass, cross_cmd
      end

      # ===========================================================================
      #
      # UNION
      #
      # def self.union *relations
      #   raise UnionArgumentError unless relations.length >= 2
      #   new_in_chain relations.first.klass, "UNION #{relations}"
      # end

      # UNION as method: unions this relation with +relations+ into +lval+
      def union lval, *relations
        # splat, so the class method sees each relation as its own argument
        # (a single array argument would always fail its length check)
        self.class.union lval, self, *relations
      end

      #
      # lval      -- name for the resulting relation
      # relations -- two or more relations; the result takes the first one's klass
      #
      # Raises UnionArgumentError when fewer than two relations are given.
      #
      def self.union lval, *relations
        raise UnionArgumentError unless relations.length >= 2
        relations_str = relations.map(&:relation).join(", ")
        simple_operation lval, relations.first, :union, relations_str
      end

    end
    # Pre-built exception instances raised by PigVar#cross and PigVar.union
    CrossArgumentError = ArgumentError.new("CROSS requires at least two relations. Heh heh: relations.")
    UnionArgumentError = ArgumentError.new("UNION requires at least two relations. Heh heh: relations.")
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Wukong
  module PigStructMethods
    module ClassMethods
      #
      # Pig type string for the whole struct: each member is paired with the
      # pig type string of its type, in a parenthesized, comma-separated list.
      #
      # has_rsrc -- when truthy, a leading "rsrc: chararray" entry is added
      #
      def typify(has_rsrc = nil)
        decls = members.zip(mtypes).map { |attr, mtype| "#{attr}: #{mtype.typify}" }
        decls = ["rsrc: chararray"] + decls if has_rsrc
        "(" + decls.join(', ') + ")"
      end

      #
      # Loads relation +rel+ using this class for its type spec; any further
      # arguments go straight through to PigVar.pig_load.
      #
      def pig_load(rel, *args)
        Wukong::AndPig::PigVar.pig_load(rel, self, *args)
      end

      #
      # Returns type for a fieldspec. Only Symbol fieldspecs are resolved (via
      # members_types); anything else yields nil.
      #
      def field_type(field)
        # Array fieldspecs were sketched in the original but left disabled:
        #   if field.length > 1 then members_types[field.first].field_type(field[1..-1])
        #   else field_type field.first
        #   end
        members_types[field] if Symbol === field
      end
    end

    # Including the module also adds ClassMethods to the including class
    def self.included(base)
      base.extend(ClassMethods)
    end
  end
end

# Every Struct gets the pig methods.
class Struct
  include Wukong::PigStructMethods

  class << self
    # A plain Struct has no type information: its member list stands in
    def mtypes
      members
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Wukong
  module AndPig

    #
    # Make a PigVar understand the struct it describes: it carries the struct
    # class (klass), the relation's name, and the pig command behind it (cmd).
    #
    class PigVar
      attr_accessor :klass, :name, :cmd
      # Directory that default filenames are built under
      cattr_accessor :working_dir
      self.working_dir = '.'

      def initialize(klass, name, cmd)
        self.klass = klass
        self.name  = name
        self.cmd   = cmd
      end

      # PigVar[name] = rval -- sugar for PigVar.set
      def self.[]=(name, *args)
        set(name, *args)
      end

      # PigVar[name] -- the PigVar registered under +name+, if any
      def self.[](name)
        PIG_SYMBOLS[name]
      end

      # extract a field from an alias; the result is named "<name>_<field>"
      def _(field)
        as_name   = [name, field].join("_").to_sym
        field_ref = "#{relationize}.(#{field})"
        field_bag = Bag.new([field, field_type(field)])
        AS[field_ref, as_name, field_bag, nil, :skip_type]
      end

      #
      # Registers +rval+ under +name+ in PIG_SYMBOLS, renames it to match and
      # hands it to emit_setter.
      #
      def self.set(name, rval)
        PIG_SYMBOLS[name] = rval
        rval.name = name
        emit_setter(rval.relation, rval)
      end

      # This relation's name as written in pig (name.relationize)
      def relation
        name.relationize
      end
      alias_method :relationize, :relation

      #
      # Create a name for a new anonymous relation: bumps the module-wide
      # counter and returns :"anon_<slug>_<counter>_"
      #
      def self.anon(slug)
        idx = (Wukong::AndPig.anon_var_idx += 1)
        :"anon_#{slug}_#{idx}_"
      end

      # Create a name building off this one (any earlier anon_ wrapping is stripped)
      def anon
        unprefixed = name.to_s.gsub(/^anon_/, '')
        self.class.anon(unprefixed.gsub(/_\d+_$/, ''))
      end

      # Builds a new PigVar (l_klass, l_cmd) and registers it under +lval+
      def new_in_chain(lval, l_klass, l_cmd)
        self.class.set(lval, self.class.new(l_klass, lval, l_cmd))
      end

      # Delegate to klass
      def field_type(*args)
        klass.field_type(*args)
      end

      # Fields in this relation, as symbols
      def fields
        klass.members.map { |member| member.to_sym }
      end

      #
      # Side-effect free operation: emits "<OP> <relation>" and returns self
      #
      def simple_operation(op)
        self.class.emit("#{op.to_s.upcase} #{relation}")
        self
      end

      #
      # Registers under +lval+ a new relation of rel's klass whose command is
      # the upcased +op+ (padded to 8 columns) followed by +r_str+.
      #
      def self.simple_operation(lval, rel, op, r_str)
        op_cmd = format("%-8s %s", op.to_s.upcase, r_str)
        set(lval, new(rel.klass, lval, op_cmd))
      end

      # Emits the upcased +op+ (padded to 8 columns) followed by +r_str+
      def self.simple_declaration(op, r_str)
        emit(format("%-8s %s", op.to_s.upcase, r_str))
      end

    end
  end
end
|
94
|
+
|
95
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Wukong
  module AndPig
    # Registry of relations: PigVar.set files each PigVar here under its name
    PIG_SYMBOLS = Hash.new
    # Running counter used by PigVar.anon to number anonymous relations
    mattr_accessor(:anon_var_idx)
    self.anon_var_idx = 0
  end
end
|
8
|
+
|
9
|
+
|
10
|
+
Symbol.class_eval do
  #
  #   :lval << rhs
  #
  # Hands this symbol and the right-hand relation to PigVar.new_relation.
  # +relation+ may be a PigVar, or a Symbol that has a PigVar registered in
  # PIG_SYMBOLS. Anything else raises a RuntimeError.
  #
  # NOTE(review): PigVar.new_relation is not among the PigVar methods visible
  # next to this file -- confirm it is defined elsewhere.
  #
  def << relation
    case
    when relation.is_a?(Wukong::AndPig::PigVar)
      Wukong::AndPig::PigVar.new_relation(self, relation)
    when relation.is_a?(Symbol) && (pig_var = Wukong::AndPig::PIG_SYMBOLS[relation])
      Wukong::AndPig::PigVar.new_relation(self, pig_var)
    else raise "Don't know how to pigify RHS #{relation.inspect}"
    end
  end

  #
  # A symbol that names a registered relation answers that relation's
  # methods: the call -- arguments and block included -- is forwarded to the
  # PigVar found in PIG_SYMBOLS. Everything else falls through to the normal
  # NoMethodError.
  #
  def method_missing method, *args, &block
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    if pig_var && pig_var.respond_to?(method)
      pig_var.send(method, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? (and Object#method) in step with method_missing above
  def respond_to_missing? method, include_private = false
    pig_var = Wukong::AndPig::PIG_SYMBOLS[self]
    (pig_var && pig_var.respond_to?(method)) ? true : super
  end
end
|
File without changes
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#
|
2
|
+
# Easily serialize bad records in-band, for later analysis or to discard if
|
3
|
+
# negligible.
|
4
|
+
#
|
5
|
+
# You can instantiate this as
|
6
|
+
# success = do_stuff_to record
|
7
|
+
# if ! success
|
8
|
+
# return BadRecord.new("do_stuff_to-failed", record)
|
9
|
+
# end
|
10
|
+
#
|
11
|
+
class BadRecord < Struct.new(:errors, :record)
  # errors        -- description of what went wrong (defaults to '')
  # record_fields -- every remaining argument; kept together, as one array,
  #                  in the +record+ member
  def initialize(errors = '', *record_fields)
    super(errors, record_fields)
  end
end
|
data/lib/wukong/boot.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
module Wukong

  # ---------------------------------------------------------------------------
  #
  # Default options for Wukong
  #   http://github.com/infochimps/wukong
  #
  # If you set an environment variable WUKONG_CONFIG, *or* if the file
  # $HOME/.wukong.rb exists, that file will be +require+'d as well.
  #
  # Important values to set:
  #
  # * Wukong::CONFIG[:hadoop_home] --
  #     Path to root of hadoop install. If your hadoop runner is
  #       /usr/local/share/hadoop/bin/hadoop
  #     then your hadoop_home is
  #       /usr/local/share/hadoop.
  #     When left unset it falls back to the HADOOP_HOME environment variable,
  #     and then to /usr/lib/hadoop.
  #
  # * Wukong::CONFIG[:default_run_mode] -- Whether to run using hadoop (and
  #   thus, requiring a working hadoop install), or to run in local mode
  #   (script --map | sort | script --reduce)
  #
  CONFIG = {
    # Run as local or as hadoop?
    :default_run_mode => 'hadoop',

    # The command to run when a nil mapper or reducer is given.
    :default_mapper   => '/bin/cat',
    :default_reducer  => '/bin/cat',

    # Anything in HADOOP_OPTIONS_MAP (see lib/wukong/script/hadoop_command.rb)
    :runner_defaults => {
    },
  }

  #
  # Loads the site-specific config file, if present, then fills in
  # CONFIG[:hadoop_home] when nothing has set it.
  #
  # Returns the resulting CONFIG[:hadoop_home].
  #
  def self.config_options
    # override with site-specific options
    site_config_filename = ENV['WUKONG_CONFIG'] || (ENV['HOME'].to_s+'/.wukong.rb')
    # File.exist? rather than File.exists?: the latter was deprecated and no
    # longer exists as of Ruby 3.2
    require site_config_filename.gsub(/\.rb$/,'') if File.exist?(site_config_filename)

    # try to guess a hadoop_home if none given
    Wukong::CONFIG[:hadoop_home] ||= ENV['HADOOP_HOME'] || '/usr/lib/hadoop'
  end
  self.config_options
end
|
47
|
+
|