wukong 0.1.1
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
data/doc/TODO.textile
ADDED
@@ -0,0 +1,61 @@
Utility

* columnizing / reconstituting

* Set up with JRuby
* Allow for direct HDFS operations
* Make the dfs commands slightly less stupid
* add more standard options
* Allow for combiners
* JobStarter / JobSteps
* might as well take dumbo's command line args

BUGS:

* Can't do multiple input files in local mode

Patterns to implement:

* Stats reducer (takes sum, avg, max, min, std.dev of a numeric field)
* Make StructRecordizer work generically with other reducers (spec. AccumulatingReducer)

Example graph scripts:

* Multigraph
* Pagerank (done)
* Breadth-first search
* Triangle enumeration
* Clustering

Example scripts (from http://www.cloudera.com/resources/learning-mapreduce):

1. Find the [number of] hits by 5-minute timeslot for a website given its access logs.

2. Find the pages with over 1 million hits in a day for a website given its access logs.

3. Find the pages that link to each page in a collection of webpages.

4. Calculate the proportion of lines that match a given regular expression for a collection of documents.

5. Sort tabular data by a primary and secondary column.

6. Find the most popular pages for a website given its access logs.

/can use


---------------------------------------------------------------------------

Add statistics helpers

* including a "running standard deviation":http://www.johndcook.com/standard_deviation.html
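A minimal sketch of that running-variance recurrence (Welford's method, as described in the linked article); the class and method names below are illustrative only, not existing Wukong helpers:

<pre><code>
# Running mean / standard deviation accumulator (Welford's method).
# Illustrative sketch only -- these names are not part of Wukong.
class RunningStats
  attr_reader :n, :mean
  def initialize
    @n = 0 ; @mean = 0.0 ; @m2 = 0.0
  end
  def add x
    @n    += 1
    delta  = x - @mean
    @mean += delta / @n          # update running mean
    @m2   += delta * (x - @mean) # accumulate squared deviations
  end
  def variance
    @n > 1 ? @m2 / (@n - 1) : 0.0
  end
  def stddev
    Math.sqrt(variance)
  end
end
</code></pre>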


---------------------------------------------------------------------------

Make wutils: tsv-oriented implementations of the coreutils (e.g. uniq, sort, cut, nl, wc, split, ls, df and du) that intrinsically accept and emit tab-separated records.

More example hadoop algorithms:
* Bigram counts: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/bigrams.html
* Inverted index construction: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/indexer.html
* Pagerank: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
data/doc/UsingWukong-part3-parsing.textile
ADDED
@@ -0,0 +1,132 @@
h1. Using Wukong and Wuclan, Part 3 - Parsing

In part 2 we began a scraper to trawl our desired part of the social web. Now
we're ready to start using Wukong to process the files.

Files come off the wire as

    :url    :scraped_at      :response_code  :response_message  :contents
    String  DateTime (flat)  Integer         String             String (JSON-formatted, tab- & newline-munged)

The contents field is a JSON-formatted mix of records:

* TwitterFollowersRequest and TwitterFriendsRequest yield an @Array[Hash{user => raw_tweet}]@. We want to extract a stream of AFollowsB (with the request user as user_a for a friends request and user_b for a followers request) along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
* TwitterFavoritesRequest yields an @Array[Hash{tweet_hash => user_hash}]@. We want to extract a stream of AFavoritesB along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
* TwitterUser yields a single @user_hash@, making one each of TwitterUser, TwitterUserProfile and TwitterUserStyle.
* UserTimelineRequest and PublicTimelineRequest yield an @Array[Hash{tweet => user}]@. We want to extract the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
* TwitterFollowerIdsRequest and TwitterFriendIdsRequest return an @Array[user_ids]@ (each user_id is a simple Integer). We extract a series of AFollowsB (using the request's user_id as user_a_id or user_b_id).

We want to split each API response into a stream of those TwitterUser, Tweet, etc. records:

# Stream in each line (each line holds one request)
# turn the line into the corresponding TwitterRequest
# have the TwitterRequest parse its JSON contents and construct the TwitterUser, Tweet, etc. objects
# serialize those records back out as tab-separated lines suitable for further processing with Wukong

h4. The basics of StructStreamer

Wukong handles the first and last steps through its StructStreamer and the standard .to_flat method. So the actual structure is really simple:

    #
    # Instantiate each incoming request.
    # Stream out the contained classes it generates.
    #
    class TwitterRequestParser < Wukong::Streamer::StructStreamer
      def process request
        request.parse do |obj|
          yield obj
        end
      end
    end

    # This makes the script go.
    Wukong::Script.new(TwitterRequestParser, nil).run

In practice, all you need to know is that a StructStreamer gets a stream of objects to parse. Here's an outline of its internals. The Wukong StructStreamer:

# takes each flattened line:

    "twitter_friends_request http://.... 20090701123456 ...fields... [{...}, {...}, ...json..., {...}]"

# splits on tabs to create an array of fields:

    ["twitter_friends_request", "http://...", ... "[{...}, {...}, ...json..., {...}]"]

# constructs an instance of the class named in the first field, using the values extracted from the remaining fields:

    TwitterFriendsRequest.new "http://...", "20090701123456", ... "[{...}, {...}, ...json..., {...}]"

The last (contents) field is still just a string: there's nothing special about it to Wukong.
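
Conceptually, the recordizing step looks something like the sketch below. This is illustrative only -- the @recordize@ method name and the constant lookup are not Wukong's actual implementation, which handles more cases:

    # Sketch of the recordizing idea: the first tab-separated field names the
    # class, the remaining fields become its constructor arguments.
    def recordize line
      klass_name, *fields = line.chomp.split("\t")
      camelized = klass_name.split('_').map(&:capitalize).join  # "twitter_friends_request" => "TwitterFriendsRequest"
      Object.const_get(camelized).new(*fields)                   # e.g. TwitterFriendsRequest.new("http://...", ...)
    end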

h4. Parsing

Since each request's contents are handled in a slightly (and brittle-ly) different manner, we just ask each request object to parse itself and feed out all the TwitterXXXX objects it generates:

    class TwitterFollowersRequest
      # ...

      def parse &block
        return unless healthy?
        # for each raw user/tweet pair in the parsed JSON contents,
        parsed_contents.each do |hsh|
          json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
          next unless json_obj && json_obj.healthy?
          # Extract user, tweet and relationship
          yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
          json_obj.each(&block)
        end
      end

      # ...
    end

The TwitterXXXRequest objects consist of one or many hashes with (a raw user hash, and possibly its latest raw tweet hash) or (a raw tweet hash and its raw user hash). The user hash might have only the fields for a TwitterPartialUser, or it might have the fields for a full set of TwitterUser, TwitterUserProfile and TwitterUserStyle. Besides which, the fields themselves need some massaging to be compatible with Wukong and other tools in our Map/Reduce toolkit (details explained in a later section).

The fiddly little details are handled by a JsonUserWithTweet or JsonTweetWithUser (as appropriate) adapter pattern:

    class JsonUserTweetPair
      def initialize raw, moreinfo
        # clean up fields in entries (flatten date, true/false -> 1/0, etc)
        fix_raw_user!
        fix_raw_tweet!
      end

      # generate all the contained TwitterXXX objects
      def each
        #
      end

      # create TwitterUser object from raw info
      def user
      end
      # create Tweet object from raw tweet hash
      def tweet
      end
      # ... and so forth
    end

I'll ignore the gory details; view the source if you're interested.


h4. Running the script

Here, again, is the code (in full!) for the twitter_request_parser.rb script:

    #
    # Instantiate each incoming request.
    # Stream out the contained classes it generates.
    #
    class TwitterRequestParser < Wukong::Streamer::StructStreamer
      def process request
        request.parse do |obj|
          yield obj
        end
      end
    end

    # This makes the script go.
    Wukong::Script.new(TwitterRequestParser, nil).run

That last line is the runner: it makes this a Wukong script with a map phase only. (We'll add in a reducer later on.)
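
If the standard Wukong runner flags apply here (treat the flag and file paths below as illustrative), you invoke the script directly and let the runner handle the streaming plumbing -- for example, against local files:

    ./twitter_request_parser.rb --run=local scraped_requests.tsv parsed_objects.tsv
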
data/doc/code/api_response_example.txt
ADDED
@@ -0,0 +1,20 @@
[
  { // TwitterUser
    "id":123456789,
    // Basic fields
    "screen_name":"nena", "protected":false, "created_at":"Thu Apr 23 02:00:00 +0000 2009",
    "followers_count":0, "friends_count":1, "statuses_count":1, "favourites_count":0,
    // TwitterUserProfile fields
    "name":"nena", "url":null, "location":null, "description":null, "time_zone":null, "utc_offset":null,
    // TwitterUserStyle
    "profile_background_color":"9ae4e8", "profile_text_color":"000000", "profile_link_color":"0000ff", "profile_sidebar_border_color":"87bc44", "profile_sidebar_fill_color":"e0ff92", "profile_background_tile":false,
    "profile_background_image_url":"http:\/\/static.twitter.com\/images\/themes\/theme1\/bg.gif",
    "profile_image_url":"http:\/\/s3.amazonaws.com\/twitter_production\/profile_images\/123456789\/crane_normal.JPG",
    // with enclosed Tweet
    "status": {
      "id":123456789,
      // the twitter_user_id is implied
      "created_at":"Thu Apr 23 02:00:00 +0000 2009", "favorited":false, "truncated":false, "source":"web",
      "in_reply_to_user_id":null, "in_reply_to_status_id":null, "in_reply_to_screen_name":null,
      "text":"My cat's breath smells like cat food." },
  },
data/doc/code/parser_skeleton.rb
ADDED
@@ -0,0 +1,38 @@
# extract each record from request contents
# and stream it to output
class TwitterRequestParser < Wukong::Streamer::StructStreamer
  def process request
    request.parse do |obj|
      yield obj
    end
  end
end

# Incoming Request:
class TwitterFollowersRequest < Struct.new(
    :url, :scraped_at, :response_code, :response_message, :moreinfo, :contents)
  include Monkeyshines::ScrapeRequest
end

# Outgoing classes:
class TwitterUser < TypedStruct.new( :id, :scraped_at, :screen_name, :protected, :created_at,
    :followers_count, :friends_count, :statuses_count, :favourites_count )
end
class Tweet < TypedStruct.new( :id, :created_at, :twitter_user_id, :favorited, :truncated,
    :text, :source, :in_reply_to_user_id, :in_reply_to_status_id, :in_reply_to_screen_name )
end

# Parsing code:
TwitterFollowersRequest.class_eval do
  include Monkeyshines::RawJsonContents
  def parse &block
    parsed_contents.each do |user_tweet_hash|
      yield AFollowsB.new(user_tweet_hash["id"], self.moreinfo[:request_user_id])
      yield TwitterUser.from_hash(user_tweet_hash)
      yield Tweet.from_hash(user_tweet_hash)
    end
  end
end

# This makes the script go.
Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
data/doc/hadoop-nfs.textile
ADDED
@@ -0,0 +1,51 @@
The "Cloudera Hadoop AMI Instances":http://www.cloudera.com/hadoop-ec2 for Amazon's EC2 compute cloud are the fastest, easiest way to get up and running with Hadoop. Unfortunately, streaming scripts can be a pain to work with, especially if you're doing iterative development.

Installing NFS to share files across the cluster gives the following conveniences:

* You don't have to bundle everything up with each run: any path in ~coder/ will refer back via NFS to the filesystem on the master.

* The user can now ssh among the nodes without a password, since there's only one shared home directory and since we included the user's own public key in the authorized_keys2 file. This lets you easily rsync files among the nodes.

First, take note of the _internal_ name for your master, perhaps something like @domU-xx-xx-xx-xx-xx-xx.compute-1.internal@.

As root, on the master (change @compute-1.internal@ to match your setup):

<pre>
apt-get install nfs-kernel-server
echo "/home *.compute-1.internal(rw)" >> /etc/exports ;
/etc/init.d/nfs-kernel-server restart ;
</pre>

(The @*.compute-1.internal@ part limits host access, but you should take a look at the security settings of both EC2 and the built-in portmapper as well.)

Next, set up a regular user account on the *master only*. In this case our user will be named 'chimpy':

<pre>
visudo                  # uncomment the last line, to allow group sudo to sudo
groupadd admin
adduser chimpy
usermod -a -G sudo,admin chimpy
su chimpy               # now you are the new user
ssh-keygen -t rsa       # accept all the defaults
cat ~/.ssh/id_rsa.pub   # can paste this public key into your github, etc
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys2
</pre>

Then on each slave (replacing domU-xx-... with the internal name of the master node):

<pre>
apt-get install nfs-common ;
echo "domU-xx-xx-xx-xx-xx-xx.compute-1.internal:/home /mnt/home nfs rw 0 0" >> /etc/fstab
/etc/init.d/nfs-common restart
mkdir /mnt/home
mount /mnt/home
ln -s /mnt/home/chimpy /home/chimpy
</pre>

You should now be in business.
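
To sanity-check the share from a slave (assuming the standard nfs-common tools are installed; the hostname here is illustrative), something like the following should list the export and show the mounted filesystem:

<pre>
showmount -e domU-xx-xx-xx-xx-xx-xx.compute-1.internal   # should list /home
df -h /mnt/home                                          # should show the master's exported /home
</pre>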

Performance tradeoffs should be small as long as you're just sending code files and gems around. *Don't* write out log entries or data to NFS partitions, or you'll effectively perform a denial-of-service attack on the master node.

------------------------------

The "Setting up an NFS Server HOWTO":http://nfs.sourceforge.net/nfs-howto/index.html was an immense help, and I recommend reading it carefully.
data/doc/hadoop-setup.textile
ADDED
@@ -0,0 +1,29 @@

h2. Hadoop on EC2

* http://www.cloudera.com/hadoop-ec2
* http://www.cloudera.com/hadoop-ec2-ebs-beta


h3. Set up NFS within the cluster

*
* http://nfs.sourceforge.net/nfs-howto/ar01s03.html


h3. Miscellaneous Hadoop Tips

* The Cloudera AMIs and distribution include BZip2 support. This means that input files with a .bz2 extension will be transparently un-bzipped and streamed. (Note that there is a non-trivial penalty for doing so: each bzip'ed file must go, in whole, to a single mapper, and the CPU load for un-bzipping is sizeable.)

* To _produce_ bzip2 files, specify the new @--compress_output=@ flag (see the example after this list). If you have the BZip2 patches installed, you can give @--compress_output=bz2@; everyone should be able to use @--compress_output=gz@.

* For excellent performance you can patch your install for "Parallel LZO Splitting":http://www.cloudera.com/blog/2009/06/24/parallel-lzo-splittable-compression-for-hadoop/
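
For instance, a hypothetical invocation of a Wukong script with gzip'ed output -- the script name, paths and @--run@ usage here are illustrative; only @--compress_output=gz@ is the flag described above:

<pre>
./word_count.rb --run --compress_output=gz input/books output/word_counts
</pre>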


h3. Tools for EC2 and S3 Management

* http://s3sync.net/wiki
* http://jets3t.s3.amazonaws.com/applications/applications.html#uploader
* "ElasticFox"
* "S3Fox (S3 Organizer)":
* "FoxyProxy":
data/doc/index.textile
ADDED
@@ -0,0 +1,124 @@
---
layout: default
title: mrflip.github.com/wukong
collapse: false
---

h1(gemheader). wukong %(small):: hadoop made easy%


p(description). {{ site.description }}


Treat your dataset like a
* stream of lines when it's efficient to process by lines
* stream of field arrays when it's efficient to deal directly with fields
* stream of lightweight objects when it's efficient to deal with objects

Wukong is friends with "Hadoop":http://hadoop.apache.org/core the elephant, "Pig":http://hadoop.apache.org/pig/ the query language, and the @cat@ on your command line.

<notextile><div class="toggle"></notextile>

h2. How to write a Wukong script

Here's a script to count words in a text stream:

<pre><code>
require 'wukong'
module WordCount
  class Mapper < Wukong::Streamer::LineStreamer
    # Emit each word in the line.
    def process line
      words = line.strip.split(/\W+/).reject(&:blank?)
      words.each{|word| yield [word, 1] }
    end
  end

  class Reducer < Wukong::Streamer::ListReducer
    def finalize
      yield [ key, values.map(&:last).map(&:to_i).sum ]
    end
  end
end

Wukong::Script.new(
  WordCount::Mapper,
  WordCount::Reducer
  ).run # Execute the script
</code></pre>

The first class, the Mapper, eats lines and craps @[word, count]@ records: word is the /key/, its count is the /value/.

In the reducer, the values for each key are stacked up into a list; then the record(s) yielded by @#finalize@ are emitted. There are many other ways to write the reducer (most of them are better) -- see the ["examples":examples/].
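
One such alternative, sketched here on the assumption that Wukong's AccumulatingReducer dispatches @start!@, @accumulate@ and @finalize@ for each group of records sharing a key (check lib/wukong/streamer/accumulating_reducer.rb for the exact hooks), keeps a running total instead of holding every value in memory:

<pre><code>
# Sketch only: an accumulating word-count reducer.
class Reducer2 < Wukong::Streamer::AccumulatingReducer
  attr_accessor :count
  def start!(*args)     self.count  = 0              ; end  # reset at each new key
  def accumulate(*args) self.count += args.last.to_i ; end  # add this record's partial count
  def finalize
    yield [ key, count ]
  end
end
</code></pre>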

<notextile></div><div class="toggle"></notextile>

h2. Structured data stream

You can also use structs to treat your dataset as a stream of objects:

<pre><code>
require 'wukong'
require 'my_blog' # defines the blog models
# structs for our input objects
Tweet = Struct.new( :id, :created_at, :twitter_user_id,
  :in_reply_to_user_id, :in_reply_to_status_id, :text )
TwitterUser = Struct.new( :id, :username, :fullname,
  :homepage, :location, :description )
module TwitBlog
  class Mapper < Wukong::Streamer::RecordStreamer
    # Watch for tweets by me
    MY_USER_ID = 24601
    #
    # If this is a tweet by me, convert it to a Post.
    #
    # If it is a tweet not by me, convert it to a Comment that
    # will be paired with the correct Post.
    #
    # If it is a TwitterUser, convert it to a User record and
    # a user_location record.
    #
    def process record
      case record
      when TwitterUser
        user     = MyBlog::User.new.merge(record) # grab the fields in common
        user_loc = MyBlog::UserLoc.new(record.id, record.location, nil, nil)
        yield user
        yield user_loc
      when Tweet
        if record.twitter_user_id == MY_USER_ID
          post = MyBlog::Post.new.merge record
          post.link  = "http://twitter.com/statuses/show/#{record.id}"
          post.body  = record.text
          post.title = record.text[0..65] + "..."
          yield post
        else
          comment = MyBlog::Comment.new.merge record
          comment.body    = record.text
          comment.post_id = record.in_reply_to_status_id
          yield comment
        end
      end
    end
  end
end
Wukong::Script.new( TwitBlog::Mapper, nil ).run # identity reducer
</code></pre>

<notextile></div><div class="toggle"></notextile>

h2. More info

There are many useful examples (including an actually-useful version of the WordCount script) in the examples/ directory.

h3. Authors

Philip (flip) Kromer (flip@infochimps.org)

Patches submitted by:
* gemified by Ben Woosley (ben.woosley@gmail.com)
* ruby interpreter path fix by "Yuichiro MASUI":http://github.com/masuidrive - masui@masuidrive.jp - http://blog.masuidrive.jp/

<notextile></div></notextile>

{% include news.html %}