wukong 0.1.1

Files changed (143)
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
data/doc/TODO.textile ADDED
@@ -0,0 +1,61 @@
+ Utility
+
+ * columnizing / reconstituting
+
+ * Set up with JRuby
+ * Allow for direct HDFS operations
+ * Make the dfs commands slightly less stupid
+ * add more standard options
+ * Allow for combiners
+ * JobStarter / JobSteps
+ * might as well take dumbo's command line args
+
+ BUGS:
+
+ * Can't do multiple input files in local mode
+
+ Patterns to implement:
+
+ * Stats reducer (takes sum, avg, max, min, std.dev of a numeric field)
+ * Make StructRecordizer work generically with other reducers (spec. AccumulatingReducer)
+
+ Example graph scripts:
+
+ * Multigraph
+ * Pagerank (done)
+ * Breadth-first search
+ * Triangle enumeration
+ * Clustering
+
+ Example scripts (from http://www.cloudera.com/resources/learning-mapreduce):
+
+ 1. Find the [number of] hits by 5-minute timeslot for a website given its access logs.
+
+ 2. Find the pages with over 1 million hits in a day for a website given its access logs.
+
+ 3. Find the pages that link to each page in a collection of webpages.
+
+ 4. Calculate the proportion of lines that match a given regular expression for a collection of documents.
+
+ 5. Sort tabular data by a primary and secondary column.
+
+ 6. Find the most popular pages for a website given its access logs.
+
+ /can use
+
+
+ ---------------------------------------------------------------------------
+
+ Add statistics helpers
+
+ * including "running standard deviation":http://www.johndcook.com/standard_deviation.html (see the sketch below)
+
+
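For reference, the running standard deviation described in that article can be kept in a single pass over the stream; a minimal sketch (the class and method names here are illustrative, not part of the gem):

<pre><code>
# One-pass mean/variance accumulator (Welford's recurrence, per the
# John D. Cook article linked above). Illustrative sketch only.
class RunningStdDev
  attr_reader :n, :mean
  def initialize
    @n = 0 ; @mean = 0.0 ; @m2 = 0.0
  end
  def add x            # fold in one numeric observation
    @n    += 1
    delta  = x.to_f - @mean
    @mean += delta / @n
    @m2   += delta * (x.to_f - @mean)
  end
  def variance
    @n > 1 ? @m2 / (@n - 1) : 0.0
  end
  def stddev
    Math.sqrt(variance)
  end
end
</code></pre>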
+ ---------------------------------------------------------------------------
+
+ Make wutils: tsv-oriented implementations of the coreutils (e.g. uniq, sort, cut, nl, wc, split, ls, df and du) that intrinsically accept and emit tab-separated records.
+
+ More example hadoop algorithms:
+ * Bigram counts: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/bigrams.html
+ * Inverted index construction: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/indexer.html
+ * Pagerank: http://www.umiacs.umd.edu/~jimmylin/cloud9/docs/exercises/pagerank.html
data/doc/UsingWukong-part1-setup.textile ADDED
@@ -0,0 +1,2 @@
+ h1. Using Wukong and Wuclan, Part 1 - Setup
+
data/doc/UsingWukong-part2-scraping.textile ADDED
@@ -0,0 +1,2 @@
+ h1. Using Wukong and Wuclan, Part 2 - Scraping
+
data/doc/UsingWukong-part3-parsing.textile ADDED
@@ -0,0 +1,132 @@
+ h1. Using Wukong and Wuclan, Part 3 - Parsing
+
+ In part 2 we began building a scraper to trawl our desired part of the social web. Now
+ we're ready to start using Wukong to process the files.
+
+ Files come off the wire as
+
+ :url       :scraped_at      :response_code  :response_message  :contents
+ String     DateTime (flat)  Integer         String             String (JSON-formatted, tab&newline-munged)
+
+ The contents field is a JSON-formatted mix of records:
+
+ * TwitterFollowersRequest and TwitterFriendsRequest yield an @Array[Hash{user => raw_tweet}]@. We want to extract a stream of AFollowsB (with the request user as user_a for a friends request and user_b for a followers request) along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
+ * TwitterFavoritesRequest yields an @Array[Hash{tweet_hash => user_hash}]@. We want to extract a stream of AFavoritesB along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
+ * TwitterUser yields a single @user_hash@, making one each of TwitterUser, TwitterUserProfile and TwitterUserStyle.
+ * UserTimelineRequest and PublicTimelineRequest yield an @Array[Hash{tweet => user}]@. We want to extract the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
+ * TwitterFollowerIdsRequest and TwitterFriendIdsRequest return an @Array[user_id]@ (each user_id is a simple Integer). We extract a series of AFollowsB (using the request's user_id as user_a_id or user_b_id).
+
+ We want to split each API response into a stream of those TwitterUser, Tweet, etc. records.
+
+ # Stream in each line (each line holds one request)
+ # turn the line into the corresponding TwitterRequest
+ # have the TwitterRequest parse its JSON contents and construct the TwitterUser, Tweet, etc.
+ # serialize those records back out as tab-separated lines suitable for further processing with Wukong
+
+ h4. The basics of StructStreamer
+
+ Wukong handles the first and last steps through its StructStreamer and the standard .to_flat method. So the actual structure is really simple:
+
+   #
+   # Instantiate each incoming request.
+   # Stream out the contained classes it generates.
+   #
+   class TwitterRequestParser < Wukong::Streamer::StructStreamer
+     def process request
+       request.parse do |obj|
+         yield obj
+       end
+     end
+   end
+
+   # This makes the script go.
+   Wukong::Script.new(TwitterRequestParser, nil).run
+
+ In practice, all you need to know is that a StructStreamer gets a stream of objects to parse. Here's an outline of its internals. The Wukong StructStreamer:
+
+ # takes each flattened line:
+
+     "twitter_friends_request http://.... 20090701123456 ...fields... [{...}, {...}, ...json..., {...}]"
+
+ # splits by tabs to create an array of fields
+
+     ["twitter_friends_request", "http://...", ... "[{...}, {...}, ...json..., {...}]"]
+
+ # constructs an instance of the class named in the first field,
+   using the values extracted from the remaining fields.
+
+     TwitterFriendsRequest.new "http://...", "20090701123456", ... "[{...}, {...}, ...json..., {...}]"
+
+ The last (contents) field is still just a string: there's nothing special about it to Wukong.
+
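In code form, that recordizing step amounts to something like the following -- a simplified sketch of the idea, not the actual implementation (see lib/wukong/streamer/struct_streamer.rb for the real thing):

<pre><code>
# Simplified sketch: turn one flat, tab-separated line back into a struct.
# The first field names the class; the remaining fields fill its members.
KNOWN_CLASSES = { 'twitter_friends_request' => TwitterFriendsRequest } # illustrative lookup table

def recordize line
  klass_name, *fields = line.chomp.split("\t")
  klass = KNOWN_CLASSES[klass_name]
  klass && klass.new(*fields)
end
</code></pre>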
+ h4. Parsing
+
+ Since each request's contents are handled in a slightly (and brittle-ly) different manner, we just ask each request object to parse itself and feed out all the TwitterXXXX objects it generates.
+
+   class TwitterFollowersRequest
+     # ...
+
+     def parse &block
+       return unless healthy?
+       # for each raw user/tweet pair in the parsed JSON contents,
+       parsed_contents.each do |hsh|
+         json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
+         next unless json_obj && json_obj.healthy?
+         # Extract user, tweet and relationship
+         yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
+         json_obj.each(&block)
+       end
+     end
+
+     # ...
+   end
+
+ The TwitterXXXRequest objects consist of one or many hashes with (a raw user hash, and possibly its latest raw tweet hash) or (a raw tweet hash and its raw user hash). The user hash might have only the fields for a TwitterPartialUser or it might have the fields for a full set of TwitterUser, TwitterUserProfile, TwitterUserStyle. Besides which, the fields themselves need some massaging to be compatible with Wukong and other tools in our Map/Reduce toolkit (details explained in a later section).
+
+ The fiddly little details are handled by a JsonUserWithTweet or JsonTweetWithUser (as appropriate) adapter pattern:
+
+   class JsonUserTweetPair
+     def initialize raw, moreinfo
+       # clean up fields in entries (flatten date, true/false -> 1/0, etc)
+       fix_raw_user!
+       fix_raw_tweet!
+     end
+
+     # generate all the contained TwitterXXX objects
+     def each
+       #
+     end
+
+     # create TwitterUser object from raw info
+     def user
+     end
+     # create Tweet object from raw tweet hash
+     def tweet
+     end
+     # ... and so forth
+   end
+
+ I'll ignore the gory details; view the source if you're interested.
+
+
+ h4. Running the script
+
+ Here, again, is the code (in full!) for the twitter_request_parser.rb script.
+
+   #
+   # Instantiate each incoming request.
+   # Stream out the contained classes it generates.
+   #
+   class TwitterRequestParser < Wukong::Streamer::StructStreamer
+     def process request
+       request.parse do |obj|
+         yield obj
+       end
+     end
+   end
+
+   # This makes the script go.
+   Wukong::Script.new(TwitterRequestParser, nil).run
+
+ That last line is the runner: it makes this a Wukong script with a map phase only. (We'll add in a reducer later on.)
+
data/doc/code/api_response_example.txt ADDED
@@ -0,0 +1,20 @@
+ [
+   { // TwitterUser
+     "id":123456789,
+     // Basic fields
+     "screen_name":"nena", "protected":false, "created_at":"Thu Apr 23 02:00:00 +0000 2009",
+     "followers_count":0, "friends_count":1, "statuses_count":1, "favourites_count":0,
+     // TwitterUserProfile fields
+     "name":"nena", "url":null, "location":null, "description":null, "time_zone":null, "utc_offset":null,
+     // TwitterUserStyle
+     "profile_background_color":"9ae4e8", "profile_text_color":"000000", "profile_link_color":"0000ff", "profile_sidebar_border_color":"87bc44", "profile_sidebar_fill_color":"e0ff92", "profile_background_tile":false,
+     "profile_background_image_url":"http:\/\/static.twitter.com\/images\/themes\/theme1\/bg.gif",
+     "profile_image_url":"http:\/\/s3.amazonaws.com\/twitter_production\/profile_images\/123456789\/crane_normal.JPG",
+     // with enclosed Tweet
+     "status": {
+       "id":123456789,
+       // the twitter_user_id is implied
+       "created_at":"Thu Apr 23 02:00:00 +0000 2009", "favorited":false, "truncated":false, "source":"web",
+       "in_reply_to_user_id":null, "in_reply_to_status_id":null, "in_reply_to_screen_name":null,
+       "text":"My cat's breath smells like cat food." },
+   },
data/doc/code/parser_skeleton.rb ADDED
@@ -0,0 +1,38 @@
+ # extract each record from request contents
+ # and stream it to output
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
+   def process request
+     request.parse do |obj|
+       yield obj
+     end
+   end
+ end
+
+ # Incoming Request:
+ class TwitterFollowersRequest < Struct.new(
+     :url, :scraped_at, :response_code, :response_message, :moreinfo, :contents)
+   include Monkeyshines::ScrapeRequest
+ end
+
+ # Outgoing classes:
+ class TwitterUser < TypedStruct.new( :id, :scraped_at, :screen_name, :protected, :created_at,
+     :followers_count, :friends_count, :statuses_count, :favourites_count )
+ end
+ class Tweet < TypedStruct.new( :id, :created_at, :twitter_user_id, :favorited, :truncated,
+     :text, :source, :in_reply_to_user_id, :in_reply_to_status_id, :in_reply_to_screen_name )
+ end
+
+ # Parsing code:
+ TwitterFollowersRequest.class_eval do
+   include Monkeyshines::RawJsonContents
+   def parse &block
+     parsed_contents.each do |user_tweet_hash|
+       yield AFollowsB.new user_tweet_hash["id"], self.moreinfo[:request_user_id]
+       yield TwitterUser.from_hash user_tweet_hash
+       yield Tweet.from_hash user_tweet_hash
+     end
+   end
+ end
+
+ # This makes the script go.
+ Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
data/doc/hadoop-nfs.textile ADDED
@@ -0,0 +1,51 @@
+ The "Cloudera Hadoop AMI Instances":http://www.cloudera.com/hadoop-ec2 for Amazon's EC2 compute cloud are the fastest, easiest way to get up and running with hadoop. Unfortunately, streaming scripts can be a pain to work with there, especially if you're doing iterative development.
+
+ Installing NFS to share files across the cluster gives the following conveniences:
+
+ * You don't have to bundle everything up with each run: any path in ~coder/ will refer back via NFS to the filesystem on master.
+
+ * The user can now ssh among the nodes without a password, since there's only one shared home directory and since we included the user's own public key in the authorized_keys2 file. This lets you easily rsync files among the nodes.
+
+ First, you need to take note of the _internal_ name for your master, perhaps something like @domU-xx-xx-xx-xx-xx-xx.compute-1.internal@.
+
+ As root, on the master (change @compute-1.internal@ to match your setup):
+
+ <pre>
+ apt-get install nfs-kernel-server
+ echo "/home *.compute-1.internal(rw)" >> /etc/exports ;
+ /etc/init.d/nfs-kernel-server stop ;
+ </pre>
+
+ (The @*.compute-1.internal@ part limits host access, but you should take a look at the security settings of both EC2 and the built-in portmapper as well.)
+
+ Next, set up a regular user account on the *master only*. In this case our user will be named 'chimpy':
+
+ <pre>
+ visudo                  # uncomment the last line, to allow group sudo to sudo
+ groupadd admin
+ adduser chimpy
+ usermod -a -G sudo,admin chimpy
+ su chimpy               # now you are the new user
+ ssh-keygen -t rsa       # accept all the defaults
+ cat ~/.ssh/id_rsa.pub   # can paste this public key into your github, etc
+ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys2
+ </pre>
+
+ Then on each slave (replacing domU-xx-... by the internal name for the master node):
+
+ <pre>
+ apt-get install nfs-common ;
+ echo "domU-xx-xx-xx-xx-xx-xx.compute-1.internal:/home /mnt/home nfs rw 0 0" >> /etc/fstab
+ /etc/init.d/nfs-common restart
+ mkdir /mnt/home
+ mount /mnt/home
+ ln -s /mnt/home/chimpy /home/chimpy
+ </pre>
+
+ You should now be in business.
+
+ Performance tradeoffs should be small as long as you're just sending code files and gems around. *Don't* write out log entries or data to NFS partitions, or you'll effectively perform a denial-of-service attack on the master node.
+
+ ------------------------------
+
+ The "Setting up an NFS Server HOWTO":http://nfs.sourceforge.net/nfs-howto/index.html was an immense help, and I recommend reading it carefully.
data/doc/hadoop-setup.textile ADDED
@@ -0,0 +1,29 @@
+
+ h2. Hadoop on EC2
+
+ * http://www.cloudera.com/hadoop-ec2
+ * http://www.cloudera.com/hadoop-ec2-ebs-beta
+
+
+ h3. Set up NFS within the cluster
+
+ *
+ * http://nfs.sourceforge.net/nfs-howto/ar01s03.html
+
+
+ h3. Miscellaneous Hadoop Tips
+
+ * The Cloudera AMIs and distribution include BZip2 support. This means that if you have input files with a .bz2 extension, they will be naturally un-bzipped and streamed. (Note that there is a non-trivial penalty for doing so: each bzip'ed file must go, in whole, to a single mapper; and the CPU load for un-bzipping is sizeable.)
+
+ * To _produce_ bzip2 files, specify the new @--compress_output=@ flag. If you have the BZip2 patches installed, you can give @--compress_output=bz2@; everyone should be able to use @--compress_output=gz@. (An example invocation is sketched below.)
+
+ * For excellent performance you can patch your install for "Parallel LZO Splitting":http://www.cloudera.com/blog/2009/06/24/parallel-lzo-splittable-compression-for-hadoop/
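For example, producing gzipped output might look like the following (an illustrative invocation only; the script name and paths are placeholders, and the full set of flags is described in data/doc/usage.textile):

<pre>
./my_script.rb --run --compress_output=gz input_dir output_dir
</pre>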
+
+
+ h3. Tools for EC2 and S3 Management
+
+ * http://s3sync.net/wiki
+ * http://jets3t.s3.amazonaws.com/applications/applications.html#uploader
+ * "ElasticFox"
+ * "S3Fox (S3 Organizer)":
+ * "FoxyProxy":
data/doc/index.textile ADDED
@@ -0,0 +1,124 @@
+ ---
+ layout: default
+ title: mrflip.github.com/wukong
+ collapse: false
+ ---
+
+ h1(gemheader). wukong %(small):: hadoop made easy%
+
+
+ p(description). {{ site.description }}
+
+
+ Treat your dataset like a
+ * stream of lines when it's efficient to process by lines
+ * stream of field arrays when it's efficient to deal directly with fields
+ * stream of lightweight objects when it's efficient to deal with objects
+
+ Wukong is friends with "Hadoop":http://hadoop.apache.org/core the elephant, "Pig":http://hadoop.apache.org/pig/ the query language, and the @cat@ on your command line.
+
+ <notextile><div class="toggle"></notextile>
+
+ h2. How to write a Wukong script
+
+ Here's a script to count words in a text stream:
+
+ <pre><code>
+ require 'wukong'
+ module WordCount
+   class Mapper < Wukong::Streamer::LineStreamer
+     # Emit each word in the line.
+     def process line
+       words = line.strip.split(/\W+/).reject(&:blank?)
+       words.each{|word| yield [word, 1] }
+     end
+   end
+
+   class Reducer < Wukong::Streamer::ListReducer
+     def finalize
+       yield [ key, values.map(&:last).map(&:to_i).sum ]
+     end
+   end
+ end
+
+ Wukong::Script.new(
+   WordCount::Mapper,
+   WordCount::Reducer
+   ).run # Execute the script
+ </code></pre>
+
+ The first class, the Mapper, eats lines and craps @[word, count]@ records: word is the /key/, its count is the /value/.
+
+ In the reducer, the values for each key are stacked up into a list; then the record(s) yielded by @#finalize@ are emitted. There are many other ways to write the reducer (most of them are better) -- see the ["examples":examples/].
+
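To try the script out, run it directly. A Wukong script takes its input and output paths on the command line; the exact flags are described in data/doc/usage.textile, but an invocation looks roughly like this (the paths here are placeholders):

<pre><code>
# exercise the map and reduce phases by hand, using sort as a stand-in for hadoop:
cat input.txt | ./word_count.rb --map | sort | ./word_count.rb --reduce > counts.tsv
# or let wukong run it for you, locally or on a hadoop cluster:
./word_count.rb --run=local input.txt output_dir
./word_count.rb --run       input_dir output_dir
</code></pre>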
+ <notextile></div><div class="toggle"></notextile>
+
+ h2. Structured data stream
+
+ You can also use structs to treat your dataset as a stream of objects:
+
+ <pre><code>
+ require 'wukong'
+ require 'my_blog' # defines the blog models
+ # structs for our input objects
+ Tweet = Struct.new( :id, :created_at, :twitter_user_id,
+   :in_reply_to_user_id, :in_reply_to_status_id, :text )
+ TwitterUser = Struct.new( :id, :username, :fullname,
+   :homepage, :location, :description )
+ module TwitBlog
+   class Mapper < Wukong::Streamer::RecordStreamer
+     # Watch for tweets by me
+     MY_USER_ID = 24601
+     #
+     # If this tweet is by me, convert it to a Post.
+     #
+     # If it is a tweet not by me, convert it to a Comment that
+     # will be paired with the correct Post.
+     #
+     # If it is a TwitterUser, convert it to a User record and
+     # a user_location record
+     #
+     def process record
+       case record
+       when TwitterUser
+         user     = MyBlog::User.new.merge(record) # grab the fields in common
+         user_loc = MyBlog::UserLoc.new(record.id, record.location, nil, nil)
+         yield user
+         yield user_loc
+       when Tweet
+         if record.twitter_user_id == MY_USER_ID
+           post = MyBlog::Post.new.merge record
+           post.link  = "http://twitter.com/statuses/show/#{record.id}"
+           post.body  = record.text
+           post.title = record.text[0..65] + "..."
+           yield post
+         else
+           comment = MyBlog::Comment.new.merge record
+           comment.body    = record.text
+           comment.post_id = record.in_reply_to_status_id
+           yield comment
+         end
+       end
+     end
+   end
+ end
+ Wukong::Script.new( TwitBlog::Mapper, nil ).run # identity reducer
+ </code></pre>
+
+ <notextile></div><div class="toggle"></notextile>
+
+ h2. More info
+
+ There are many useful examples (including an actually-useful version of the WordCount script) in the examples/ directory.
+
+ h3. Authors
+
+ Philip (flip) Kromer (flip@infochimps.org)
+
+ Patches submitted by:
+ * gemified by Ben Woosley (ben.woosley@gmail.com)
+ * ruby interpreter path fix by "Yuichiro MASUI":http://github.com/masuidrive - masui@masuidrive.jp - http://blog.masuidrive.jp/
+
+ <notextile></div></notextile>
+
+ {% include news.html %}