wukong 0.1.1
- data/LICENSE.textile +107 -0
- data/README.textile +166 -0
- data/bin/cutc +30 -0
- data/bin/cuttab +5 -0
- data/bin/greptrue +8 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-du +81 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-ls +10 -0
- data/bin/hdp-mkdir +3 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +11 -0
- data/bin/hdp-sort +29 -0
- data/bin/hdp-stream +29 -0
- data/bin/hdp-stream-flat +18 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/md5sort +20 -0
- data/bin/tabchar +5 -0
- data/bin/uniqc +3 -0
- data/bin/wu-hist +3 -0
- data/bin/wu-lign +177 -0
- data/bin/wu-sum +30 -0
- data/doc/INSTALL.textile +41 -0
- data/doc/LICENSE.textile +107 -0
- data/doc/README-tutorial.textile +163 -0
- data/doc/README-wulign.textile +59 -0
- data/doc/README-wutils.textile +128 -0
- data/doc/TODO.textile +61 -0
- data/doc/UsingWukong-part1-setup.textile +2 -0
- data/doc/UsingWukong-part2-scraping.textile +2 -0
- data/doc/UsingWukong-part3-parsing.textile +132 -0
- data/doc/code/api_response_example.txt +20 -0
- data/doc/code/parser_skeleton.rb +38 -0
- data/doc/hadoop-nfs.textile +51 -0
- data/doc/hadoop-setup.textile +29 -0
- data/doc/index.textile +124 -0
- data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
- data/doc/links.textile +42 -0
- data/doc/overview.textile +91 -0
- data/doc/pig/PigLatinExpressionsList.txt +122 -0
- data/doc/pig/PigLatinReferenceManual.html +19134 -0
- data/doc/pig/PigLatinReferenceManual.txt +1640 -0
- data/doc/tips.textile +116 -0
- data/doc/usage.textile +102 -0
- data/doc/utils.textile +48 -0
- data/examples/README.txt +17 -0
- data/examples/and_pig/sample_queries.rb +128 -0
- data/examples/apache_log_parser.rb +53 -0
- data/examples/count_keys.rb +56 -0
- data/examples/count_keys_at_mapper.rb +57 -0
- data/examples/graph/adjacency_list.rb +74 -0
- data/examples/graph/breadth_first_search.rb +79 -0
- data/examples/graph/gen_2paths.rb +68 -0
- data/examples/graph/gen_multi_edge.rb +103 -0
- data/examples/graph/gen_symmetric_links.rb +53 -0
- data/examples/package-local.rb +100 -0
- data/examples/package.rb +96 -0
- data/examples/pagerank/README.textile +6 -0
- data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
- data/examples/pagerank/pagerank.rb +88 -0
- data/examples/pagerank/pagerank_initialize.rb +46 -0
- data/examples/pagerank/run_pagerank.sh +19 -0
- data/examples/rank_and_bin.rb +173 -0
- data/examples/run_all.sh +47 -0
- data/examples/sample_records.rb +44 -0
- data/examples/size.rb +60 -0
- data/examples/word_count.rb +95 -0
- data/lib/wukong.rb +11 -0
- data/lib/wukong/and_pig.rb +62 -0
- data/lib/wukong/and_pig/README.textile +12 -0
- data/lib/wukong/and_pig/as.rb +37 -0
- data/lib/wukong/and_pig/data_types.rb +30 -0
- data/lib/wukong/and_pig/functions.rb +50 -0
- data/lib/wukong/and_pig/generate.rb +85 -0
- data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
- data/lib/wukong/and_pig/junk.rb +51 -0
- data/lib/wukong/and_pig/operators.rb +8 -0
- data/lib/wukong/and_pig/operators/compound.rb +29 -0
- data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
- data/lib/wukong/and_pig/operators/execution.rb +15 -0
- data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
- data/lib/wukong/and_pig/operators/foreach.rb +98 -0
- data/lib/wukong/and_pig/operators/groupies.rb +212 -0
- data/lib/wukong/and_pig/operators/load_store.rb +65 -0
- data/lib/wukong/and_pig/operators/meta.rb +42 -0
- data/lib/wukong/and_pig/operators/relational.rb +129 -0
- data/lib/wukong/and_pig/pig_struct.rb +48 -0
- data/lib/wukong/and_pig/pig_var.rb +95 -0
- data/lib/wukong/and_pig/symbol.rb +29 -0
- data/lib/wukong/and_pig/utils.rb +0 -0
- data/lib/wukong/bad_record.rb +18 -0
- data/lib/wukong/boot.rb +47 -0
- data/lib/wukong/datatypes.rb +24 -0
- data/lib/wukong/datatypes/enum.rb +123 -0
- data/lib/wukong/dfs.rb +80 -0
- data/lib/wukong/encoding.rb +111 -0
- data/lib/wukong/extensions.rb +15 -0
- data/lib/wukong/extensions/array.rb +18 -0
- data/lib/wukong/extensions/blank.rb +93 -0
- data/lib/wukong/extensions/class.rb +189 -0
- data/lib/wukong/extensions/date_time.rb +24 -0
- data/lib/wukong/extensions/emittable.rb +82 -0
- data/lib/wukong/extensions/hash.rb +120 -0
- data/lib/wukong/extensions/hash_like.rb +119 -0
- data/lib/wukong/extensions/hashlike_class.rb +47 -0
- data/lib/wukong/extensions/module.rb +2 -0
- data/lib/wukong/extensions/pathname.rb +27 -0
- data/lib/wukong/extensions/string.rb +65 -0
- data/lib/wukong/extensions/struct.rb +17 -0
- data/lib/wukong/extensions/symbol.rb +11 -0
- data/lib/wukong/logger.rb +53 -0
- data/lib/wukong/models/graph.rb +27 -0
- data/lib/wukong/rdf.rb +104 -0
- data/lib/wukong/schema.rb +37 -0
- data/lib/wukong/script.rb +265 -0
- data/lib/wukong/script/hadoop_command.rb +111 -0
- data/lib/wukong/script/local_command.rb +14 -0
- data/lib/wukong/streamer.rb +13 -0
- data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
- data/lib/wukong/streamer/base.rb +76 -0
- data/lib/wukong/streamer/count_keys.rb +30 -0
- data/lib/wukong/streamer/count_lines.rb +26 -0
- data/lib/wukong/streamer/filter.rb +20 -0
- data/lib/wukong/streamer/line_streamer.rb +12 -0
- data/lib/wukong/streamer/list_reducer.rb +20 -0
- data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
- data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
- data/lib/wukong/streamer/set_reducer.rb +14 -0
- data/lib/wukong/streamer/struct_streamer.rb +48 -0
- data/lib/wukong/streamer/summing_reducer.rb +29 -0
- data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
- data/lib/wukong/typed_struct.rb +12 -0
- data/lib/wukong/wukong_class.rb +21 -0
- data/spec/bin/hdp-wc_spec.rb +4 -0
- data/spec/spec_helper.rb +0 -0
- data/wukong.gemspec +179 -0
- metadata +214 -0
data/doc/tips.textile
ADDED
@@ -0,0 +1,116 @@

h3. Don't Drop ACID while exploring Big Data

The traditional "ACID quartet":http://en.wikipedia.org/wiki/ACID for relational databases can be re-interpreted in a Big Data context:

* A -- Associative
* C -- Commutative
* I -- Idempotent
* D -- Distributed
* (*) -- (and where possible, left in sort order)

Finally, where possible leave things in sort order by some appropriate index. Clearly I'm not talking about introducing extra unnecessary sorts on ephemeral data. For things that will be read (and experimented with) much more often than they're written, though, it's worth running a final sort. Now you can:

* Efficiently index into a massive dataset with binary search (see the sketch below)
* Do a direct merge sort on two files with the same sort order
* Run a reducer directly across the data
* Assign a synthetic key by just serially numbering lines (see the "Keys" section below for how each mapper can get a unique prefix)

Note: for files that will live on the DFS, you should usually *not* do a total sort,
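
For instance, here is a minimal sketch of the binary-search trick (not part of Wukong; the @look_up@ helper is illustrative). It assumes a newline-terminated file sorted lexically -- plain @sort@, no @-n@ -- on its first tab-separated field, and probes by byte offset so a lookup touches only a handful of pages even in a multi-GB file:

{% highlight ruby %}
# Binary-search a file sorted on its first (tab-separated) field.
def look_up(path, target)
  File.open(path) do |file|
    lo, hi = 0, File.size(path)
    while lo < hi
      mid = (lo + hi) / 2
      file.seek(mid)
      file.gets unless mid.zero?      # skip the partial line we landed inside
      line = file.gets
      if line && line.split("\t", 2).first < target
        lo = mid + 1
      else
        hi = mid
      end
    end
    file.seek(lo)
    file.gets unless lo.zero?         # realign to a line boundary
    line = file.gets
    return line.chomp if line && line.split("\t", 2).first == target
  end
  nil
end
{% endhighlight %}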

h3. If it's not broken, it's wrong

Something that goes wrong one in five million times will crop up hundreds of times in a billion-record collection.

h3. Error is not normally distributed

What's more, errors introduced will not in general be normally distributed, and their impact may not decrease with increasing data size.

h3. Encode once, and carefully

Encoding violates idempotence. Data brought in from elsewhere *must* be considered unparsable, ill-formatted and rife with illegal characters.

* Immediately fix a copy of the original data with as minimal encoding as possible.
* Follow this with a separate parse stage to emit perfectly well-formed, tab-separated / newline-delimited data.
* In this parse stage, encode the data to 7 bits, free of internal tabs, backslashes, carriage returns/line feeds or control characters. You want your encoding scheme to be
** perfectly reversible
** widely implemented
** easily parseable
** recognizable: incoming data that is mostly inoffensive (a JSON record, or each line of a document such as this one) should be minimally altered from its original. This lets you do rough exploration with sort/cut/grep and friends.
** quoting-free: involve **NO QUOTING**, only escaping. A simple regexp can decode entities such as %10 or \n; it behaves harmlessly with ill-formed data (e.g. %%10, &&; or a \ at end of line) and is robust against data being split or interpolated. Schemes such as "quoting: it's bad", %Q{quoting: "just say no"} or <notextile>tagged markup</notextile> require a recursive parser, and an extra or missing quote mark is almost impossible to backtrack from. (A minimal escaping sketch follows this list.)
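
To make the escaping-not-quoting point concrete, here is a minimal sketch. It is not Wukong's actual @encoding@ module; the entity set (backslash escapes for tab, newline, CR and backslash) is purely illustrative:

{% highlight ruby %}
# Escape-only encoder/decoder: no quoting, every special character maps to a
# fixed entity, so decoding is a single gsub that never needs to backtrack.
ENCODE = { "\\" => '\\\\', "\t" => '\t', "\n" => '\n', "\r" => '\r' }
DECODE = ENCODE.invert

def encode_field(str)
  str.gsub(/[\\\t\n\r]/) { |ch| ENCODE[ch] }
end

def decode_field(str)
  str.gsub(/\\[\\tnr]/) { |ent| DECODE[ent] }
end

encoded = encode_field("line one\nline\ttwo")
puts encoded                                          # line one\nline\ttwo (literal backslashes)
puts decode_field(encoded) == "line one\nline\ttwo"   # true
{% endhighlight %}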

In the absence of some lightweight, mostly-transparent, ASCII-compatible *AND* idempotent encoding scheme lurking in a back closet of some algorithms book -- how do you handle the initial lousy payload coming off the wire?

* For data that is *mostly* text in a western language, you'll do well with XML encoding (with <notextile>[\n\r\t\\]</notextile> forced to encode as entities).
* URL encoding isn't as recognizable, but is also safe. Use this for things like URIs and filenames, or if you want to be /really/ paranoid about escaping.
* For binary data, Binhex is efficient enough and every toolkit can handle it. There are more data-efficient ASCII-compatible encoding schemes, but they're not worth the hassle for the 10% or whatever gain in size.
* If your payload itself is XML data, consider using \0 (nul) between records, with a fixed number of tab-separated metadata fields leading the XML data, which can then include tabs, newlines, or whatever the hell it wants. No changes are made to the data apart from a quick gsub to remove any (highly illegal) \0 in the XML data itself. A later parse round will convert it to structured hadoop-able data (see the sketch after the example). Ex:

{% highlight html %}
feed_request 20090809101112 200 OK <?xml version='1.0' encoding='utf-8' ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang='en' xml:lang='en' xmlns='http://www.w3.org/1999/xhtml'>
<head>
<title>infochimps.org — Find Any Dataset in the World</title>
{% endhighlight %}

p. Many of the command line utilities (@cat@, @grep@, etc.) will accept nul-delimited files.
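
A sketch of what writing and re-reading such records looks like. The field names mirror the example above but are illustrative; the only transformation of the payload is the \0 scrub just described:

{% highlight ruby %}
# Write one nul-terminated record: tab-separated metadata, then the raw XML.
def emit_record(out, rsrc, timestamp, status, message, xml_payload)
  payload = xml_payload.delete("\0")     # \0 is illegal in the payload anyway
  out.print [rsrc, timestamp, status, message, payload].join("\t"), "\0"
end

# Read it back: set the record separator to \0 and split off the leading
# metadata fields; the payload keeps its tabs and newlines intact.
def each_record(io)
  io.each_line("\0") do |rec|
    rsrc, timestamp, status, message, payload = rec.chomp("\0").split("\t", 5)
    yield rsrc, timestamp, status, message, payload
  end
end
{% endhighlight %}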

You may be tempted to use XML around your XML so you can XML while you XML, but this is ultimately only done right by parsing or scrubbing the input, and at that point you should just translate directly to a reasonable tab/newline format (even if that format is tsv-compatible JSON).

h3. Do your exception handling in-band

A large, heavily-used cluster will want to have Ganglia or "scribe":http://www.cloudera.com/blog/2008/11/02/configuring-and-using-scribe-for-hadoop-log-collection/ or the like collecting and managing log data. "Splunk":http://www.splunk.com/ is a compelling option I haven't myself used, but it is "broadly endorsed.":http://www.igvita.com/2008/10/22/distributed-logging-syslog-ng-splunk/

However, it's worth considering another extremely efficient, simple and powerful distributed system for routing massive quantities of data in a structured way, namely wukong|hadoop itself.

Wukong gives you a BadRecord class -- just rescue errors, pass in the full or partial contents of the offending input, and emit the BadRecord instance in-band. They'll be serialized out along with the rest, and at your preference can be made to reduce to a single instance. Do analysis on them at your leisure; by default, any StructStreamer will silently discard *inbound* BadRecords -- they won't survive past the current generation.
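
In a mapper this looks roughly like the following. The stand-in @parse@ helper is illustrative, and the @BadRecord@ constructor arguments shown are a guess at the interface -- check @lib/wukong/bad_record.rb@ for the actual signature:

{% highlight ruby %}
require 'wukong'

class ParserMapper < Wukong::Streamer::LineStreamer
  # A stand-in parser: insists on exactly four tab-separated fields.
  def parse line
    fields = line.chomp.split("\t")
    raise "expected 4 fields, got #{fields.size}" unless fields.size == 4
    fields
  end

  def process line
    yield parse(line)
  rescue StandardError => e
    # Emit the failure in-band; it is serialized along with the good records
    # and, by default, silently discarded by any downstream StructStreamer.
    yield Wukong::BadRecord.new(e.message, line)
  end
end
{% endhighlight %}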

h3. Don't be afraid to use the command line as an IDE

{% highlight sh %}
cat /data/foo.tsv | ruby -ne 'puts $_.chomp.scan(/text="([^"]+)"/).join("\t")'
{% endhighlight %}

h3. Keys

* Artificial key: assigned externally; the key is not a function of the object's intrinsic values. A social security number is an artificial key.

* Natural key: minimal subset of fields with _intrinsic semantic value_ that _uniquely identify_ the record. My name isn't unique, but my fingerprint is both unique and intrinsic. Given the object (me) you can generate the key, and given the key there's exactly one object (me) that matches.

h4. Other fields

* Mutable:
** A user's 'bio' section.

* Immutable:
** A user's created_at date is immutable: it doesn't help identify the person, but it will never change.

h4. Natural keys are right for big data

Synthetic keys suck. They demand locality or a central keymaster. Instead:

* Use the natural key
* Hash the natural key. This has some drawbacks

OK, fine. You need a synthetic key. Your options:

* Do a total sort, and use @nl@
* Generate
* Use a single reducer to reduce locality. YUCK.
* Have each mapper generate a unique prefix; number each line as "prefix#{line_number}" or whatever.

How do you get a unique prefix?

* Distribute a unique prefix to each mapper out-of-band. People using Streaming are out of luck.
* Use a UUID -- that's what they're for. Drawback: ridiculously long.
* Hash the machine name, PID and timestamp to something short. Check after the fact that uniqueness was achieved. Use the birthday party formula to find out how often this will happen. (In practice, almost never.) See the sketch below.
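
Here is a sketch of that last approach; the digest length and key format are arbitrary choices, not anything Wukong prescribes:

{% highlight ruby %}
require 'socket'
require 'digest/md5'

# Build a short, almost-certainly-unique prefix for this mapper from the
# machine name, process id and start time, then number lines beneath it.
prefix = Digest::MD5.hexdigest(
  [Socket.gethostname, Process.pid, Time.now.to_f].join('|'))[0, 8]

ARGF.each_with_index do |line, idx|
  puts ["#{prefix}#{'%09d' % idx}", line.chomp].join("\t")
end

# Collision odds (birthday problem): with m mappers drawing 8 hex chars
# (16**8 ~ 4.3e9 values), P(collision) ~ 1 - exp(-m*(m-1) / (2 * 16**8)),
# about 1e-5 for a 300-mapper job.
{% endhighlight %}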

Working with records that change over time,
data/doc/usage.textile
ADDED
@@ -0,0 +1,102 @@
---
layout: default
title: Usage notes
---

h1(gemheader). {{ site.gemname }} %(small):: usage%


<notextile><div class="toggle"></notextile>

h2. How to run a Wukong script

To run your script using local files and no connection to a hadoop cluster:

pre. your/script.rb --run=local path/to/input_files path/to/output_dir

To run the command across a Hadoop cluster:

pre. your/script.rb --run=hadoop path/to/input_files path/to/output_dir

You can set the default in the config/wukong-site.yaml file, and then just use @--run@ instead of @--run=something@ -- it will just use the default run mode.

If you're running @--run=hadoop@, all file paths are HDFS paths. If you're running @--run=local@, all file paths are local paths. (Your script path, of course, lives on the local filesystem.)

You can supply arbitrary command line arguments (they wind up as key-value pairs in the options your mapper and reducer receive), and you can use the hadoop syntax to specify more than one input file:

pre. ./path/to/your/script.rb --any_specific_options --options=can_have_vals \
  --run "input_dir/part_*,input_file2.tsv,etc.tsv" path/to/output_dir

Note that all @--options@ must precede (in any order) all non-options. A minimal script looks like the sketch below.
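
Here is the shape of a minimal mapper-only script, following the pattern of the bundled @examples/apache_log_parser.rb@; the module name and the tokenizing regexp are illustrative:

{% highlight ruby %}
#!/usr/bin/env ruby
require 'wukong'

module WordMapperExample
  class Mapper < Wukong::Streamer::LineStreamer
    # Emit a [word, 1] pair for every word on the line.
    def process line
      line.downcase.scan(/\w+/).each { |word| yield [word, 1] }
    end
  end
end

# Mapper class, reducer class (nil here, as in examples/apache_log_parser.rb;
# pipe the output through sort | uniq -c, or supply your own reducer).
Wukong::Script.new(WordMapperExample::Mapper, nil).run
{% endhighlight %}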

<notextile></div><div class="toggle"></notextile>

h2. How to test your scripts

To run the mapper on its own:

pre. cat ./local/test/input.tsv | ./examples/word_count.rb --map | more

or, if your test data lies on the HDFS,

pre. hdp-cat test/input.tsv | ./examples/word_count.rb --map | more

Next, graduate to @--run=local@ mode so you can inspect the reducer.

<notextile></div><div class="toggle"></notextile>

h2. What tools does Wukong work with?

Wukong is friends with "Hadoop":http://hadoop.apache.org/core the elephant, "Pig":http://hadoop.apache.org/pig/ the query language, and the @cat@ on your command line. We're looking forward to being friends with "martinis":http://datamapper.org and "express trains":http://wiki.rubyonrails.org/rails/pages/ActiveRecord down the road.

<notextile></div><div class="toggle"></notextile>

h2. Design

...

<notextile></div><div class="toggle"></notextile>

h2. Caveats

...

<notextile></div><div class="toggle"></notextile>

h2. TODOs

...

<notextile></div><div class="toggle"></notextile>

h2. Note on Patches/Pull Requests

* Fork the project.
* Make your feature addition or bug fix.
* Add tests for it. This is important so I don't break it in a future version unintentionally.
* Commit, but do not mess with the rakefile, version, or history. (If you want to have your own version, that is fine, but bump the version in a commit by itself so I can ignore it when I pull.)
* Send me a pull request. Bonus points for topic branches.

<notextile></div><div class="toggle"></notextile>

h2. Endnotes

h3. Why is it called Wukong?

Hadoop, as you may know, is "named after a stuffed elephant.":http://en.wikipedia.org/wiki/Hadoop Since Wukong was started by the "infochimps":http://infochimps.org team, we needed a simian analog. A Monkey King who journeyed to the land of the Elephant seems to fit the bill:

bq. Sun Wukong (孙悟空), known in the West as the Monkey King, is the main character in the classical Chinese epic novel Journey to the West. In the novel, he accompanies the monk Xuanzang on the journey to retrieve Buddhist sutras from India.

bq. Sun Wukong possesses incredible strength, being able to lift his 13,500 jīn (8,100 kg) Ruyi Jingu Bang with ease. He also has superb speed, traveling 108,000 li (54,000 kilometers) in one somersault. Sun knows 72 transformations, which allow him to transform into various animals and objects; he is, however, shown with slight problems transforming into other people, since he is unable to complete the transformation of his tail. He is a skilled fighter, capable of holding his own against the best generals of heaven. Each of his hairs possesses magical properties, and is capable of transforming into a clone of the Monkey King himself, or various weapons, animals, and other objects. He also knows various spells to command wind, part water, conjure protective circles against demons, and freeze humans, demons, and gods alike. -- ["Sun Wukong's Wikipedia entry":http://en.wikipedia.org/wiki/Wukong]

The "Jaime Hewlett / Damon Albarn short":http://news.bbc.co.uk/sport1/hi/olympics/monkey that the BBC made for their 2008 Olympics coverage gives the general idea.

* What's up with Wukong::AndPig?
** @Wukong::AndPig@ is a small library to more easily generate code for the "Pig":http://hadoop.apache.org/pig data analysis language. See its "README":wukong/and_pig/README.textile for more.

<notextile></div></notextile>
data/doc/utils.textile
ADDED
@@ -0,0 +1,48 @@

<something to tab and align table>


* uniq - report or filter out repeated lines in a file
** -c produces line<tab>count
** --ignore f1,f2,... discards given fields from consideration. Field syntax same as for cut, etc.

* sort - sort lines of text files
** columns indexed as tab-separated
** can specify any column order; uses the same field spec as cut
* tsort - topological sort of a directed graph

* cut - select portions of each line of a file
** can reorder columns
* nl - line numbering filter
** takes prefix, suffix
** count \t line -OR- line \t count

* wc - word, line, character, and byte count
** field count (tab-separated fields)
* paste - merge corresponding or subsequent lines of files
* expand, unexpand - expand tabs to spaces, and vice versa
* seq
* simple row, column sums
* join - relational database operator
* tac

* cat - concatenate and print files
* head - display first lines of a file
* tail - display the last part of a file
* shuf
* split - split a file into pieces
* csplit - split files based on context
* tee - pipe fitting

* ls - list directory contents
* df - display free disk space
* du - display disk usage statistics
** tab-delimited, space aligned

* od - octal, decimal, hex, ASCII dump
* printf - formatted output
* cksum, sum - display file checksums and block counts
* md5sum

* diff
* comm
data/examples/README.txt
ADDED
@@ -0,0 +1,17 @@
Examples:

* sample_records -- extract a random sample from a collection of data

* word_count

* apache_log_parser -- example for parsing standard apache webserver log files.

* wordchains -- solving a word puzzle using breadth-first search of a graph

* graph -- some generic graph scripts (adjacency lists, breadth-first search, etc.)

* pagerank -- use the pagerank algorithm to find the most 'interesting' (central) nodes of a network graph
data/examples/and_pig/sample_queries.rb
ADDED
@@ -0,0 +1,128 @@
#!/usr/bin/env ruby
$: << File.dirname(__FILE__) + '/../../lib'
require 'wukong'         ; include Wukong
require 'wukong/and_pig' ; include Wukong::AndPig

# PIG_DIR = '/usr/local/share/pig'
PIG_DIR = '/public/share/pig'
# full pathname to the pig executable
# Wukong::AndPig::PIG_EXECUTABLE = "#{PIG_DIR}/bin/pig"
Wukong::AndPig::PIG_EXECUTABLE = "/public/bin/pig -x local"

#
HDFS_BASE_DIR = 'foo/meta/lang'
Wukong::AndPig::PigVar.working_dir = HDFS_BASE_DIR
Wukong::AndPig.comments = false
# Wukong::AndPig.emit_dest = :captured

Wukong::AndPig::PigVar.emit "REGISTER #{PIG_DIR}/contrib/piggybank/java/piggybank.jar"

#
# Load basic types
#

# class Token < Struct.new(:rsrc, :context, :user_id, :token, :usages)
# end
# :tokens_users_0 << Token.pig_load('meta/datanerds/token_count/users_tokens')
# :tokens_users_0 << Token.pig_load('/tmp/users_tokens.tsv')
# :tokens_users   << :tokens_users_0.generate(:user_id, :token, :usages)
# :tokens_users.checkpoint!

class Token < TypedStruct.new(
    [:user_id, Integer], [:token, String], [:usages, Integer])
end
:tokens_users << Token.pig_load('/tmp/users_tokens.tsv')
:tokens_users.describe

pig_comment %Q{
  # ***************************************************************************
  #
  # Global totals
  #
  # Each row in Tokens lists a (user, token, usages)
  # We want
  #   Sum of all usage counts  = total tokens seen in tweet stream.
  #   Number of distinct tokens
  #   Number of distinct users <- different than total in twitter_users.tsv
  #                               because we want only users that say stuff.
}

def count_distinct relation, field, options={}
  result_name = options[:as] || "#{relation.name}_#{field}_count".to_sym
  a = relation.
    generate(field).set!.describe.
    distinct(options).set!
  result_name << a.
    group(:all).set!.
    generate(["COUNT(#{a.relation}.#{field})", :u_count, Integer]).set!
end

pig_comment "Count Users"
tok_users_count  = count_distinct(:tokens_users, :user_id).checkpoint!

pig_comment "Count Tokens"
tok_tokens_count = count_distinct(:tokens_users, :token, :parallel => 10).checkpoint!


pig_comment %Q{
  # ***************************************************************************
  #
  # Statistics for each user
}

def user_stats users_tokens
  users_tokens.describe.
    group( :user_id).set!.describe.
    generate(
      [:group,                                       :user_id],
      ["(int)COUNT(#{users_tokens.relation})",       :tot_tokens, Integer],
      ["(int)SUM(#{users_tokens.relation}.usages)",  :tot_usages, Integer],
      ["FLATTEN(#{users_tokens.relation}.token)",    :token,      String ],
      ["FLATTEN(#{users_tokens.relation}.usages)",   :usages,     Integer]).set!.describe.
      # ["FLATTEN(#{users_tokens.relation}.(token, usages) )", [:token, :usages], TypedStruct.new([:token, String], [:usages, Integer])]).set!.
    generate(:user_id, :token, :usages,
      ["(float)(1.0*usages / tot_usages)",                                    :usage_pct,    Float],
      ["(float)(1.0*usages / tot_usages) * (1.0*(float)usages / tot_usages)", :usage_pct_sq, Float]).set!
end

:user_stats << user_stats(:tokens_users)
:user_stats.describe.checkpoint!
puts "UserStats = LOAD 'foo/meta/lang/user_stats' AS (user_id, token, usages, usage_pct, usage_pct_sq) ;"

UserStats = TypedStruct.new([:user_id,      Integer],
                            [:token,        String],
                            [:usages,       Integer],
                            [:usage_pct,    Float],
                            [:usage_pct_sq, Float])
:user_stats << UserStats.pig_load('foo/meta/lang/user_stats')

def range_and_dispersion user_stats

  n_users  = 436
  n_tokens = 61630

  token_stats = user_stats.group(:token).set!
  token_stats = token_stats.foreach(
    ["(float)SUM(#{user_stats.relation}.usage_pct) / #{n_users.to_f}", :avg_uspct   ],
    ["(float)SUM(#{user_stats.relation}.usage_pct_sq)",                :sum_uspct_sq],
    ["org.apache.pig.piggybank.evaluation.math.SQRT(
        (sum_uspct_sq /436) -
        ( (SUM(#{user_stats.relation}.usage_pct)/436.0) * (SUM(#{user_stats.relation}.usage_pct)/436.0) )
      )", :stdev_uspct],
    ["1 - ( ( stdev_uspct / avg_uspct ) / org.apache.pig.piggybank.evaluation.math.SQRT(436.0 - 1.0) )", :dispersion],
    [
      [:group,                                                                  :token,      String ],
      ["(int)COUNT(#{user_stats.relation}) ",                                   :range,      Integer],
      ["(int)COUNT(#{user_stats.relation}) / #{n_users.to_f}",                  :pct_range,  Integer],
      ["(int)SUM( #{user_stats.relation}.usages)",                              :tot_usages, Integer],
      ["(float)( 1.0e6*SUM(#{user_stats.relation}.usages) / #{n_tokens.to_f})", :ppm_usages, Float],
      [:avg_uspct,   :avg_uspct],
      [:stdev_uspct, :stdev_uspct],
      [:dispersion,  :dispersion]
    ]
  ).set!
end

range_and_dispersion(:user_stats).checkpoint!

Wukong::AndPig.finish
data/examples/apache_log_parser.rb
ADDED
@@ -0,0 +1,53 @@
#!/usr/bin/env ruby
$: << File.dirname(__FILE__)+'/../lib'
require 'wukong'

module ApacheLogParser
  class Mapper < Wukong::Streamer::LineStreamer

    # Split the request string ("GET /path HTTP/1.1") into method, path and
    # protocol; return the raw string in the first slot if it doesn't parse.
    def parse_request req
      m = %r{\A(\w+) (.*) (\w+/[\w\.]+)\z}.match(req)
      if m
        [''] + m.captures
      else
        [req, '', '', '']
      end
    end

    # regular expression to match on apache-style log lines
    # IP addr - - [07/Jun/2008:20:37:11 +0000] 400 "GET /faq" + gaJsHost + "google-analytics.com/ga.js HTTP/1.1" 173 "-" "-" "-"
    LOG_RE = %r{\A(\d+\.\d+\.\d+\.\d+) ([^\s]+) ([^\s]+) \[(\d\d/\w+/\d+):(\d\d:\d\d:\d\d)([^\]]*)\] (\d+) "([^\"]*(?:\" \+ gaJsHost \+ \"[^\"]*)?)" (\d+) "([^\"]*)" "([^\"]*)" "([^\"]*)"\z}

    def process line
      line = line.chomp
      m = LOG_RE.match(line)
      if m
        ip, j1, j2, datepart, timepart, tzpart, resp, req, j3, ref, ua, j4 = m.captures
        req_date = DateTime.parse("#{datepart} #{timepart} #{tzpart}").to_flat
        req, method, path, protocol = parse_request(req)
        yield [:logline,     method, path, protocol, ip, j1, j2, req_date, resp, req, j3, ref, ua, j4]
      else
        yield [:unparseable, line]
      end
    end
  end

  class Reducer < Wukong::Streamer::LineStreamer
  end

  # Execute the script
  class Script < Wukong::Script
    def reduce_command
      "/usr/bin/uniq"
    end
    def default_options
      super.merge :sort_fields => 8 # , :reduce_tasks => 0
    end
  end

  Script.new(Mapper, nil).run
end

# 55.55.155.55 - - [04/Feb/2008:11:37:52 +0000] 301 "GET /robots.txt HTTP/1.1" 185 "-" "WebAlta Crawler/2.0 (http://www.webalta.net/ru/about_webmaster.html) (Windows; U; Windows NT 5.1; ru-RU)" "-"