wukong 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,111 @@
1
+ # -*- coding: utf-8 -*-
2
module Wukong
  #
  # Assembles the shell command line used to launch a Hadoop streaming job.
  #
  # The methods here read +options+ (a hash keyed by the simplified option
  # names in HADOOP_OPTIONS_MAP, plus :extra_args), +map_command+ and
  # +reduce_command+; those must be supplied by whatever includes this module.
  #
  module HadoopCommand

    # ===========================================================================
    #
    # Hadoop Options
    #

    #
    # Translate the simplified args to their hairy-assed hadoop equivalents
    #
    HADOOP_OPTIONS_MAP = {
      :max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
      :max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
      :map_tasks              => 'mapred.map.tasks',
      :reduce_tasks           => 'mapred.reduce.tasks',
      :sort_fields            => 'stream.num.map.output.key.fields',
      :key_field_separator    => 'map.output.key.field.separator',
      :partition_fields       => 'num.key.fields.for.partition',
      :output_field_separator => 'stream.map.output.field.separator',
      :map_speculative        => 'mapred.map.tasks.speculative.execution',
      :timeout                => 'mapred.task.timeout',
    }

    #
    # Emit a -jobconf hadoop option if the simplified option is present.
    #
    # Returns nil when the option is absent; callers build arrays of these and
    # the nils are removed by the +compact+ in #hadoop_command.
    #
    # Only a missing (nil) value is treated as absent: an explicit +false+ is
    # passed through as "false" so that boolean settings such as
    # :map_speculative can actually be switched off.
    #
    def jobconf(option)
      value = options[option]
      return if value.nil?
      "-jobconf %s=%s" % [HADOOP_OPTIONS_MAP[option], value]
    end

    # Define what fields hadoop should treat as the keys
    def hadoop_sort_args
      [
        jobconf(:key_field_separator),
        jobconf(:sort_fields),
      ]
    end

    # Define what fields hadoop should use to distribute records to reducers.
    # Returns nil (later compacted away) unless :partition_fields is set.
    def hadoop_partition_args
      if options[:partition_fields]
        [
          '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
          jobconf(:output_field_separator),
          jobconf(:partition_fields),
        ]
      end
    end

    # Emit options for setting the number of mappers and reducers.
    def hadoop_num_tasks_args
      [
        jobconf(:max_node_map_tasks),
        jobconf(:max_node_reduce_tasks),
        jobconf(:map_tasks),
        jobconf(:reduce_tasks)
      ]
    end

    # Free-form :extra_args (passed through verbatim) followed by the
    # remaining jobconf settings that don't belong to any group above.
    def hadoop_other_args
      extra_str_args = [ options[:extra_args] ]
      extra_hsh_args = [:map_speculative, :timeout].map{|opt| jobconf(opt) }
      extra_str_args + extra_hsh_args
    end

    #
    # Assemble the hadoop command to execute
    #
    # Returns a single string: the individual arguments joined with a
    # backslash-newline so the command reads as one continued shell line.
    #
    def hadoop_command(input_path, output_path)
      # If this is wrong, create a config/wukong-site.rb or
      # otherwise set Wukong::CONFIG[:hadoop_home] to the
      # root of your config install.
      hadoop_program = Wukong::CONFIG[:hadoop_home]+'/bin/hadoop'
      [
        hadoop_program,
        "jar #{Wukong::CONFIG[:hadoop_home]}/contrib/streaming/hadoop-*-streaming.jar",
        hadoop_partition_args,
        hadoop_sort_args,
        hadoop_num_tasks_args,
        "-mapper '#{map_command}'",
        "-reducer '#{reduce_command}'",
        "-input '#{input_path}'",
        "-output '#{output_path}'",
        hadoop_other_args,
      ].flatten.compact.join(" \t\\\n ")
    end

  end
end
93
+
94
+
95
+ # -inputformat <name of inputformat (class)> (“auto” by default)
96
+ # -input <additional DFS input path>
97
+ # -python <python command to use on nodes> (“python” by default)
98
+ # -name <job name> (“program.py” by default)
99
+ # -numMapTasks <number>
100
+ # -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
101
+ # -priority <priority value> (“NORMAL” by default)
102
+ # -libjar <path to jar> (this jar gets put in the class path)
103
+ # -libegg <path to egg> (this egg gets put in the Python path)
104
+ # -file <local file> (this file will be put in the dir where the python program gets executed)
105
+ # -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
106
+ # -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
107
+ # -cmdenv <env var name>=<value>
108
+ # -jobconf <property name>=<value>
109
+ # -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
110
+ # -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
111
+ # -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
@@ -0,0 +1,14 @@
1
module Wukong
  #
  # Builds the shell pipeline used to run a script locally, without hadoop.
  #
  # Relies on +map_command+ and +reduce_command+ being provided by whatever
  # includes this module.
  #
  module LocalCommand

    # ===========================================================================
    #
    # Local execution Options
    #

    # Returns a shell pipeline string: cat the input through the mapper, sort
    # the mapper output, feed it to the reducer and redirect the result to
    # +output_path+. Only the output path is quoted; the input path is
    # interpolated as-is.
    def local_command(input_path, output_path)
      pipeline = " cat #{input_path} | #{map_command} | sort | #{reduce_command}"
      "#{pipeline} > '#{output_path}'"
    end

  end
end
@@ -0,0 +1,13 @@
1
module Wukong
  #
  # Namespace for the streamer classes. Each constant listed below is
  # registered for autoload, so its file is only required the first time the
  # constant is referenced.
  #
  module Streamer
    [
      # core streamers
      [:Base,                'wukong/streamer/base'],
      [:LineStreamer,        'wukong/streamer/line_streamer'],
      [:StructStreamer,      'wukong/streamer/struct_streamer'],
      # mixins
      [:Filter,              'wukong/streamer/filter'],
      # reducers
      [:AccumulatingReducer, 'wukong/streamer/accumulating_reducer'],
      [:ListReducer,         'wukong/streamer/list_reducer'],
      [:UniqByLastReducer,   'wukong/streamer/uniq_by_last_reducer'],
    ].each do |const_name, path|
      autoload const_name, path
    end
  end
end
@@ -0,0 +1,89 @@
1
module Wukong
  module Streamer

    #
    # AccumulatingReducer makes it easy to apply one operation across all
    # occurrences of each key.
    #
    # For every run of consecutive records sharing a key it calls #start! on
    # the first record, #accumulate on each record (including the first), and
    # #finalize once the key changes or the input ends.
    #
    # See ListReducer and CountKeys for examples.
    #
    # Make sure you don't have the bad luck, bad judgement or bad approach to
    # accumulate more data than your box can hold before finalizing.
    #
    class AccumulatingReducer < Wukong::Streamer::Base
      # The key currently being accumulated; holds the sentinel
      # :__first_pass__ until the first record has been seen.
      attr_accessor :key

      # Accepts the option hash from the script runner (defaults to empty,
      # matching Base) and marks the reducer as not having seen any record yet.
      def initialize(options={})
        super options
        self.key = :__first_pass__
      end

      #
      # Override for multiple-field keys, etc.
      #
      # Note that get_key is called by +process+ -- so the arguments have
      # already been +recordize+d. In particular, if you are using
      # StructRecordizer (or StructStreamer), you can write this as
      #
      #   def get_key(thing) thing.id.to_i ; end
      #
      # or whatever. By default the key is the record's first field.
      #
      def get_key(*record)
        record.first
      end

      #
      # Accumulate all records for a given key.
      #
      # When a record with a different key arrives, finalize what was
      # collected for the previous key (passing along the emitting block),
      # adopt the new key and call start! before accumulating the record.
      #
      def process(*args, &block)
        this_key = get_key(*args)
        if this_key != self.key                # if this is a new key,
          unless self.key == :__first_pass__   # (nothing to flush before the very first record)
            finalize(&block)                   # process what we've collected so far
          end
          self.key = this_key                  # adopt the new key
          start!(*args)                        # and set up for the next accumulation
        end
        # collect the current record
        accumulate(*args, &block)
      end

      #
      # start! is called on the first record of each new key, with that record
      # as its arguments. Subclasses must override it, and must not call super.
      #
      def start!(*args)
        raise %Q{start! is the new reset! -- it has args now, namely the first
record of the new key. It doesn\'t want #super either}
      end

      #
      # Override this to accumulate each record for the given key in turn.
      #
      def accumulate(*args, &block)
        raise "override the accumulate method in your subclass"
      end

      #
      # Called once per key, after its last record; yield the records to emit.
      #
      # You must override this method.
      #
      def finalize
        raise "override the finalize method in your subclass"
      end

      #
      # Must make sure to finalize the last-seen accumulation.
      #
      # If the input was empty no key was ever adopted and start! never ran,
      # so there is nothing to finalize and nothing is emitted.
      #
      # Note: this final finalize runs after Base#stream has returned, i.e.
      # after +after_stream+ has already been called.
      #
      def stream
        super
        return if self.key == :__first_pass__
        finalize(){|record| emit record }
      end
    end

  end
end
@@ -0,0 +1,76 @@
1
module Wukong
  module Streamer
    #
    # Base streamer: reads lines from $stdin, turns each into a record with
    # #recordize, hands the record to #process, and writes every record that
    # #process yields using #emit.
    #
    # Subclasses must override #process; the other methods are hooks with
    # usable defaults.
    #
    class Base

      # Options, initially set from the command-line args -- see
      # Script#process_argv!
      attr_accessor :options

      #
      # Accepts option hash from script runner
      #
      def initialize(options={})
        self.options = options
      end

      #
      # Pass each record to +#process+
      #
      # Lines are chomped before being recordized; a line whose recordize
      # result is nil/false is skipped. #before_stream and #after_stream are
      # invoked around the loop.
      #
      def stream
        before_stream
        $stdin.each do |line|
          record = recordize(line.chomp)
          next unless record
          process(*record) do |output_record|
            emit output_record
          end
        end
        after_stream
      end

      # Called exactly once, before streaming begins
      def before_stream
      end

      # Called exactly once, after streaming completes
      def after_stream
      end

      #
      # Default recordizer: returns array of fields by splitting at tabs.
      # Returns nil (so the line is skipped by #stream) if splitting raises.
      #
      def recordize(line)
        line.split("\t") rescue nil
      end

      #
      # Serializes the record to output.
      #
      # Emits a single line of tab-separated fields created by calling #to_flat
      # on the record and joining with "\t".
      #
      # Does no escaping or processing of the record -- that's to_flat's job, or
      # yours if you override this method.
      #
      def emit(record)
        puts record.to_flat.join("\t")
      end

      #
      # Process each record in turn, yielding the records to emit
      #
      def process(*args, &block)
        raise "override the process method in your implementation: it should process each record."
      end

      #
      # To track processing errors inline,
      # pass the line back to bad_record!
      #
      # Warns on stderr (with the inspected args truncated to 401 characters)
      # and writes a tab-separated line to stdout whose first field is
      # "bad_record-<key>". The key is interpolated, so a Symbol or nil key
      # is accepted as well as a String.
      #
      def bad_record!(key, *args)
        warn "Bad record #{args.inspect[0..400]}"
        puts ["bad_record-#{key}", *args].join("\t")
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Wukong
  module Streamer
    #
    # Reducer that, for every distinct key, yields the key together with the
    # number of records seen for it.
    #
    class CountKeys < Wukong::Streamer::AccumulatingReducer
      # Running tally of records for the key currently being accumulated.
      attr_accessor :key_count

      # The tally rendered as an integer right-aligned in a 10-character field.
      def formatted_key_count
        format("%10d", key_count.to_i)
      end

      # A new key begins: the tally goes back to zero.
      def start!(*args)
        self.key_count = 0
      end

      # Every record for the key bumps the tally by one; the record's own
      # contents are ignored.
      def accumulate(*vals)
        self.key_count = key_count + 1
      end

      # Yields a two-element record: the key and its formatted count.
      def finalize
        yield [key, formatted_key_count]
      end
    end

  end
end
@@ -0,0 +1,26 @@
1
module Wukong
  module Streamer
    #
    # For each identical line in the map phase output, emit one representative
    # line followed by the count of occurrences (separated by a tab).
    #
    # (This is the functional equivalent of +'uniq -c'+)
    #
    class CountLines < Wukong::Streamer::Base
      # Formats one output line: the item, a tab, then the count as an
      # integer right-aligned in a 10-character field.
      def formatted_count(item, key_count)
        "%s\t%10d" % [item, key_count.to_i]
      end

      #
      # Delegate to +uniq -c+, but put the count last for idempotence.
      #
      # The child process inherits this process's stdin. Its output is read
      # and re-emitted one line at a time rather than being collected into a
      # single string first, so memory use does not grow with the number of
      # distinct lines.
      #
      def stream
        IO.popen('/usr/bin/uniq -c') do |uniq_output|
          uniq_output.each_line do |line|
            # uniq -c writes "<padding><count> <item>"; split off the count.
            key_count, item = line.chomp.strip.split(/\s+/, 2)
            puts formatted_count(item, key_count)
          end
        end
      end
    end

  end
end
@@ -0,0 +1,20 @@
1
module Wukong
  module Streamer
    #
    # Pass through only the records selected by the #emit? method.
    #
    # This is a mixin: including this module in your streamer
    # implements its +#process+ method. The including class supplies
    # +emit?+, which receives the record (as an array of fields) and
    # returns whether it should be kept.
    #
    module Filter
      #
      # Yields the record to the given block when emit? approves it;
      # otherwise does nothing and returns nil.
      #
      def process(*record, &block)
        return unless emit?(record)
        yield record
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module Wukong
  module Streamer
    #
    # Streamer that treats each input line as a single, unsplit field.
    #
    class LineStreamer < Wukong::Streamer::Base
      #
      # Wraps the whole line in a one-element array, so +#process+ receives
      # the line itself as its only argument.
      #
      def recordize(line)
        Array[line]
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module Wukong
  module Streamer
    #
    # Reducer that gathers every record for a key into the +values+ array.
    #
    # Only the collecting half is implemented here: +finalize+ is inherited
    # unchanged from AccumulatingReducer, so a subclass still has to define
    # what to do with the collected +values+.
    #
    class ListReducer < Wukong::Streamer::AccumulatingReducer
      # Records collected so far for the current key, in arrival order.
      attr_accessor :values

      # A new key begins: start over with an empty list.
      def start!(*args)
        self.values = Array.new
      end

      # Append the record (as an array of its fields) to the list.
      def accumulate(*record)
        values.push(record)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Wukong
  module Streamer
    module PreprocessWithPipeStreamer
      #
      # Runs STDIN through a shell command and then begins processing.
      #
      # If you don't need to do anything to the output of the command, just
      # inherit from Wukong::Script and override the #map_command.
      #
      # You must provide a +preprocess_pipe_command+ method that returns a shell
      # command to run the input through, and an +itemize+ method that turns
      # one line of the command's output into the argument list for +process+.
      # Lines whose itemized form is blank are skipped.
      #
      # The command's entire output is captured before iteration starts.
      # Records yielded by +process+ are written with +emit+, as in
      # Base#stream.
      #
      def stream
        # String#each no longer exists on Ruby 1.9+, so iterate with each_line.
        `#{preprocess_pipe_command}`.each_line do |line|
          item = itemize(line)
          next if item.blank?
          process(*item) do |output_record|
            emit output_record
          end
        end
      end
    end
  end
end