wukong 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. data/LICENSE.textile +107 -0
  2. data/README.textile +166 -0
  3. data/bin/cutc +30 -0
  4. data/bin/cuttab +5 -0
  5. data/bin/greptrue +8 -0
  6. data/bin/hdp-cat +3 -0
  7. data/bin/hdp-catd +3 -0
  8. data/bin/hdp-du +81 -0
  9. data/bin/hdp-get +3 -0
  10. data/bin/hdp-kill +3 -0
  11. data/bin/hdp-ls +10 -0
  12. data/bin/hdp-mkdir +3 -0
  13. data/bin/hdp-mv +3 -0
  14. data/bin/hdp-parts_to_keys.rb +77 -0
  15. data/bin/hdp-ps +3 -0
  16. data/bin/hdp-put +3 -0
  17. data/bin/hdp-rm +11 -0
  18. data/bin/hdp-sort +29 -0
  19. data/bin/hdp-stream +29 -0
  20. data/bin/hdp-stream-flat +18 -0
  21. data/bin/hdp-sync +17 -0
  22. data/bin/hdp-wc +67 -0
  23. data/bin/md5sort +20 -0
  24. data/bin/tabchar +5 -0
  25. data/bin/uniqc +3 -0
  26. data/bin/wu-hist +3 -0
  27. data/bin/wu-lign +177 -0
  28. data/bin/wu-sum +30 -0
  29. data/doc/INSTALL.textile +41 -0
  30. data/doc/LICENSE.textile +107 -0
  31. data/doc/README-tutorial.textile +163 -0
  32. data/doc/README-wulign.textile +59 -0
  33. data/doc/README-wutils.textile +128 -0
  34. data/doc/TODO.textile +61 -0
  35. data/doc/UsingWukong-part1-setup.textile +2 -0
  36. data/doc/UsingWukong-part2-scraping.textile +2 -0
  37. data/doc/UsingWukong-part3-parsing.textile +132 -0
  38. data/doc/code/api_response_example.txt +20 -0
  39. data/doc/code/parser_skeleton.rb +38 -0
  40. data/doc/hadoop-nfs.textile +51 -0
  41. data/doc/hadoop-setup.textile +29 -0
  42. data/doc/index.textile +124 -0
  43. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  44. data/doc/links.textile +42 -0
  45. data/doc/overview.textile +91 -0
  46. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  47. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  48. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  49. data/doc/tips.textile +116 -0
  50. data/doc/usage.textile +102 -0
  51. data/doc/utils.textile +48 -0
  52. data/examples/README.txt +17 -0
  53. data/examples/and_pig/sample_queries.rb +128 -0
  54. data/examples/apache_log_parser.rb +53 -0
  55. data/examples/count_keys.rb +56 -0
  56. data/examples/count_keys_at_mapper.rb +57 -0
  57. data/examples/graph/adjacency_list.rb +74 -0
  58. data/examples/graph/breadth_first_search.rb +79 -0
  59. data/examples/graph/gen_2paths.rb +68 -0
  60. data/examples/graph/gen_multi_edge.rb +103 -0
  61. data/examples/graph/gen_symmetric_links.rb +53 -0
  62. data/examples/package-local.rb +100 -0
  63. data/examples/package.rb +96 -0
  64. data/examples/pagerank/README.textile +6 -0
  65. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  66. data/examples/pagerank/pagerank.rb +88 -0
  67. data/examples/pagerank/pagerank_initialize.rb +46 -0
  68. data/examples/pagerank/run_pagerank.sh +19 -0
  69. data/examples/rank_and_bin.rb +173 -0
  70. data/examples/run_all.sh +47 -0
  71. data/examples/sample_records.rb +44 -0
  72. data/examples/size.rb +60 -0
  73. data/examples/word_count.rb +95 -0
  74. data/lib/wukong.rb +11 -0
  75. data/lib/wukong/and_pig.rb +62 -0
  76. data/lib/wukong/and_pig/README.textile +12 -0
  77. data/lib/wukong/and_pig/as.rb +37 -0
  78. data/lib/wukong/and_pig/data_types.rb +30 -0
  79. data/lib/wukong/and_pig/functions.rb +50 -0
  80. data/lib/wukong/and_pig/generate.rb +85 -0
  81. data/lib/wukong/and_pig/generate/variable_inflections.rb +82 -0
  82. data/lib/wukong/and_pig/junk.rb +51 -0
  83. data/lib/wukong/and_pig/operators.rb +8 -0
  84. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  85. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  86. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  87. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  88. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  89. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  90. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  91. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  92. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  93. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  94. data/lib/wukong/and_pig/pig_var.rb +95 -0
  95. data/lib/wukong/and_pig/symbol.rb +29 -0
  96. data/lib/wukong/and_pig/utils.rb +0 -0
  97. data/lib/wukong/bad_record.rb +18 -0
  98. data/lib/wukong/boot.rb +47 -0
  99. data/lib/wukong/datatypes.rb +24 -0
  100. data/lib/wukong/datatypes/enum.rb +123 -0
  101. data/lib/wukong/dfs.rb +80 -0
  102. data/lib/wukong/encoding.rb +111 -0
  103. data/lib/wukong/extensions.rb +15 -0
  104. data/lib/wukong/extensions/array.rb +18 -0
  105. data/lib/wukong/extensions/blank.rb +93 -0
  106. data/lib/wukong/extensions/class.rb +189 -0
  107. data/lib/wukong/extensions/date_time.rb +24 -0
  108. data/lib/wukong/extensions/emittable.rb +82 -0
  109. data/lib/wukong/extensions/hash.rb +120 -0
  110. data/lib/wukong/extensions/hash_like.rb +119 -0
  111. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  112. data/lib/wukong/extensions/module.rb +2 -0
  113. data/lib/wukong/extensions/pathname.rb +27 -0
  114. data/lib/wukong/extensions/string.rb +65 -0
  115. data/lib/wukong/extensions/struct.rb +17 -0
  116. data/lib/wukong/extensions/symbol.rb +11 -0
  117. data/lib/wukong/logger.rb +53 -0
  118. data/lib/wukong/models/graph.rb +27 -0
  119. data/lib/wukong/rdf.rb +104 -0
  120. data/lib/wukong/schema.rb +37 -0
  121. data/lib/wukong/script.rb +265 -0
  122. data/lib/wukong/script/hadoop_command.rb +111 -0
  123. data/lib/wukong/script/local_command.rb +14 -0
  124. data/lib/wukong/streamer.rb +13 -0
  125. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  126. data/lib/wukong/streamer/base.rb +76 -0
  127. data/lib/wukong/streamer/count_keys.rb +30 -0
  128. data/lib/wukong/streamer/count_lines.rb +26 -0
  129. data/lib/wukong/streamer/filter.rb +20 -0
  130. data/lib/wukong/streamer/line_streamer.rb +12 -0
  131. data/lib/wukong/streamer/list_reducer.rb +20 -0
  132. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  133. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  134. data/lib/wukong/streamer/set_reducer.rb +14 -0
  135. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  136. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  137. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  138. data/lib/wukong/typed_struct.rb +12 -0
  139. data/lib/wukong/wukong_class.rb +21 -0
  140. data/spec/bin/hdp-wc_spec.rb +4 -0
  141. data/spec/spec_helper.rb +0 -0
  142. data/wukong.gemspec +179 -0
  143. metadata +214 -0
@@ -0,0 +1,111 @@
1
+ # -*- coding: utf-8 -*-
2
module Wukong
  #
  # Assembles the shell command line used to launch a Hadoop streaming job.
  #
  # The methods here call +options+, +map_command+ and +reduce_command+ on
  # the object this module is mixed into; none of them is defined here.
  #
  module HadoopCommand

    # ===========================================================================
    #
    # Hadoop Options
    #

    #
    # Lookup table from the short, friendly option names to the full hadoop
    # property names passed with -jobconf.
    #
    HADOOP_OPTIONS_MAP = {
      :max_node_map_tasks     => 'mapred.tasktracker.map.tasks.maximum',
      :max_node_reduce_tasks  => 'mapred.tasktracker.reduce.tasks.maximum',
      :map_tasks              => 'mapred.map.tasks',
      :reduce_tasks           => 'mapred.reduce.tasks',
      :sort_fields            => 'stream.num.map.output.key.fields',
      :key_field_separator    => 'map.output.key.field.separator',
      :partition_fields       => 'num.key.fields.for.partition',
      :output_field_separator => 'stream.map.output.field.separator',
      :map_speculative        => 'mapred.map.tasks.speculative.execution',
      :timeout                => 'mapred.task.timeout',
    }

    #
    # Returns "-jobconf <hadoop property>=<value>" when +option+ has a truthy
    # value in +options+, and nil otherwise. The nils are stripped when the
    # final command is assembled in #hadoop_command.
    #
    def jobconf(option)
      value = options[option]
      return unless value
      "-jobconf #{HADOOP_OPTIONS_MAP[option]}=#{value}"
    end

    # Args telling hadoop which fields make up the sort key.
    def hadoop_sort_args
      [:key_field_separator, :sort_fields].map { |opt| jobconf(opt) }
    end

    #
    # Args telling hadoop which fields decide the reducer a record goes to.
    # Returns nil unless the :partition_fields option is set.
    #
    def hadoop_partition_args
      return unless options[:partition_fields]
      partitioner = '-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner'
      [partitioner, jobconf(:output_field_separator), jobconf(:partition_fields)]
    end

    # Args controlling how many map and reduce tasks run.
    def hadoop_num_tasks_args
      [:max_node_map_tasks, :max_node_reduce_tasks, :map_tasks, :reduce_tasks].map do |opt|
        jobconf(opt)
      end
    end

    # Whatever was given as :extra_args, followed by the remaining -jobconf settings.
    def hadoop_other_args
      [options[:extra_args], jobconf(:map_speculative), jobconf(:timeout)]
    end

    #
    # Builds the complete hadoop streaming command for the given input and
    # output paths, as one string with each argument on its own
    # backslash-continued line.
    #
    # The hadoop location comes from Wukong::CONFIG[:hadoop_home]; if it is
    # wrong, create a config/wukong-site.rb or otherwise set that value to the
    # root of your hadoop install.
    #
    def hadoop_command(input_path, output_path)
      hadoop_home = Wukong::CONFIG[:hadoop_home]
      pieces = []
      pieces << (hadoop_home + '/bin/hadoop')
      pieces << "jar #{hadoop_home}/contrib/streaming/hadoop-*-streaming.jar"
      pieces << hadoop_partition_args
      pieces << hadoop_sort_args
      pieces << hadoop_num_tasks_args
      pieces << "-mapper  '#{map_command}'".sub('-mapper  ', '-mapper ')
      pieces << "-reducer '#{reduce_command}'"
      pieces << "-input '#{input_path}'"
      pieces << "-output '#{output_path}'"
      pieces << hadoop_other_args
      # nested arrays are flattened and unset options (nil) dropped
      pieces.flatten.compact.join(" \t\\\n ")
    end

  end
end
93
+
94
+
95
+ # -inputformat <name of inputformat (class)> (“auto” by default)
96
+ # -input <additional DFS input path>
97
+ # -python <python command to use on nodes> (“python” by default)
98
+ # -name <job name> (“program.py” by default)
99
+ # -numMapTasks <number>
100
+ # -numReduceTasks <number> (no sorting or reducing will take place if this is 0)
101
+ # -priority <priority value> (“NORMAL” by default)
102
+ # -libjar <path to jar> (this jar gets put in the class path)
103
+ # -libegg <path to egg> (this egg gets put in the Python path)
104
+ # -file <local file> (this file will be put in the dir where the python program gets executed)
105
+ # -cacheFile hdfs://<host>:<fs_port>/<path to file>#<link name> (a link ”<link name>” to the given file will be in the dir)
106
+ # -cacheArchive hdfs://<host>:<fs_port>/<path to jar>#<link name> (link points to dir that contains files from given jar)
107
+ # -cmdenv <env var name>=<value>
108
+ # -jobconf <property name>=<value>
109
+ # -addpath yes (replace each input key by a tuple consisting of the path of the corresponding input file and the original key)
110
+ # -fake yes (fake run, only prints the underlying shell commands but does not actually execute them)
111
+ # -memlimit <number of bytes> (set an upper limit on the amount of memory that can be used)
@@ -0,0 +1,14 @@
1
module Wukong
  #
  # Builds the shell pipeline that runs a job on the local machine.
  #
  # Calls +map_command+ and +reduce_command+ on the object this module is
  # mixed into; neither is defined here.
  #
  module LocalCommand

    # ===========================================================================
    #
    # Local execution Options
    #

    #
    # Returns a shell pipeline string: cat the input, run it through the map
    # command, sort, run it through the reduce command and redirect into
    # +output_path+. Only the output path is single-quoted; +input_path+ is
    # inserted as given.
    #
    def local_command(input_path, output_path)
      pipeline = "cat #{input_path} | #{map_command} | sort | #{reduce_command}"
      " #{pipeline} > '#{output_path}'"
    end

  end
end
@@ -0,0 +1,13 @@
1
module Wukong
  #
  # Namespace for the streamer classes. Each constant listed here is
  # registered with autoload, so its file is only required the first time the
  # constant is referenced.
  #
  module Streamer
    {
      # core streamers
      :Base                => 'wukong/streamer/base',
      :LineStreamer        => 'wukong/streamer/line_streamer',
      :StructStreamer      => 'wukong/streamer/struct_streamer',
      # mixins
      :Filter              => 'wukong/streamer/filter',
      # reducers
      :AccumulatingReducer => 'wukong/streamer/accumulating_reducer',
      :ListReducer         => 'wukong/streamer/list_reducer',
      :UniqByLastReducer   => 'wukong/streamer/uniq_by_last_reducer',
    }.each do |const_name, path|
      autoload const_name, path
    end
  end
end
@@ -0,0 +1,89 @@
1
module Wukong
  module Streamer

    #
    # AccumulatingReducer makes it easy to apply one operation across all
    # occurrences of each key.
    #
    # It relies on records with the same key arriving consecutively: a key
    # change is what triggers finalization of the previous group.
    #
    # On the first record of each key it calls +start!+, on every record
    # (including the first) it calls +accumulate+, and once the key changes --
    # or the input ends -- it calls +finalize+.
    #
    # See ListReducer and CountKeys for examples.
    #
    # Make sure you don't have the bad luck, bad judgement or bad approach to
    # accumulate more data than your box can hold before finalizing.
    #
    class AccumulatingReducer < Wukong::Streamer::Base
      # The key of the group currently being accumulated, or :__first_pass__
      # before any record has been seen.
      attr_accessor :key

      #
      # +options+ is handed to Wukong::Streamer::Base unchanged; it defaults
      # to an empty hash, matching Base.
      #
      def initialize(options = {})
        super(options)
        self.key = :__first_pass__
      end

      #
      # Override for multiple-field keys, etc.
      #
      # Note that get_key is called by +process+ -- so the arguments have
      # already been +recordize+d. In particular, if you are using
      # StructRecordizer (or StructStreamer), you can write this as
      #
      #   def get_key(thing) thing.id.to_i ; end
      #
      # or whatever. The default uses the first field of the record.
      #
      def get_key(*record)
        record.first
      end

      #
      # Accumulate all records for a given key.
      #
      # When a record with a different key arrives, the previous group is
      # finalized (yielding its output records to +block+), the new key is
      # adopted and +start!+ is called with the first record of the new group.
      # The current record is then passed to +accumulate+.
      #
      def process(*args, &block)
        this_key = get_key(*args)
        if this_key != key                 # a new key begins
          # nothing has been collected yet on the very first record
          finalize(&block) unless key == :__first_pass__
          self.key = this_key              # adopt the new key
          start!(*args)                    # and set up for the next accumulation
        end
        # collect the current record
        accumulate(*args, &block)
      end

      #
      # start! is called on the first record of each new key. Subclasses must
      # override it (without calling super) to reset their accumulator.
      #
      def start!(*args)
        raise "start! is the new reset! -- it has args now, namely the first " \
              "record of the new key. It doesn't want #super either"
      end

      #
      # Override this to accumulate each record for the given key in turn.
      #
      def accumulate(*args, &block)
        raise "override the accumulate method in your subclass"
      end

      #
      # You must override this method. It is called with a block; yield each
      # output record for the group that just ended.
      #
      def finalize
        raise "override the finalize method in your subclass"
      end

      #
      # Runs the normal streaming loop, then finalizes the last-seen
      # accumulation. If the input held no records at all there is no group to
      # finalize, so nothing is emitted.
      #
      def stream
        super
        return if key == :__first_pass__
        finalize { |record| emit record }
      end
    end

  end
end
@@ -0,0 +1,76 @@
1
module Wukong
  module Streamer
    #
    # Base class for streamers: reads lines from $stdin, turns each into a
    # record, hands it to +process+ and emits whatever +process+ yields.
    #
    # Subclasses must override +process+; +recordize+, +emit+,
    # +before_stream+ and +after_stream+ are optional extension points.
    #
    class Base

      # Options, initially set from the command-line args -- see
      # Script#process_argv!
      attr_accessor :options

      #
      # Accepts option hash from script runner
      #
      def initialize(options = {})
        self.options = options
      end

      #
      # Pass each record to +#process+
      #
      # Lines are chomped before being recordized; lines for which
      # +recordize+ returns nil (or false) are skipped. Every record yielded
      # by +process+ is passed to +emit+.
      #
      def stream
        before_stream
        $stdin.each do |line|
          record = recordize(line.chomp)
          next unless record
          process(*record) do |output_record|
            emit output_record
          end
        end
        after_stream
      end

      # Called exactly once, before streaming begins
      def before_stream
      end

      # Called exactly once, after streaming completes
      def after_stream
      end

      #
      # Default recordizer: returns array of fields by splitting at tabs.
      #
      # Returns nil if the line cannot be split (any StandardError), which
      # makes +stream+ skip it.
      #
      def recordize(line)
        line.split("\t")
      rescue StandardError
        nil
      end

      #
      # Serializes the record to output.
      #
      # Emits a single line of tab-separated fields created by calling #to_flat
      # on the record and joining with "\t".
      #
      # Does no escaping or processing of the record -- that's to_flat's job, or
      # yours if you override this method.
      #
      # NOTE(review): #to_flat is not defined in this file; presumably it comes
      # from the wukong extensions -- confirm.
      #
      def emit(record)
        puts record.to_flat.join("\t")
      end

      #
      # Process each record in turn, yielding the records to emit
      #
      # Raises unless overridden.
      #
      def process(*args, &block)
        raise "override the process method in your implementation: it should process each record."
      end

      #
      # To track processing errors inline,
      # pass the line back to bad_record!
      #
      # Warns on stderr (the inspected args, truncated to 401 characters) and
      # writes a tab-separated line starting with "bad_record-<key>" to
      # stdout. +key+ may be anything that responds to #to_s (String,
      # Symbol, ...).
      #
      def bad_record!(key, *args)
        warn "Bad record #{args.inspect[0..400]}"
        # interpolate rather than '+' so non-String keys don't raise TypeError
        puts ["bad_record-#{key}", *args].join("\t")
      end
    end
  end
end
@@ -0,0 +1,30 @@
1
module Wukong
  module Streamer
    #
    # Reducer that emits, for every distinct key, the key followed by the
    # number of records seen with it.
    #
    class CountKeys < Wukong::Streamer::AccumulatingReducer
      # Running tally for the key currently being accumulated.
      attr_accessor :key_count

      # The tally as an integer, right-aligned in a 10-character column.
      def formatted_key_count
        format("%10d", key_count.to_i)
      end

      # A new key begins: restart the tally from zero.
      def start!(*args)
        self.key_count = 0
      end

      # One more record for the current key.
      def accumulate(*vals)
        self.key_count = key_count + 1
      end

      # Yields a [key, formatted count] pair for the group that just ended.
      def finalize
        yield [key, formatted_key_count]
      end
    end

  end
end
@@ -0,0 +1,26 @@
1
module Wukong
  module Streamer
    #
    # For each run of identical lines in the map phase output, emit one
    # representative line followed by the count of occurrences (separated by
    # a tab).
    #
    # (This is the functional equivalent of +'uniq -c'+)
    #
    class CountLines < Wukong::Streamer::Base
      # The item, a tab, then the count as an integer in a 10-character column.
      def formatted_count(item, key_count)
        format("%s\t%10d", item, key_count.to_i)
      end

      #
      # Hands the counting to +/usr/bin/uniq -c+, then rewrites each of its
      # output lines so that the count comes last instead of first.
      #
      def stream
        uniq_output = %x{/usr/bin/uniq -c}
        uniq_output.split("\n").each do |line|
          # uniq -c prints "<padding><count> <line>": peel the count off the front
          key_count, item = line.chomp.strip.split(/\s+/, 2)
          puts formatted_count(item, key_count)
        end
      end
    end

  end
end
@@ -0,0 +1,20 @@
1
module Wukong
  module Streamer
    #
    # Passes through only the records selected by the #emit? method.
    #
    # This is a mixin: including this module in your streamer
    # implements its +#process+ method. The including class supplies
    # +emit?+, which is not defined here.
    #
    module Filter
      #
      # Yields the record (as an array of its fields) when +emit?+ returns
      # a truthy value for it; otherwise does nothing and returns nil.
      #
      # Define the emit? method in the class that includes this module.
      #
      def process(*record, &block)
        return unless emit?(record)
        yield record
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
module Wukong
  module Streamer
    #
    # Streamer that hands each input line to +#process+ whole, without
    # splitting it into fields.
    #
    class LineStreamer < Wukong::Streamer::Base
      #
      # Wraps the line in a one-element array, so +#process+ receives the
      # entire line as its single argument.
      #
      def recordize(line)
        [line]
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module Wukong
  module Streamer
    #
    # Collects every record seen for a key into a list.
    #
    # This class does not define +finalize+, so the one from
    # AccumulatingReducer applies unless a subclass overrides it.
    #
    class ListReducer < Wukong::Streamer::AccumulatingReducer
      # The records gathered so far for the current key.
      attr_accessor :values

      # A new key begins: start over with an empty list.
      def start!(*args)
        self.values = []
      end

      # Append this record (as an array of its fields) to the list.
      def accumulate(*record)
        values.push(record)
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
module Wukong
  module Streamer
    #
    # Mixin whose +stream+ runs STDIN through a shell command and then
    # processes the command's output line by line.
    #
    # If you don't need to do anything to the output of the command, just
    # inherit from Wukong::Script and override the #map_command.
    #
    # You must provide a +preprocess_pipe_command+ method that returns a shell
    # command to run the input through.
    #
    # NOTE(review): +itemize+ and +blank?+ are also called but not defined in
    # this file; presumably the including class and the wukong extensions
    # supply them -- confirm. +process+ is called without a block here, so
    # anything it tries to yield has nowhere to go.
    #
    module PreprocessWithPipeStreamer
      #
      # Runs the preprocess command to completion, then itemizes each line of
      # its output and passes every non-blank item to +process+.
      #
      def stream
        # String#each no longer exists as of Ruby 1.9; each_line is the
        # equivalent that works on old and new rubies alike.
        `#{preprocess_pipe_command}`.each_line do |line|
          item = itemize(line)
          next if item.blank?
          process(*item)
        end
      end
    end
  end
end