mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env bash
2
+ # insert a tab char from the command line:
3
+ # echo "hi$(tabchar)there"
4
+ # # => "hi there"
5
+ echo -n -e '\t'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ uniq -c | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ "%15s\t" % $1 }'
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env bash
2
+
3
+ sort | uniq -c | sort -rn | ruby -ne 'puts $_.chomp.gsub(/^\s+(\d+)\s+/){ $1+"\t" }'
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ USAGE= %Q{
4
+ # h1. wulign -- format a tab-separated file as aligned columns
5
+ #
6
+ # wulign will intelligently reformat a tab-separated file into a tab-separated,
7
+ # space aligned file that is still suitable for further processing. For example,
8
+ # given the log-file input
9
+ #
10
+ # <pre><code>
11
+ # 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
12
+ # 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
13
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
14
+ # 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
15
+ # 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
16
+ # 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
17
+ # 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
18
+ # </code></pre>
19
+ #
20
+ # wulign will reformat it to read
21
+ #
22
+ # <pre><code>
23
+ # 2009-07-21T21:39:40 day 65536 3.154790000 68750 1171316
24
+ # 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
25
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
26
+ # 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
27
+ # 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
28
+ # 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
29
+ # 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
30
+ # </code></pre>
31
+ #
32
+ # The fields are still tab-delimited by exactly one tab -- only spaces are used to
33
+ # pad out fields. You can still use cuttab and friends to manipulate columns.
34
+ #
35
+ # wulign isn't intended to be smart, or correct, or reliable -- only to be
36
+ # useful for previewing and organizing tab-formatted files. In general
37
+ # @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically
38
+ # equivalent to its input. (That is, the only changes should be insertion of
39
+ # spaces and re-formatting of numerics.) But still -- reserve its use for human
40
+ # inspection only.
41
+ #
42
+ # (Note: tab characters in this source code file have been converted to spaces;
43
+ # replace whitespace with tab in the first example if you'd like to play along at
44
+ # home.)
45
+ #
46
+ # h2. How it works
47
+ #
48
+ # Wulign takes the first 500 lines, splits by TAB characters into fields, and
49
+ # tries to guess the format -- int, float, or string -- for each. It builds a
50
+ # consensus of the width and type for corresponding columns in the chunk. If a
51
+ # column has mixed numeric and string formats it degrades to :mixed, which is
52
+ # basically treated as :string. If a column has mixed :float and :int elements all
53
+ # of them are formatted as float.
54
+ #
55
+ # h2. Command-line arguments
56
+ #
57
+ # You can give sprintf-style positional arguments on the command line that will be
58
+ # applied to the corresponding columns. (Blank args are used for placeholding and
59
+ # auto-formatting is still applied). So with the example above,
60
+ #
61
+ # @cat foo | wulign '' '' '' '%8.4e'@
62
+ #
63
+ # will format the fourth column with "%8.4e", while the first three columns and
64
+ # fifth-and-higher columns are formatted as usual.
65
+ #
66
+ # <pre><code>
67
+ # ...
68
+ # 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
69
+ # 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
70
+ # 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
71
+ # ....
72
+ # </code></pre>
73
+ #
74
+ # h2. Notes
75
+ #
76
+ # * It has no knowledge of header rows. An all-text first line will screw everything up.
77
+ #
78
+ # * It also requires a unanimous vote. One screwy line can coerce the whole mess
79
+ # to :mixed; width formatting will still be applied, though.
80
+ #
81
+ # * It won't set columns wider than 70 chars -- this allows for the occasional
82
+ # super-wide column without completely breaking your screen.
83
+ #
84
+ # * For :float values, wulign tries to guess at the right number of significant
85
+ # digits to the left and right of the decimal point.
86
+ #
87
+ # * wulign does not parse 'TSV files' in their strict sense -- there is no quoting
88
+ # or escaping; every tab delimits a field, every newline a record.
89
+ }
90
+
91
+ if ARGV[0] == '--help'
92
+ puts $0
93
+ puts USAGE
94
+ exit
95
+ end
96
+
97
+ #
98
+ # How many initial lines to use to guess formatting. Lines after this are
99
+ # simply reformatted according to the consensus of the initial
100
+ # FORMAT_GUESSING_LINES.
101
+ #
102
+ FORMAT_GUESSING_LINES = 500
103
+ # widest column to set
104
+ MAX_MAX_WIDTH = 70
105
+
106
+ INT_RE = /\A\d+\z/
107
+ FLOAT_RE = /\A(\d+)(?:\.(\d+))?(?:e-?\d+)?\z/
108
+
109
# Fold one cell's apparent type into the running consensus for its column.
#
# val     -- the cell text (callers pass it already stripped)
# alltype -- consensus so far: nil (no evidence yet), :int, :float, :str
#            or :mixed
#
# Returns the updated consensus:
# * :mixed is absorbing -- once a column disagrees it stays :mixed.
# * a blank cell carries no information and leaves the consensus untouched.
# * :int and :float may share a column; the column is then :float.
# * any other disagreement degrades the column to :mixed.
def consensus_type val, alltype
  return :mixed if alltype == :mixed
  type =
    case val
    when ''       then nil
    when INT_RE   then :int
    when FLOAT_RE then :float
    else               :str
    end
  # A blank cell must not erase what earlier rows established: hand back the
  # existing consensus rather than nil.
  return alltype if type.nil?
  return type    if alltype.nil? || alltype == type
  numeric = [:int, :float]
  return :float  if numeric.include?(alltype) && numeric.include?(type)
  :mixed
end
125
+
126
# Count the digits on either side of the decimal point of a numeric string.
#
# str -- candidate cell text
#
# Returns a two-element array [integer-part digits, fractional digits].
# Any exponent suffix is matched by FLOAT_RE but not counted. When +str+
# does not match FLOAT_RE the result is [0, 0], so callers can always
# destructure the return value into two integers.
def f_width str
  match = FLOAT_RE.match(str.to_s)
  return [0, 0] unless match
  # match[2] is nil when there is no fractional part; nil.to_s is ''.
  [match[1].length, match[2].to_s.length]
end
130
+
131
#
# Main script. Reads up to FORMAT_GUESSING_LINES rows from stdin to settle on
# a width and type for every column, builds one formatter per column, then
# prints the buffered rows followed by the rest of stdin using those
# formatters. Fields are joined with a single tab on output.
#
maxw       = []  # widest cell seen per column, capped at MAX_MAX_WIDTH
col_types  = []  # per-column consensus type (see consensus_type)
col_minmag = []  # per float column: most fractional digits seen (at least 1)
col_maxmag = []  # per float column: most integer-part digits seen (at least 1)
rows       = []  # rows buffered while guessing formats
skip_col   = []  # true where a format was given on the command line

# A non-blank command-line argument is a sprintf format for the column in the
# same position; such columns are not type-sniffed.
ARGV.each_with_index do |user_fmt, i|
  next if user_fmt == ''
  maxw[i]     = 0
  skip_col[i] = true
end

FORMAT_GUESSING_LINES.times do
  # gets returns nil at end of input; genuine read errors are allowed to
  # surface instead of being mistaken for end of input.
  line = $stdin.gets
  break unless line
  cols = line.chomp.split("\t").map{|s| s.strip }
  cols.each_with_index do |col, i|
    maxw[i] = [[col.length, maxw[i]].compact.max, MAX_MAX_WIDTH].min
    next if skip_col[i]
    col_types[i] = consensus_type(col, col_types[i])
    next unless col_types[i] == :float
    mantissa, radix = f_width(col)
    # compact drops values that are not known yet; the literal 1 is the floor.
    col_minmag[i] = [radix,    col_minmag[i], 1].compact.max
    col_maxmag[i] = [mantissa, col_maxmag[i], 1].compact.max
  end
  rows << cols
end

# One formatter (a lambda taking the cell text) per column.
format = maxw.zip(col_types, col_minmag, col_maxmag, ARGV).map do |width, type, minmag, maxmag, default|
  if default.to_s != ''
    # User-supplied format: fall back to the raw cell if it cannot be applied.
    lambda do |s|
      begin
        default % s
      rescue StandardError
        s
      end
    end
  else
    case type
    when :mixed, :str, nil then lambda{|s| "%-#{width}s" % s }
    when :int              then lambda{|s| "%#{width}d" % s.to_i }
    when :float            then lambda{|s| "%#{maxmag+minmag+1}.#{minmag}f" % s.to_f }
    else raise "oops type #{type}"
    end
  end
end

# Format one row. Columns that have no formatter (a late row wider than
# anything seen while guessing) and cells whose formatter raises are emitted
# unchanged rather than aborting the stream.
format_row = lambda do |cols|
  cells = cols.zip(format).map do |cell, formatter|
    begin
      formatter ? formatter.call(cell) : cell
    rescue StandardError
      cell
    end
  end
  cells.join("\t")
end

# note -- split("\t") strips trailing empty columns
rows.each{|row| puts format_row.call(row) }
$stdin.each do |line|
  puts format_row.call(line.chomp.split("\t").map{|s| s.strip })
end
@@ -0,0 +1,30 @@
1
+ #!/usr/bin/env ruby
2
+ require 'wukong'
3
+ require 'wukong/streamer/summing_reducer'
4
+
5
#
# Reducer used by the wu-sum script.
#
# NOTE(review): +summing_elements+, +sums+ and the start!/finalize lifecycle
# come from Wukong::Streamer::SummingReducer, which is not shown here; the
# comments below describe only what this subclass itself does.
#
class Summer < Wukong::Streamer::SummingReducer
  # The argument list most recently passed to start!; finalize overwrites the
  # summed positions in it and yields it.
  attr_accessor :sample_line

  # Selects element 0 for summing, then defers to the superclass.
  def initialize *args
    self.summing_elements = [0]
    super(*args)
  end

  # Keeps the arguments start! was called with as the sample line, then
  # defers to the superclass.
  def start! *args
    self.sample_line = args
    super(*args)
  end

  # The key is the pair of fields at positions 2 and 3.
  def get_key *fields
    [fields[2], fields[3]]
  end

  # Writes each sum into the matching position of the sample line and yields
  # the result.
  def finalize
    summing_elements.each do |idx|
      sample_line[idx] = sums[idx]
    end
    yield sample_line
  end
end

Wukong::Script.new(Summer, nil).run
@@ -0,0 +1,59 @@
1
+ h1. wulign -- format a tab-separated file as aligned columns
2
+
3
+ wulign will intelligently reformat a tab-separated file into a tab-separated, space aligned file that is still suitable for further processing. For example, given the log-file input
4
+
5
+ <pre><code>
6
+ 2009-07-21T21:39:40 day 65536 3.15479 68750 1171316
7
+ 2009-07-21T21:39:45 doing 65536 1.04533 26230 1053956
8
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 0.87574e-05 23707 10051141
9
+ 2009-07-21T21:44:00 concert 500 0.29290 13367 9733414
10
+ 2009-07-21T21:44:29 world 65536 1.09110 32850 200916
11
+ 2009-07-21T21:44:39 world+series 65536 0.49380 9929 7972025
12
+ 2009-07-21T21:44:54 iranelection 65536 2.91775 14592 136342
13
+ </code></pre>
14
+
15
+ wulign will reformat it to read
16
+
17
+ <pre><code>
18
+ 2009-07-21T21:39:40 day 65536 3.154790000 68750 1171316
19
+ 2009-07-21T21:39:45 doing 65536 1.045330000 26230 1053956
20
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 0.000008757 23707 10051141
21
+ 2009-07-21T21:44:00 concert 500 0.292900000 13367 9733414
22
+ 2009-07-21T21:44:29 world 65536 1.091100000 32850 200916
23
+ 2009-07-21T21:44:39 world+series 65536 0.493800000 9929 7972025
24
+ 2009-07-21T21:44:54 iranelection 65536 2.917750000 14592 136342
25
+ </code></pre>
26
+
27
+ The fields are still tab-delimited by exactly one tab -- only spaces are used to pad out fields. You can still use cuttab and friends to manipulate columns.
28
+
29
+ wulign isn't intended to be smart, or correct, or reliable -- only to be useful for previewing and organizing tab-formatted files. In general @wulign(foo).split("\t").map(&:strip)@ *should* give output semantically equivalent to its input. (That is, the only changes should be insertion of spaces and re-formatting of numerics.) But still -- reserve its use for human inspection only.
30
+
31
+ (Note: tab characters in this source code file have been converted to spaces; replace whitespace with tab in the first example if you'd like to play along at home.)
32
+
33
+ h2. How it works
34
+
35
+ Wulign takes the first 500 lines, splits by TAB characters into fields, and tries to guess the format -- int, float, or string -- for each. It builds a consensus of the width and type for corresponding columns in the chunk. If a column has mixed numeric and string formats it degrades to :mixed, which is basically treated as :string. If a column has mixed :float and :int elements all of them are formatted as float.
36
+
37
+ h2. Command-line arguments
38
+
39
+ You can give sprintf-style positional arguments on the command line that will be applied to the corresponding columns. (Blank args are used for placeholding and auto-formatting is still applied). So with the example above,
40
+
41
+ @cat foo | wulign '' '' '' '%8.4e'@
42
+
43
+ will format the fourth column with "%8.4e", while the first three columns and fifth-and-higher columns are formatted as usual.
44
+
45
+ <pre><code>
46
+ ...
47
+ 2009-07-21T21:39:45 doing 65536 1.0453e+00 26230 1053956
48
+ 2009-07-21T21:41:53 hapaxlegomenon 65536 8.7574e-06 23707 10051141
49
+ 2009-07-21T21:44:00 concert 500 2.9290e-01 13367 9733414
50
+ ....
51
+ </code></pre>
52
+
53
+ h2. Notes
54
+
55
+ * It has no knowledge of header rows. An all-text first line will screw everything up.
56
+ * It also requires a unanimous vote. One screwy line can coerce the whole mess to :mixed; width formatting will still be applied, though.
57
+ * It won't set columns wider than 70 chars -- this allows for the occasional super-wide column without completely breaking your screen.
58
+ * For :float values, wulign tries to guess at the right number of significant digits to the left and right of the decimal point.
59
+ * wulign does not parse 'TSV files' in their strict sense -- there is no quoting or escaping; every tab delimits a field, every newline a record.
@@ -0,0 +1,128 @@
1
+ h1. Wukong Utility Scripts
2
+
3
+ h2. Stupid command-line tricks
4
+
5
+ h3. Histogram
6
+
7
+ Given data with a date column:
8
+
9
+ message 235623 20090423012345 Now is the winter of our discontent Made glorious summer by this son of York
10
+ message 235623 20080101230900 These pretzels are making me THIRSTY!
11
+ ...
12
+
13
+ You can calculate number of messages sent by day with
14
+
15
+ cat messages | cuttab 3 | cutc 8 | sort | uniq -c
16
+
17
+ (see the wuhist command, below.)
18
+
19
+ h3. Simple intersection, union, etc
20
+
21
+ For two datasets (batch_1 and batch_2) with unique entries (no repeated lines),
22
+
23
+ * Their union is simple:
24
+
25
+ cat batch_1 batch_2 | sort -u
26
+
27
+
28
+ * Their intersection:
29
+
30
+ cat batch_1 batch_2 | sort | uniq -c | egrep -v '^ *1 '
31
+
32
+ This concatenates the two sets and filters out everything that only occurred once.
33
+
34
+ * For the complement of the intersection, use "... | egrep '^ *1 '"
35
+
36
+ * In both cases, if the files are each internally sorted, the commandline sort takes a --merge flag:
37
+
38
+ sort --merge -u batch_1 batch_2
39
+
40
+ h2. Command Listing
41
+
42
+ h3. cutc
43
+
44
+ @cutc [colnum]@
45
+
46
+ Ex.
47
+
48
+ echo -e 'foo\tbar\tbaz' | cutc 6
49
+ foo ba
50
+
51
+ Cuts from beginning of line to given column (default 200). A tab is one character, so right margin can still be ragged.
52
+
53
+ h3. cuttab
54
+
55
+ @cuttab [colspec]@
56
+
57
+ Cuts given tab-separated columns. You can give a comma separated list of numbers
58
+ or ranges (e.g. 1-4). Columns are numbered from 1.
59
+
60
+ Ex.
61
+
62
+ echo -e 'foo\tbar\tbaz' | cuttab 1,3
63
+ foo baz
64
+
65
+ h3. hdp-*
66
+
67
+ These perform the corresponding commands on the HDFS filesystem. In general,
68
+ where they accept command-line flags, they go with the GNU-style ones, not the
69
+ hadoop-style: so, @hdp-du -s dir@ or @hdp-rm -r foo/@
70
+
71
+ * @hdp-cat@
72
+ * @hdp-catd@ -- cats the files that don't start with '_' in a directory. Use this for a pile of @.../part-00000@ files
73
+ * @hdp-du@
74
+ * @hdp-get@
75
+ * @hdp-kill@
76
+ * @hdp-ls@
77
+ * @hdp-mkdir@
78
+ * @hdp-mv@
79
+ * @hdp-ps@
80
+ * @hdp-put@
81
+ * @hdp-rm@
82
+ * @hdp-sync@
83
+
84
+ h3. hdp-sort, hdp-stream, hdp-stream-flat
85
+
86
+ * @hdp-sort@
87
+ * @hdp-stream@
88
+ * @hdp-stream-flat@
89
+
90
+ <pre><code>
91
+ hdp-stream input_filespec output_file map_cmd reduce_cmd num_key_fields
92
+ </code></pre>
93
+
94
+ h3. tabchar
95
+
96
+ Outputs a single tab character.
97
+
98
+ h3. wuhist
99
+
100
+ Occasionally useful to gather a lexical histogram of a single column:
101
+
102
+ Ex.
103
+
104
+ <pre><code>
105
+ $ echo -e 'foo\nbar\nbar\nfoo\nfoo\nfoo\n7' | ./wuhist
106
+ 4 foo
107
+ 2 bar
108
+ 1 7
109
+ </code></pre>
110
+
111
+ (The output will have a tab between the first and second column, for further processing.)
112
+
113
+ h3. wulign
114
+
115
+ Intelligently format a tab-separated file into aligned columns (while remaining tab-separated for further processing). See README-wulign.textile.
116
+
117
+ h3. hdp-parts_to_keys.rb
118
+
119
+ A *very* clumsy script to rename reduced hadoop output files by their initial key.
120
+
121
+ If your output file has an initial key in the first column and you pass it
122
+ through hdp-sort, they will be distributed across reducers and thus output
123
+ files. (Because of the way hadoop hashes the keys, there's no guarantee that
124
+ each file will get a distinct key. You could have 2 keys with a million entries
125
+ and they could land sequentially on the same reducer, always fun.)
126
+
127
+ If you're willing to roll the dice, this script will rename files according to
128
+ the first key in the first line.
@@ -0,0 +1,2 @@
1
+ h1. Using Wukong and Wuclan, Part 1 - Setup
2
+
@@ -0,0 +1,2 @@
1
+ h1. Using Wukong and Wuclan, Part 2 - Scraping
2
+
@@ -0,0 +1,132 @@
1
+ h1. Using Wukong and Wuclan, Part 3 - Parsing
2
+
3
+ In part 2 we began a scraper to trawl our desired part of the social web. Now
4
+ we're ready to start using Wukong to process the files.
5
+
6
+ Files come off the wire as
7
+
8
+ :url :scraped_at :response_code :response_message :contents
9
+ String DateTime (flat) Integer String String (JSON-formatted, tab&newline-munged)
10
+
11
+ The contents field is a JSON-formatted mix of records:
12
+
13
+ * TwitterFollowersRequest and TwitterFriendsRequest yield an @Array[Hash{user => raw_tweet}]@. We want to extract a stream of AFollowsB (with the request user as user_a for a friends request and user_b for a followers request) along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
14
+ * TwitterFavoritesRequest yields an @Array[Hash{tweet_hash => user_hash}]@. We want to extract a stream of AFavoritesB along with the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
15
+ * TwitterUser yields a single @user_hash@ making one each of TwitterUser, TwitterUserProfile and TwitterUserStyle.
16
+ * UserTimelineRequest and PublicTimelineRequest yield an Array[Hash{tweet => user}]. We want to extract the included Tweet, TwitterUser, TwitterUserProfile and TwitterUserStyle records.
17
+ * TwitterFollowerIdsRequest and TwitterFriendIdsRequest return an Array[user_ids] (each user_id is a simple Integer). We extract a series of AFollowsB (using the request's user_id as user_a_id or user_b_id)
18
+
19
+ We want to split each API response into a stream of those TwitterUser, Tweet, etc. records.
20
+
21
+ # Stream in each line (each line holds one request)
22
+ # turn the line into the corresponding TwitterRequest
23
+ # have the TwitterRequest parse its JSON contents and construct the TwitterUser, Tweet, etc.
24
+ # serialize those records back out as tab-separated lines suitable for further processing with Wukong
25
+
26
+ h4. The basics of StructStreamer
27
+
28
+ Wukong handles the first and last steps through its StructStreamer and the standard .to_flat method. So the actual structure is really simple:
29
+
30
+ #
31
+ # Instantiate each incoming request.
32
+ # Stream out the contained classes it generates.
33
+ #
34
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
35
+ def process request
36
+ request.parse do |obj|
37
+ yield obj
38
+ end
39
+ end
40
+ end
41
+
42
+ # This makes the script go.
43
+ Wukong::Script.new(TwitterRequestParser, nil).run
44
+
45
+ In practice, all you need to know is that a StructStreamer gets a stream of objects to parse. Here's an outline of its internals. The Wukong StructStreamer:
46
+
47
+ # takes each flattened line:
48
+
49
+ "twitter_friends_request http://.... 20090701123456 ...fields... [{...}, {...}, ...json..., {...}]"
50
+
51
+ # splits by tabs to create an array of fields
52
+
53
+ ["twitter_friends_request", "http://...", ... "[{...}, {...}, ...json..., {...}]"]
54
+
55
+ # constructs the class name indicated in the first field,
56
+ using the values extracted from the remaining fields.
57
+
58
+ TwitterFriendsRequest.new "http://...", "20090701123456", ... "[{...}, {...}, ...json..., {...}]"
59
+
60
+ The last (contents) field is still just a string: there's nothing special about it to Wukong.
61
+
62
+ h4. Parsing
63
+
64
+ Since each request's contents are handled in a slightly (and brittle-ly) different manner, we just ask each request object to parse itself and feed out all the TwitterXXXX objects it generates.
65
+
66
+ class TwitterFollowersRequest
67
+ # ...
68
+
69
+ def parse &block
70
+ return unless healthy?
71
+ # for each raw user/tweet pair in the parsed JSON contents,
72
+ parsed_contents.each do |hsh|
73
+ json_obj = JsonUserWithTweet.new(hsh, 'scraped_at' => scraped_at)
74
+ next unless json_obj && json_obj.healthy?
75
+ # Extract user, tweet and relationship
76
+ yield AFollowsB.new(json_obj.user.id, self.twitter_user_id) if json_obj.user
77
+ json_obj.each(&block)
78
+ end
79
+ end
80
+
81
+ # ...
82
+ end
83
+
84
+ The TwitterXXXRequest objects consist of one or many hashes with (a raw user hash, and possibly its latest raw tweet hash) or (a raw tweet hash and its raw user hash). The user hash might have only the fields for a TwitterPartialUser or it might have the fields for a full set of TwitterUser, TwitterUserProfile, TwitterUserStyle. Besides which, the fields themselves need some massaging to be compatible with Wukong and other tools in our Map/Reduce toolkit (details explained in a later section).
85
+
86
+ The fiddly little details are handled by a JsonUserWithTweet or JsonTweetWithUser (as appropriate) adapter pattern:
87
+
88
+ class JsonUserTweetPair
89
+ def initialize raw, moreinfo
90
+ # clean up fields in entries (flatten date, true/false -> 1/0, etc)
91
+ fix_raw_user!
92
+ fix_raw_tweet!
93
+ end
94
+
95
+ # generate all the contained TwitterXXX objects
96
+ def each
97
+ #
98
+ end
99
+
100
+ # create TwitterUser object from raw info
101
+ def user
102
+ end
103
+ # create Tweet object from raw tweet hash
104
+ def tweet
105
+ end
106
+ # ... and so forth
107
+ end
108
+
109
+ I'll ignore the gory details; view the source if you're interested.
110
+
111
+
112
+ h4. Running the script
113
+
114
+ Here, again, is the code (in full!) for the twitter_request_parser.rb script.
115
+
116
+ #
117
+ # Instantiate each incoming request.
118
+ # Stream out the contained classes it generates.
119
+ #
120
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
121
+ def process request
122
+ request.parse do |obj|
123
+ yield obj
124
+ end
125
+ end
126
+ end
127
+
128
+ # This makes the script go.
129
+ Wukong::Script.new(TwitterRequestParser, nil).run
130
+
131
+ That last line is the runner: it makes this a Wukong script with a map phase only. (We'll add in a reducer later on.)
132
+