mrflip-wukong 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (137) hide show
  1. data/LICENSE.txt +202 -0
  2. data/README-tutorial.textile +163 -0
  3. data/README.textile +165 -0
  4. data/bin/cutc +30 -0
  5. data/bin/cuttab +5 -0
  6. data/bin/greptrue +8 -0
  7. data/bin/hdp-cat +3 -0
  8. data/bin/hdp-catd +3 -0
  9. data/bin/hdp-du +81 -0
  10. data/bin/hdp-get +3 -0
  11. data/bin/hdp-kill +3 -0
  12. data/bin/hdp-ls +10 -0
  13. data/bin/hdp-mkdir +3 -0
  14. data/bin/hdp-mv +3 -0
  15. data/bin/hdp-parts_to_keys.rb +77 -0
  16. data/bin/hdp-ps +3 -0
  17. data/bin/hdp-put +3 -0
  18. data/bin/hdp-rm +11 -0
  19. data/bin/hdp-sort +29 -0
  20. data/bin/hdp-stream +29 -0
  21. data/bin/hdp-stream-flat +18 -0
  22. data/bin/hdp-sync +17 -0
  23. data/bin/hdp-wc +67 -0
  24. data/bin/md5sort +20 -0
  25. data/bin/tabchar +5 -0
  26. data/bin/uniqc +3 -0
  27. data/bin/wu-hist +3 -0
  28. data/bin/wu-lign +177 -0
  29. data/bin/wu-sum +30 -0
  30. data/doc/README-wulign.textile +59 -0
  31. data/doc/README-wutils.textile +128 -0
  32. data/doc/UsingWukong-part1.textile +2 -0
  33. data/doc/UsingWukong-part2.textile +2 -0
  34. data/doc/UsingWukong-part3-parsing.textile +132 -0
  35. data/doc/code/api_response_example.txt +20 -0
  36. data/doc/code/parser_skeleton.rb +38 -0
  37. data/doc/hadoop-setup.textile +21 -0
  38. data/doc/intro_to_map_reduce/MapReduceDiagram.graffle +0 -0
  39. data/doc/links.textile +42 -0
  40. data/doc/overview.textile +91 -0
  41. data/doc/pig/PigLatinExpressionsList.txt +122 -0
  42. data/doc/pig/PigLatinReferenceManual.html +19134 -0
  43. data/doc/pig/PigLatinReferenceManual.txt +1640 -0
  44. data/doc/tips.textile +65 -0
  45. data/doc/utils.textile +48 -0
  46. data/examples/README.txt +17 -0
  47. data/examples/and_pig/sample_queries.rb +128 -0
  48. data/examples/apache_log_parser.rb +53 -0
  49. data/examples/count_keys.rb +56 -0
  50. data/examples/count_keys_at_mapper.rb +57 -0
  51. data/examples/graph/adjacency_list.rb +74 -0
  52. data/examples/graph/breadth_first_search.rb +79 -0
  53. data/examples/graph/gen_2paths.rb +68 -0
  54. data/examples/graph/gen_multi_edge.rb +103 -0
  55. data/examples/graph/gen_symmetric_links.rb +53 -0
  56. data/examples/package-local.rb +100 -0
  57. data/examples/package.rb +96 -0
  58. data/examples/pagerank/README.textile +6 -0
  59. data/examples/pagerank/gen_initial_pagerank_graph.pig +57 -0
  60. data/examples/pagerank/pagerank.rb +88 -0
  61. data/examples/pagerank/pagerank_initialize.rb +46 -0
  62. data/examples/pagerank/run_pagerank.sh +19 -0
  63. data/examples/rank_and_bin.rb +173 -0
  64. data/examples/run_all.sh +47 -0
  65. data/examples/sample_records.rb +44 -0
  66. data/examples/size.rb +60 -0
  67. data/examples/word_count.rb +95 -0
  68. data/lib/wukong.rb +11 -0
  69. data/lib/wukong/and_pig.rb +62 -0
  70. data/lib/wukong/and_pig/README.textile +12 -0
  71. data/lib/wukong/and_pig/as.rb +37 -0
  72. data/lib/wukong/and_pig/data_types.rb +30 -0
  73. data/lib/wukong/and_pig/functions.rb +50 -0
  74. data/lib/wukong/and_pig/generate.rb +85 -0
  75. data/lib/wukong/and_pig/generate/variable_inflections.rb +85 -0
  76. data/lib/wukong/and_pig/junk.rb +51 -0
  77. data/lib/wukong/and_pig/operators.rb +8 -0
  78. data/lib/wukong/and_pig/operators/compound.rb +29 -0
  79. data/lib/wukong/and_pig/operators/evaluators.rb +7 -0
  80. data/lib/wukong/and_pig/operators/execution.rb +15 -0
  81. data/lib/wukong/and_pig/operators/file_methods.rb +29 -0
  82. data/lib/wukong/and_pig/operators/foreach.rb +98 -0
  83. data/lib/wukong/and_pig/operators/groupies.rb +212 -0
  84. data/lib/wukong/and_pig/operators/load_store.rb +65 -0
  85. data/lib/wukong/and_pig/operators/meta.rb +42 -0
  86. data/lib/wukong/and_pig/operators/relational.rb +129 -0
  87. data/lib/wukong/and_pig/pig_struct.rb +48 -0
  88. data/lib/wukong/and_pig/pig_var.rb +95 -0
  89. data/lib/wukong/and_pig/symbol.rb +29 -0
  90. data/lib/wukong/and_pig/utils.rb +0 -0
  91. data/lib/wukong/bad_record.rb +18 -0
  92. data/lib/wukong/boot.rb +47 -0
  93. data/lib/wukong/datatypes.rb +24 -0
  94. data/lib/wukong/datatypes/enum.rb +123 -0
  95. data/lib/wukong/dfs.rb +80 -0
  96. data/lib/wukong/encoding.rb +111 -0
  97. data/lib/wukong/extensions.rb +15 -0
  98. data/lib/wukong/extensions/array.rb +18 -0
  99. data/lib/wukong/extensions/blank.rb +93 -0
  100. data/lib/wukong/extensions/class.rb +189 -0
  101. data/lib/wukong/extensions/date_time.rb +24 -0
  102. data/lib/wukong/extensions/emittable.rb +82 -0
  103. data/lib/wukong/extensions/hash.rb +120 -0
  104. data/lib/wukong/extensions/hash_like.rb +112 -0
  105. data/lib/wukong/extensions/hashlike_class.rb +47 -0
  106. data/lib/wukong/extensions/module.rb +2 -0
  107. data/lib/wukong/extensions/pathname.rb +27 -0
  108. data/lib/wukong/extensions/string.rb +65 -0
  109. data/lib/wukong/extensions/struct.rb +17 -0
  110. data/lib/wukong/extensions/symbol.rb +11 -0
  111. data/lib/wukong/logger.rb +40 -0
  112. data/lib/wukong/models/graph.rb +27 -0
  113. data/lib/wukong/rdf.rb +104 -0
  114. data/lib/wukong/schema.rb +39 -0
  115. data/lib/wukong/script.rb +265 -0
  116. data/lib/wukong/script/hadoop_command.rb +111 -0
  117. data/lib/wukong/script/local_command.rb +14 -0
  118. data/lib/wukong/streamer.rb +13 -0
  119. data/lib/wukong/streamer/accumulating_reducer.rb +89 -0
  120. data/lib/wukong/streamer/base.rb +76 -0
  121. data/lib/wukong/streamer/count_keys.rb +30 -0
  122. data/lib/wukong/streamer/count_lines.rb +26 -0
  123. data/lib/wukong/streamer/filter.rb +20 -0
  124. data/lib/wukong/streamer/line_streamer.rb +12 -0
  125. data/lib/wukong/streamer/list_reducer.rb +20 -0
  126. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +22 -0
  127. data/lib/wukong/streamer/rank_and_bin_reducer.rb +145 -0
  128. data/lib/wukong/streamer/set_reducer.rb +14 -0
  129. data/lib/wukong/streamer/struct_streamer.rb +48 -0
  130. data/lib/wukong/streamer/summing_reducer.rb +29 -0
  131. data/lib/wukong/streamer/uniq_by_last_reducer.rb +44 -0
  132. data/lib/wukong/typed_struct.rb +12 -0
  133. data/lib/wukong/wukong_class.rb +20 -0
  134. data/spec/bin/hdp-wc_spec.rb +4 -0
  135. data/spec/spec_helper.rb +0 -0
  136. data/wukong.gemspec +173 -0
  137. metadata +208 -0
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ $: << ENV['WUKONG_PATH']
3
+ require 'wukong'
4
+
5
+ #
6
+ # Use this script to do a Breadth-First Search (BFS) of a graph.
7
+ #
8
+ # Usage:
9
+ # ./make_paths --head=[path_in_key] --tail=[path_out_key] --out_rsrc=[combined_path_key]
10
+ #
11
+ # For example, given an edge list in the file '1path.tsv' that looks like
12
+ # 1path n1 n2
13
+ # 1path n1 n3
14
+ # ... and so forth ...
15
+ # you can run
16
+ # for t in 1 2 3 4 5 6 7 8 9 ; do next=$((t+1)) ; time cat 1path.tsv "${t}path.tsv" | ./make_paths.rb --map --head="1path" --tail="${t}path" | sort -u | ./make_paths.rb --reduce --out_rsrc="${next}path" | sort -u > "${next}path.tsv" ; done
17
+ # to do a 9-deep breadth-first search.
18
+ #
19
+ module Gen1HoodEdges
20
#
# Routes each path record to the node at which two paths can be joined.
#
# A record whose resource name equals +head+ is keyed on its last node and
# tagged 'i' (inbound); a record whose resource name equals +tail+ is keyed on
# its first node and tagged 'o' (outbound). A record matching both is emitted
# twice; a record matching neither is dropped.
#
class Mapper < Wukong::Streamer::Base
  attr_accessor :head, :tail

  # options:: hash-like object; only :head and :tail are read here
  def initialize(options)
    self.head = options[:head]
    self.tail = options[:tail]
  end

  # rsrc::  resource name found in the record's first field
  # nodes:: the remaining fields, i.e. the nodes along the path, in order
  def process(rsrc, *nodes)
    if rsrc == head
      # key on the final node; payload is every node leading up to it
      yield [nodes.last, 'i', nodes[0..-2]]
    end
    if rsrc == tail
      # key on the first node; payload is every node after it
      yield [nodes.first, 'o', nodes[1..-1]]
    end
  end
end
31
+
32
+ #
33
+ # Accumulate ( !!in memory!!) all inbound links onto middle node
34
+ #
35
+ # Then for each outbound link, loop over those inbound links and emit the
36
+ # triple (in, mid,out)
37
+ #
38
#
# Joins inbound paths to outbound paths at their shared middle node.
#
# Inbound ('i') records are buffered in memory; each outbound ('o') record is
# then combined with every buffered inbound path. Inbound records that arrive
# after an outbound record for the same key are not joined with it, so this
# relies on 'i' records being delivered first.
#
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :paths_in, :out_rsrc

  # options:: hash-like object; only :out_rsrc (label for emitted records) is read
  def initialize(options)
    self.out_rsrc = options[:out_rsrc]
  end

  # Reset the buffer of inbound paths for a new key.
  def start!(*args)
    self.paths_in = []
  end

  # mid::   the middle node (the grouping key)
  # dir::   'i' for an inbound path, 'o' for an outbound path
  # nodes:: the other nodes on the path
  def accumulate(mid, dir, *nodes)
    if dir == 'i'
      paths_in << nodes
      buffered = paths_in.length
      # progress note on stderr for very large fan-in: every 1000 past 10000
      if buffered > 10000 && (buffered % 1000).zero?
        $stderr.puts ["Accumulating:", mid, buffered].join("\t")
      end
    elsif dir == 'o'
      paths_in.each { |path_in| yield [out_rsrc, path_in, mid, *nodes] }
    end
  end

  # Nothing to flush: all output is produced during accumulate.
  def finalize
  end

  # Records are grouped on their first field, the middle node.
  def get_key(mid, *_)
    mid
  end
end
66
+
67
# Script wrapper that layers this job's sort/partition settings over the
# inherited defaults.
class Script < Wukong::Script
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
72
+
73
+ end
74
+
75
+ # Execute the script
76
# Build the job from this file's mapper and reducer, then run it.
script = Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer)
script.run
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
# A directed edge from +src+ to +dest+.
class Edge < Struct.new(:src, :dest); end
7
+
8
# One record per (src, dest) pair, with a slot for each relation type in each
# direction.
class MultiEdge < Struct.new(:src, :dest,
                             :a_follows_b,   :b_follows_a,
                             :a_replies_b,   :b_replies_a,
                             :a_favorites_b, :b_favorites_a)
end
15
+
16
+ module Gen1HoodEdges
17
#
# Emits every edge twice: once keyed on its destination and tagged 'i'
# (inbound), once keyed on its source and tagged 'o' (outbound).
#
class Mapper < Wukong::Streamer::Base
  # rsrc:: resource name of the record (not used)
  # src::  source node
  # dest:: destination node
  def process(rsrc, src, dest)
    # next if (src.to_i == 0) || (dest.to_i == 0)
    [[dest, 'i', src], [src, 'o', dest]].each { |record| yield record }
  end
end
24
+
25
+ #
26
+ # Accumulate ( !!in memory!!) all inbound links onto middle node
27
+ #
28
+ # Then for each outbound link, loop over those inbound links and emit the
29
+ # triple (in, mid,out)
30
+ #
31
#
# Builds 2-paths through each middle node.
#
# Inbound ('i') neighbours are buffered in memory; each outbound ('o')
# neighbour is then paired with every buffered inbound neighbour. This relies
# on 'i' records reaching the reducer before 'o' records for the same key.
#
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :ins

  # Reset the buffer of inbound neighbours for a new key.
  def start!(*args)
    self.ins = []
  end

  # mid::  the middle node (the grouping key)
  # dir::  'i' for an inbound link, 'o' for an outbound link
  # node:: the node at the other end of the link
  def accumulate(mid, dir, node)
    if dir == 'i'
      ins << node
      buffered = ins.length
      # progress note on stderr for very large fan-in: every 1000 past 10000
      if buffered > 10000 && (buffered % 1000).zero?
        $stderr.puts ["Accumulating:", mid, buffered].join("\t")
      end
    elsif dir == 'o'
      ins.each { |inn| yield ['path_2', inn, mid, node] }
    end
  end

  # Nothing to flush: all output is produced during accumulate.
  def finalize
  end

  # Records are grouped on their first field, the middle node.
  def get_key(mid, *_)
    mid
  end
end
55
+
56
# Script wrapper that layers this job's sort/partition settings over the
# inherited defaults.
class Script < Wukong::Script
  def default_options
    overrides = { :sort_fields => 2, :partition_fields => 1 }
    super.merge(overrides)
  end
end
61
+
62
+ end
63
+
64
+ # Execute the script
65
# Build the job from this file's mapper and reducer, then run it.
script = Gen1HoodEdges::Script.new(Gen1HoodEdges::Mapper, Gen1HoodEdges::Reducer)
script.run
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+ require 'wukong/models/graph'; include Wukong::Models
5
+
6
+ #
7
+ # Takes any number of flavors of directed edge with the form
8
+ #
9
+ # a_relatesto_b src_id dest_id [optional fields]
10
+ #
11
+ # and prepares a combined adjacency list. You need to supply a model named
12
+ # "MultiEdge" with members for each edge type.
13
+ #
14
+ # For instance, suppose you have a social network with edges like
15
+ #
16
+ # a_follows_b user_a_id user_b_id
17
+ # a_messages_b user_a_id user_b_id message_id date
18
+ # a_favorites_b user_a_id user_b_id message_id date
19
+ #
20
+ # Your MultiEdge class might look like
21
+ #
22
+ # class MultiEdge < Struct.new(
23
+ # :src, :dest,
24
+ # :a_follows_b, :b_follows_a,
25
+ # :a_messages_b, :b_messages_a,
26
+ # :a_favorites_b, :b_favorites_a
27
+ # )
28
+ # end
29
+ #
30
+ # The row for a user pair who follows each other; with user_a #24601 messaging b
31
+ # 57 times and favoriting 5 of user_b's messages; and user_b #8675309 messaging
32
+ # 62 times and favoriting none, will emerge as (tab separated, with [blank]
33
+ # indicating there is no text in that slot):
34
+ #
35
+ # ...
36
+ # 24601 8675309 1 1 57 62 5 [blank]
37
+ # ...
38
+ #
39
+ module GenMultiEdge
40
+ #
41
+ # Emit each relation as
42
+ #
43
+ # src dest rel
44
+ #
45
+ # Canonicalizes the src and dest ids to 10-character, zero-padded strings.
46
+ # (Ten chars fits a 32-bit up-to-4-billion-and-change unsigned integer.)
47
+ # Discards all the ancillary crap except +src+, +dest+ and +rel+
48
+ #
49
#
# Emits each a_<rel>_b record twice: as (src, dest, a_<rel>_b) and, reversed,
# as (dest, src, b_<rel>_a). Ids are rendered as 10-digit zero-padded strings.
# Records whose resource name does not start with a_<rel>_b, or whose src or
# dest converts to 0 via to_i, are dropped.
#
class Mapper < Wukong::Streamer::Base
  def process(rsrc, src, dest, *_)
    # note that a_retweets_b_id matches here
    matched = /^a_([a-z]+)_b.*/.match(rsrc)
    return unless matched
    rel = matched.captures.first
    src_id  = src.to_i
    dest_id = dest.to_i
    return if src_id.zero? || dest_id.zero?
    pad = lambda { |id| "%010d" % id }
    yield [pad.call(src_id),  pad.call(dest_id), "a_#{rel}_b"]
    yield [pad.call(dest_id), pad.call(src_id),  "b_#{rel}_a"]
  end
end
60
+
61
+ #
62
+ # Aggregate all sightings of relations for each pair into
63
+ # a single combined MultiEdge record.
64
+ #
65
+ # Note that [a,b] and [b,a] /each/ have a listing, with the a->b and b<-a
66
+ # relations repeated for each. That is, if there is an "a_messages_b"
67
+ # relation, you'll have edges
68
+ #
69
+ # x y ... a_messages_b(x,y) b_messages_a(y,x) ...
70
+ # y x ... a_messages_b(y,x) b_messages_a(x,y) ...
71
+ #
72
+ #
73
#
# Counts, for each [src, dest] pair, how many times each relation was seen and
# emits one MultiEdge per pair. +rel+ must name a member of MultiEdge, since
# it is used directly as the struct index.
#
class Reducer < Wukong::Streamer::AccumulatingReducer
  attr_accessor :multi_edge

  # Records are grouped on the (src, dest) pair.
  def get_key(src, dest, rel)
    [src, dest]
  end

  # Begin a fresh, empty MultiEdge for the new pair.
  def start!(*args)
    self.multi_edge = MultiEdge.new
  end

  # Bump the counter stored in the slot named by +rel+ (starting from zero).
  def accumulate(src, dest, rel)
    seen_so_far = multi_edge[rel] || 0
    multi_edge[rel] = seen_so_far + 1
  end

  # Fill in the pair's ids from the current key and emit the record.
  def finalize
    src, dest = key
    multi_edge.src  = src
    multi_edge.dest = dest
    yield multi_edge
  end
end
90
+
91
+ #
92
+ # Sort on the first two keys: each @[src, dest]@ pair winds up at the same
93
+ # reducer.
94
+ #
95
# Script wrapper that sets :sort_fields to 2 on top of the inherited defaults.
class Script < Wukong::Script
  def default_options
    overrides = { :sort_fields => 2 }
    super.merge(overrides)
  end
end
100
+
101
+ # Execute the script
102
+ Script.new(Mapper, Reducer).run
103
+ end
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'
3
+ require 'wukong'
4
+
5
# A directed edge from +src+ to +dest+.
class Edge < Struct.new(:src, :dest); end
7
+
8
# Edge subclass used for the records this script emits; it adds no members
# or behaviour of its own.
class ASymmetricB < Edge; end
10
+
11
module Wukong::Streamer
  #
  # Streamer that turns each input line into a single ASymmetricB edge.
  #
  class EdgeStreamer < Wukong::Streamer::Base
    # Takes the fields produced by the parent's recordize, ignores the first
    # (resource name) and anything past the third, and returns a one-element
    # array holding the edge with integer src and dest.
    def recordize(line)
      _rsrc, src, dest = super(line)
      edge = ASymmetricB.new(src.to_i, dest.to_i)
      [edge]
    end
  end
end
19
+
20
+ module FindSymmetricLinks
21
+
22
#
# Emits every edge in both directions, so that a link present both ways
# produces two identical output lines.
#
class Mapper < Wukong::Streamer::EdgeStreamer
  def process(edge)
    reversed = ASymmetricB.new(edge.dest, edge.src)
    yield edge.to_flat(false)
    yield reversed.to_flat(false)
  end
end
28
+
29
+ #
30
+ #
31
#
# Keeps only the links that were seen exactly twice, i.e. (given the mapper
# emits every edge in both directions) links that exist both ways.
#
# The counting is delegated to `/usr/bin/uniq -c`, which is run with no file
# argument and so reads the reducer's own standard input; identical lines must
# therefore arrive adjacent to one another.
# NOTE(review): duplicate edges in the input would also inflate the count --
# confirm the input is de-duplicated.
#
class Reducer < Wukong::Streamer::Base
  def stream
    %x{/usr/bin/uniq -c}.split("\n").each do |line|
      # uniq -c output: "<count> <rsrc> <src> <dest> [<data>]". The limit is 5
      # so that each of the five variables gets its own field; with a limit of
      # 4, +data+ was always nil and trailing text was fused onto +dest+.
      key_count, _rsrc, src, dest, data = line.chomp.strip.split(/\s+/, 5)
      next unless key_count.to_i == 2
      # each symmetric pair shows up in both orders; emit it only once
      next unless src.to_i < dest.to_i
      emit [src, dest, data].compact
    end
  end
end
41
+
42
# Script wrapper that sets :sort_fields to 3 on top of the inherited defaults.
class Script < Wukong::Script
  def default_options
    overrides = { :sort_fields => 3 }
    super.merge(overrides)
  end
end
47
+ end
48
+
49
+ # Execute the script
50
# Run through FindSymmetricLinks::Script (not the bare Wukong::Script) so that
# its default_options override (:sort_fields => 3) actually takes effect.
FindSymmetricLinks::Script.new(
  FindSymmetricLinks::Mapper,
  FindSymmetricLinks::Reducer
).run
@@ -0,0 +1,100 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../lib'
3
+
4
+ require 'wukong'
5
+
6
+ #
7
+ # This is so very very kludgey
8
+ #
9
+ # Input is an 'ls' file, listing files to .bz2 package.
10
+ #
11
+ # Reducer takes each in turn and creates, within a parallel directory tree under
12
+ # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
13
+ #
14
+ # So, the file
15
+ # /user/me/fixd/all-20090103
16
+ # is packaged onto the DFS as
17
+ # /user/me/pkgd/user/me/fixd/all-20090103
18
+ #
19
+ # listing=tmp/fixd-all-package-listing
20
+ # hdp-rm $listing
21
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
22
+ #
23
+ # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
24
+ #
25
+ module ExportPackager
26
+ PKGD_DIR = '/workspace/flip/pkgd'
27
+
28
+ #
29
+ #
30
#
# For each listed path: cats its contents from HDFS through bzip2 into a file
# under PKGD_DIR on the local filesystem, then rsyncs that file to the :lab3
# host. After all input is processed the whole PKGD_DIR is rsynced once more.
#
class Reducer < Wukong::Streamer::Base
  # Echo the given messages to both stdout and stderr.
  def announce(*args)
    $stdout.puts(*args)
    $stderr.puts(*args)
  end

  # Removes +output_filename+ if it already exists. Always returns true, so
  # the caller proceeds whether or not there was a file to remove.
  def handle_existing_target(output_filename)
    return true unless File.exist?(output_filename)
    # announce "Exists! #{output_filename}"
    # return false
    announce "Removing target file #{output_filename}"
    begin
      announce `rm #{output_filename}`
    rescue StandardError => e
      # best effort: report and carry on. StandardError rather than Exception,
      # so that signals and SystemExit are not swallowed.
      announce e
    end
    true
  end

  # Creates the directory that will hold +output_filename+ (mkdir -p).
  def mkdir_target_safely(output_filename)
    output_dir = File.dirname(output_filename)
    announce "Ensuring directory #{output_dir} exists"
    begin
      announce `mkdir -p #{output_dir}`
    rescue StandardError => e
      # best effort, as above
      announce e
    end
  end

  # Cats the HDFS files under +input_filename+ through bzip2 into the local
  # +output_filename+.
  def bzip_into_pkgd_file(input_filename, output_filename)
    announce "bzip'ing into #{output_filename}"
    announce `( hadoop dfs -cat #{input_filename}/[^_]\** ) | bzip2 -c > #{output_filename}`
  end

  # Maps an input path to its packaged path: PKGD_DIR/<path>.bz2, with '.tsv'
  # appended first when the path does not already end in an extension of two
  # or more word characters. The check is anchored at the end of the path, so
  # a dot in a parent directory name no longer counts as an extension.
  def gen_output_filename(input_filename)
    input_filename += '.tsv' unless input_filename =~ /\.\w{2,}\z/
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(/^\//, '')]
  end

  # Rsyncs +local_path+ to +host+ (at +remote_path+, defaulting to the same
  # path), then pauses for five seconds.
  def rsync(host, local_path, remote_path=nil)
    remote_path ||= local_path
    announce `/usr/bin/rsync -Cuvrtlp #{local_path} #{host}:#{remote_path}`
    sleep 5
  end

  # Packages a single input path end to end.
  def process(input_filename)
    output_filename = gen_output_filename(input_filename)
    handle_existing_target(output_filename) or return
    mkdir_target_safely output_filename
    bzip_into_pkgd_file input_filename, output_filename
    rsync :lab3, output_filename
    #
  end

  # Takes the last whitespace-separated token of the line as the path.
  def recordize(line)
    # handle ls or straight file list, either
    line.split(/\s/).last
  end

  # Runs the parent's stream, then syncs the whole package directory.
  def stream
    super
    rsync :lab3, PKGD_DIR+'/'
  end
end
88
+
89
# Script wrapper that layers this job's task-count and timeout settings over
# the inherited defaults.
class Script < Wukong::Script
  def default_options
    overrides = {
      :map_tasks             => 1,
      :max_node_reduce_tasks => 1,             # only one reducer per local filesystem
      :timeout               => 40 * 60 * 1000 # timeout in ms
    }
    super.merge(overrides)
  end
end
96
+ # Execute the script
97
+ Script.new(nil, Reducer).run
98
+ end
99
+
100
+
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env ruby
2
+ $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
3
+ require 'wukong'
4
+
5
+ #
6
+ # This is so very very kludgey
7
+ #
8
+ # Input is an 'ls' file, listing files to .bz2 package.
9
+ #
10
+ # Reducer takes each in turn and creates, within a parallel directory tree under
11
+ # ~/pkgd on the HDFS, a .bz2 compressed version of the file.
12
+ #
13
+ # So, the file
14
+ # /user/me/fixd/all-20090103
15
+ # is packaged onto the DFS as
16
+ # /user/me/pkgd/user/me/fixd/all-20090103
17
+ #
18
+ # listing=tmp/fixd-all-package-listing
19
+ # hdp-rm $listing
20
+ # hadoop dfs -lsr fixd | egrep '(part-|\.tsv$)' | hdp-put - $listing ;
21
+ #
22
+ # ./package.rb --run --rm --map_tasks=1 $listing $pkgd_log
23
+ #
24
+ module ExportPackager
25
+ PKGD_DIR = 'pkgd'
26
+
27
+ #
28
+ #
29
#
# For each path listed on stdin: cats its contents from HDFS through bzip2 and
# puts the result back onto HDFS under PKGD_DIR, then prints sha1sums of the
# uncompressed package and of the original so they can be compared.
#
class Reducer < Wukong::Streamer::Base
  # Echo +str+ to both stderr and stdout; blank messages are dropped.
  def announce(str)
    return if str.blank?
    $stderr.puts str
    $stdout.puts str
  end

  # Removes +output_filename+ from HDFS, reporting the command's output.
  # Errors raised while running the command are swallowed (best effort).
  def remove_target_filename(output_filename)
    begin announce "rm\t#{"%-70s"%output_filename}\t" +
        `( hadoop dfs -rmr #{output_filename} ) 2>&1`
    rescue ; nil ; end
  end

  # Creates the HDFS directory that will hold +output_filename+.
  # Errors raised while running the command are swallowed (best effort).
  def mkdir_target_safely(output_filename)
    output_dir = File.dirname(output_filename)
    begin announce "mkdir\t#{"%-70s"%output_dir}\t" +
        `( hadoop dfs -mkdir #{output_dir} ) 2>&1`
    rescue ; nil ; end
  end

  # Cats the HDFS files under +input_filename+ through bzip2 and puts the
  # result onto HDFS at +output_filename+.
  def bzip_into_pkgd_file(input_filename, output_filename)
    announce "cat|bz\t#{"%-70s"%input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | bzip2 -c | hadoop dfs -put - #{output_filename} ) 2>&1`
  end

  # Prints the sha1sum of the decompressed package and of the original
  # contents. The two are only reported, not compared here.
  def verify(input_filename, output_filename)
    announce "sha1sum\t#{"%-70s"%output_filename}\t" +
      `( hadoop dfs -cat #{output_filename} | bzcat - | sha1sum ) 2>&1`
    announce "sha1sum\t#{"%-70s"%input_filename}\t" +
      `( hadoop dfs -cat #{input_filename}/[^_]\\* | sha1sum ) 2>&1`
  end

  # Maps an input path to PKGD_DIR/<path without leading slash>.bz2
  def gen_output_filename(input_filename)
    "%s/%s.bz2" % [PKGD_DIR, input_filename.gsub(%r{^/},"")]
  end

  # Packages one path and reports its checksums.
  def process(input_filename, output_filename)
    # remove_target_filename output_filename
    # mkdir_target_safely    output_filename
    bzip_into_pkgd_file input_filename, output_filename
    verify              input_filename, output_filename
  end

  # Announces the host name, then packages every path listed on stdin.
  def stream
    announce `hostname`
    $stdin.each do |input_filename|
      # handle ls or straight file list, either
      input_filename = input_filename.chomp.strip.split(/\s/).last
      # a blank line yields no token at all; skip it rather than crash in
      # gen_output_filename on nil
      next if input_filename.nil?
      output_filename = gen_output_filename input_filename
      announce "********************************************************"
      announce "Packing\t#{"%-70s"%input_filename}\t#{output_filename}"
      process input_filename, output_filename
      announce "Done\t#{"%-70s"%input_filename}\t#{output_filename}\n\n"
    end
  end
end
85
+
86
# Script wrapper that raises the timeout to one day on top of the inherited
# defaults.
class Script < Wukong::Script
  def default_options
    one_day_ms = 24 * 60 * 60 * 1000 # milliseconds in one day
    super.merge(:timeout => one_day_ms)
  end
end
91
+ end
92
+
93
+ #
94
+ # Execute the script
95
+ #
96
+ ExportPackager::Script.new(nil, ExportPackager::Reducer, :reduce_tasks => 1000).run