mandy 0.4.83 → 0.4.86
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +2 -1
- data/bin/mandy +0 -1
- data/lib/mandy.rb +1 -0
- metadata +1 -5
- data/bin/mandy-run +0 -58
- data/lib/wrappers/mandy_local_wrapper.rb +0 -58
- data/lib/wrappers/mandy_wrapper.rb +0 -94
data/Gemfile
CHANGED
data/bin/mandy
CHANGED
@@ -35,7 +35,6 @@ puts '------------------------'
|
|
35
35
|
'mandy-install' => 'Installs the Mandy Rubygem on several hosts via ssh.',
|
36
36
|
'mandy-local' => 'Run a Map/Reduce task locally without requiring hadoop',
|
37
37
|
'mandy-hadoop' => 'Run a Map/Reduce task on hadoop using the provided cluster config',
|
38
|
-
'mandy-run' => 'Run an entire Map/Reduce workflow with one command',
|
39
38
|
'mandy-rm' => 'remove a file or directory from HDFS',
|
40
39
|
'mandy-put' => 'upload a file into HDFS',
|
41
40
|
'mandy-map' => 'Run a map task reading on STDIN and writing to STDOUT',
|
data/lib/mandy.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mandy
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.83
|
4
|
+
version: 0.4.86
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andy Kent
|
@@ -49,7 +49,6 @@ executables:
|
|
49
49
|
- mandy-mkdir
|
50
50
|
- mandy-exists
|
51
51
|
- mandy-install
|
52
|
-
- mandy-run
|
53
52
|
extensions: []
|
54
53
|
|
55
54
|
extra_rdoc_files: []
|
@@ -61,7 +60,6 @@ files:
|
|
61
60
|
- bin/mandy-get
|
62
61
|
- bin/mandy-put
|
63
62
|
- bin/mandy-reduce
|
64
|
-
- bin/mandy-run
|
65
63
|
- readme.md
|
66
64
|
- Rakefile
|
67
65
|
- bootstrap.rb
|
@@ -93,8 +91,6 @@ files:
|
|
93
91
|
- lib/ruby-hbase/version.rb
|
94
92
|
- lib/ruby-hbase/xml_decoder.rb
|
95
93
|
- lib/test_runner.rb
|
96
|
-
- lib/wrappers/mandy_wrapper.rb
|
97
|
-
- lib/wrappers/mandy_local_wrapper.rb
|
98
94
|
has_rdoc: true
|
99
95
|
homepage: http://github.com/trafficbroker/mandy
|
100
96
|
licenses: []
|
data/bin/mandy-run
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require "rubygems"
|
3
|
-
require "mandy"
|
4
|
-
require 'optparse'
|
5
|
-
require 'ostruct'
|
6
|
-
require 'json'
|
7
|
-
|
8
|
-
options = OpenStruct.new
|
9
|
-
|
10
|
-
OptionParser.new do |opts|
|
11
|
-
opts.banner = "USAGE: mandy-run script input [options]"
|
12
|
-
|
13
|
-
opts.on("-p", "--payload PAYLOAD", "Add a working directory to be sent to the cluster.") do |payload|
|
14
|
-
options.payload = payload
|
15
|
-
end
|
16
|
-
|
17
|
-
opts.on("-o", "--output OUTPUT", "Specify output path for your results.") do |payload|
|
18
|
-
options.payload = payload
|
19
|
-
end
|
20
|
-
|
21
|
-
opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
|
22
|
-
options.config = config
|
23
|
-
end
|
24
|
-
|
25
|
-
opts.on("-j", '--json "{\"key\":\"1 value\"}"', "Pass JSON encoded parameters to jobs") do |config|
|
26
|
-
options.json = config
|
27
|
-
end
|
28
|
-
|
29
|
-
opts.on("-l", "--local", "Run script using mandy-local.") do
|
30
|
-
options.local = true
|
31
|
-
end
|
32
|
-
|
33
|
-
opts.on_tail("-h", "--help", "Show this message") do
|
34
|
-
puts opts
|
35
|
-
exit
|
36
|
-
end
|
37
|
-
end.parse!
|
38
|
-
|
39
|
-
exec('mandy-local -h') unless ARGV.size >= 2
|
40
|
-
|
41
|
-
file = ARGV[0]
|
42
|
-
input = File.new(ARGV[1])
|
43
|
-
|
44
|
-
params = {}
|
45
|
-
params[:output_file] = options.output if options.output
|
46
|
-
params[:lib] = options.payload if options.payload
|
47
|
-
params[:parameters] = JSON.parse(options.json) if options.json
|
48
|
-
|
49
|
-
if options.local
|
50
|
-
require 'wrappers/mandy_local_wrapper'
|
51
|
-
else
|
52
|
-
require 'wrappers/mandy_wrapper'
|
53
|
-
set_mandy_config options.config || 'cluster.xml'
|
54
|
-
end
|
55
|
-
|
56
|
-
output_file = run_mandy(file, input, params)
|
57
|
-
|
58
|
-
puts "Results stored in: #{output_file}"
|
data/lib/wrappers/mandy_local_wrapper.rb
DELETED
@@ -1,58 +0,0 @@
|
|
1
|
-
module Mandy
|
2
|
-
module Local
|
3
|
-
module Wrapper
|
4
|
-
SESSION_ID = Process.pid
|
5
|
-
|
6
|
-
def run_mandy(script, input_files, options = {})
|
7
|
-
begin
|
8
|
-
#doing this will load all the mandy jobs in memory which will be useful later on
|
9
|
-
require script
|
10
|
-
|
11
|
-
input_file = concat_input_files(input_files)
|
12
|
-
output_file_path = run_mandy_local(script, input_file, options)
|
13
|
-
return output_file_path unless block_given?
|
14
|
-
#if a block is given then yield the output file path and then delete this file before returning
|
15
|
-
yield output_file_path
|
16
|
-
ensure
|
17
|
-
File.delete(input_file) if File.exists?(input_file)
|
18
|
-
File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
def concat_input_files(inputs)
|
24
|
-
inputs = [inputs] unless inputs.is_a?(Array)
|
25
|
-
base_dir = File.dirname(inputs.first.path)
|
26
|
-
input_file = "#{base_dir}/#{SESSION_ID}.csv"
|
27
|
-
`cat #{inputs.collect{|f| f.path}.join(' ')} > #{input_file}`
|
28
|
-
input_file
|
29
|
-
end
|
30
|
-
|
31
|
-
def run_mandy_local(script, input, options)
|
32
|
-
mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
|
33
|
-
param_args = "export json='#{mandy_job_params.to_json}' &&"
|
34
|
-
|
35
|
-
if options.include?(:lib)
|
36
|
-
FileUtils.cp(script, options[:lib])
|
37
|
-
script = File.join(options[:lib], File.basename(script))
|
38
|
-
end
|
39
|
-
|
40
|
-
output_path = options[:output_file] || generate_output_path
|
41
|
-
output_file = `#{param_args} mandy-local #{script} #{input} #{output_path}`
|
42
|
-
output_file = output_file.split("\n").last
|
43
|
-
output_file
|
44
|
-
ensure
|
45
|
-
File.delete(script) if options.include?(:lib)
|
46
|
-
end
|
47
|
-
|
48
|
-
def generate_output_path
|
49
|
-
output_dir = "/tmp/mandy_local_output"
|
50
|
-
FileUtils.mkdir_p(output_dir)
|
51
|
-
file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
|
52
|
-
"#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
Object.send(:include, Mandy::Local::Wrapper)
|
data/lib/wrappers/mandy_wrapper.rb
DELETED
@@ -1,94 +0,0 @@
|
|
1
|
-
module Mandy
|
2
|
-
module Wrapper
|
3
|
-
SESSION_ID = Process.pid
|
4
|
-
|
5
|
-
def set_mandy_config(file_path)
|
6
|
-
@@config_path = file_path
|
7
|
-
end
|
8
|
-
|
9
|
-
def run_mandy(script, inputs, options = {})
|
10
|
-
begin
|
11
|
-
#doing this will load all the mandy jobs in memory which will be useful later on
|
12
|
-
require script
|
13
|
-
inputs = [inputs] unless inputs.is_a?(Array)
|
14
|
-
|
15
|
-
hdfs_input = inputs.all? {|i| i.is_a?(File)} ? process_files(inputs) : process_hdfs_locations(inputs)
|
16
|
-
|
17
|
-
run_mandy_hadoop(hdfs_input, script, options)
|
18
|
-
|
19
|
-
output_file_path = get_file_from_hdfs(hdfs_path, options)
|
20
|
-
return output_file_path unless block_given?
|
21
|
-
#if a block is given then yield the output file path and then delete this file before returning
|
22
|
-
yield output_file_path
|
23
|
-
ensure
|
24
|
-
File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
private
|
29
|
-
def process_files(input_files)
|
30
|
-
hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
|
31
|
-
put_files_on_hdfs(hdfs_path, input_files)
|
32
|
-
hdfs_path
|
33
|
-
end
|
34
|
-
|
35
|
-
def process_locations(input_locations)
|
36
|
-
return input_locations.first if input_locations.size == 1
|
37
|
-
|
38
|
-
hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
|
39
|
-
input_locations.each_with_index do |location, index|
|
40
|
-
run_command "mandy-cp #{location} #{hdfs_path}/input#{index}"
|
41
|
-
end
|
42
|
-
hdfs_path
|
43
|
-
end
|
44
|
-
|
45
|
-
def put_files_on_hdfs(hdfs_path, input_files)
|
46
|
-
input_files.each do |input_file|
|
47
|
-
input_file_path = input_file.is_a?(File) ? File.expand_path(input_file.path) : input_file
|
48
|
-
base_filename = input_file_path.split("/").last
|
49
|
-
dest_file = ["input/#{hdfs_path}", base_filename].join("/")
|
50
|
-
run_command "mandy-put #{input_file_path} #{dest_file}"
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def run_mandy_hadoop(hdfs_path, script, options)
|
55
|
-
mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
|
56
|
-
param_args = "-j '#{mandy_job_params.to_json}'"
|
57
|
-
param_args += " -p '#{options[:lib]}'" if options.include?(:lib)
|
58
|
-
|
59
|
-
hdfs_output_path = "output/#{hdfs_path}"
|
60
|
-
run_command "mandy-rm output/#{hdfs_path}"
|
61
|
-
run_command "mandy-hadoop #{script} input/#{hdfs_path} output/#{hdfs_path} #{param_args}"
|
62
|
-
end
|
63
|
-
|
64
|
-
def get_file_from_hdfs(hdfs_path, options)
|
65
|
-
output_file_path = options[:output_file] || generate_output_path
|
66
|
-
hdfs_output_path = "output/#{hdfs_path}"
|
67
|
-
run_command "mandy-get #{get_hdfs_output(hdfs_output_path)} #{output_file_path}"
|
68
|
-
run_command "mandy-rm input/#{hdfs_path}"
|
69
|
-
run_command "mandy-rm output/#{hdfs_path}"
|
70
|
-
output_file_path
|
71
|
-
end
|
72
|
-
|
73
|
-
def run_command(command)
|
74
|
-
command = "#{command} -c #{@@config_path}"
|
75
|
-
respond_to?(:logger) ? logger.info(command) : p(command)
|
76
|
-
@output = `#{command}`
|
77
|
-
end
|
78
|
-
|
79
|
-
def generate_output_path
|
80
|
-
output_dir = "/tmp/mandy_output"
|
81
|
-
FileUtils.mkdir_p(output_dir)
|
82
|
-
file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
|
83
|
-
"#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
|
84
|
-
end
|
85
|
-
|
86
|
-
def get_hdfs_output(hdfs_output_path)
|
87
|
-
@output.each_line do |line|
|
88
|
-
return line.chomp.strip if line.include?(hdfs_output_path)
|
89
|
-
end
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
Object.send(:include, Mandy::Wrapper)
|