mandy 0.4.83 → 0.4.86

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,2 +1,3 @@
1
1
  source "http://gemcutter.org"
2
- gem 'mandy'
2
+ gem 'mandy'
3
+ gem 'json'
data/bin/mandy CHANGED
@@ -35,7 +35,6 @@ puts '------------------------'
35
35
  'mandy-install' => 'Installs the Mandy Rubygem on several hosts via ssh.',
36
36
  'mandy-local' => 'Run a Map/Reduce task locally without requiring hadoop',
37
37
  'mandy-hadoop' => 'Run a Map/Reduce task on hadoop using the provided cluster config',
38
- 'mandy-run' => 'Run an entire Map/Reduce workflow with one command',
39
38
  'mandy-rm' => 'remove a file or directory from HDFS',
40
39
  'mandy-put' => 'upload a file into HDFS',
41
40
  'mandy-map' => 'Run a map task reading on STDIN and writing to STDOUT',
data/lib/mandy.rb CHANGED
@@ -11,6 +11,7 @@ require "cgi"
11
11
  packer
12
12
  support/tuple
13
13
  support/array_serializer
14
+ support/hdfs_location
14
15
  mappers/base_mapper
15
16
  mappers/transpose_mapper
16
17
  mappers/pass_through_mapper
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mandy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.83
4
+ version: 0.4.86
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andy Kent
@@ -49,7 +49,6 @@ executables:
49
49
  - mandy-mkdir
50
50
  - mandy-exists
51
51
  - mandy-install
52
- - mandy-run
53
52
  extensions: []
54
53
 
55
54
  extra_rdoc_files: []
@@ -61,7 +60,6 @@ files:
61
60
  - bin/mandy-get
62
61
  - bin/mandy-put
63
62
  - bin/mandy-reduce
64
- - bin/mandy-run
65
63
  - readme.md
66
64
  - Rakefile
67
65
  - bootstrap.rb
@@ -93,8 +91,6 @@ files:
93
91
  - lib/ruby-hbase/version.rb
94
92
  - lib/ruby-hbase/xml_decoder.rb
95
93
  - lib/test_runner.rb
96
- - lib/wrappers/mandy_wrapper.rb
97
- - lib/wrappers/mandy_local_wrapper.rb
98
94
  has_rdoc: true
99
95
  homepage: http://github.com/trafficbroker/mandy
100
96
  licenses: []
data/bin/mandy-run DELETED
@@ -1,58 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require "rubygems"
3
- require "mandy"
4
- require 'optparse'
5
- require 'ostruct'
6
- require 'json'
7
-
8
- options = OpenStruct.new
9
-
10
- OptionParser.new do |opts|
11
- opts.banner = "USAGE: mandy-run script input [options]"
12
-
13
- opts.on("-p", "--payload PAYLOAD", "Add a working directory to be sent to the cluster.") do |payload|
14
- options.payload = payload
15
- end
16
-
17
- opts.on("-o", "--output OUTPUT", "Specify output path for your results.") do |payload|
18
- options.payload = payload
19
- end
20
-
21
- opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
22
- options.config = config
23
- end
24
-
25
- opts.on("-j", '--json "{\"key\":\"1 value\"}"', "Pass JSON encoded parameters to jobs") do |config|
26
- options.json = config
27
- end
28
-
29
- opts.on("-l", "--local", "Run script using mandy-local.") do
30
- options.local = true
31
- end
32
-
33
- opts.on_tail("-h", "--help", "Show this message") do
34
- puts opts
35
- exit
36
- end
37
- end.parse!
38
-
39
- exec('mandy-local -h') unless ARGV.size >= 2
40
-
41
- file = ARGV[0]
42
- input = File.new(ARGV[1])
43
-
44
- params = {}
45
- params[:output_file] = options.output if options.output
46
- params[:lib] = options.payload if options.payload
47
- params[:parameters] = JSON.parse(options.json) if options.json
48
-
49
- if options.local
50
- require 'wrappers/mandy_local_wrapper'
51
- else
52
- require 'wrappers/mandy_wrapper'
53
- set_mandy_config options.config || 'cluster.xml'
54
- end
55
-
56
- output_file = run_mandy(file, input, params)
57
-
58
- puts "Results stored in: #{output_file}"
@@ -1,58 +0,0 @@
1
- module Mandy
2
- module Local
3
- module Wrapper
4
- SESSION_ID = Process.pid
5
-
6
- def run_mandy(script, input_files, options = {})
7
- begin
8
- #doing this will load all the mandy jobs in memory which will be useful later on
9
- require script
10
-
11
- input_file = concat_input_files(input_files)
12
- output_file_path = run_mandy_local(script, input_file, options)
13
- return output_file_path unless block_given?
14
- #if a block is given then yield the output file path and then delete this file before returning
15
- yield output_file_path
16
- ensure
17
- File.delete(input_file) if File.exists?(input_file)
18
- File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
19
- end
20
- end
21
-
22
- private
23
- def concat_input_files(inputs)
24
- inputs = [inputs] unless inputs.is_a?(Array)
25
- base_dir = File.dirname(inputs.first.path)
26
- input_file = "#{base_dir}/#{SESSION_ID}.csv"
27
- `cat #{inputs.collect{|f| f.path}.join(' ')} > #{input_file}`
28
- input_file
29
- end
30
-
31
- def run_mandy_local(script, input, options)
32
- mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
33
- param_args = "export json='#{mandy_job_params.to_json}' &&"
34
-
35
- if options.include?(:lib)
36
- FileUtils.cp(script, options[:lib])
37
- script = File.join(options[:lib], File.basename(script))
38
- end
39
-
40
- output_path = options[:output_file] || generate_output_path
41
- output_file = `#{param_args} mandy-local #{script} #{input} #{output_path}`
42
- output_file = output_file.split("\n").last
43
- output_file
44
- ensure
45
- File.delete(script) if options.include?(:lib)
46
- end
47
-
48
- def generate_output_path
49
- output_dir = "/tmp/mandy_local_output"
50
- FileUtils.mkdir_p(output_dir)
51
- file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
52
- "#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
53
- end
54
- end
55
- end
56
- end
57
-
58
- Object.send(:include, Mandy::Local::Wrapper)
@@ -1,94 +0,0 @@
1
- module Mandy
2
- module Wrapper
3
- SESSION_ID = Process.pid
4
-
5
- def set_mandy_config(file_path)
6
- @@config_path = file_path
7
- end
8
-
9
- def run_mandy(script, inputs, options = {})
10
- begin
11
- #doing this will load all the mandy jobs in memory which will be useful later on
12
- require script
13
- inputs = [inputs] unless inputs.is_a?(Array)
14
-
15
- hdfs_input = inputs.all? {|i| i.is_a?(File)} ? process_files(inputs) : process_hdfs_locations(inputs)
16
-
17
- run_mandy_hadoop(hdfs_input, script, options)
18
-
19
- output_file_path = get_file_from_hdfs(hdfs_path, options)
20
- return output_file_path unless block_given?
21
- #if a block is given then yield the output file path and then delete this file before returning
22
- yield output_file_path
23
- ensure
24
- File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
25
- end
26
- end
27
-
28
- private
29
- def process_files(input_files)
30
- hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
31
- put_files_on_hdfs(hdfs_path, input_files)
32
- hdfs_path
33
- end
34
-
35
- def process_locations(input_locations)
36
- return input_locations.first if input_locations.size == 1
37
-
38
- hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
39
- input_locations.each_with_index do |location, index|
40
- run_command "mandy-cp #{location} #{hdfs_path}/input#{index}"
41
- end
42
- hdfs_path
43
- end
44
-
45
- def put_files_on_hdfs(hdfs_path, input_files)
46
- input_files.each do |input_file|
47
- input_file_path = input_file.is_a?(File) ? File.expand_path(input_file.path) : input_file
48
- base_filename = input_file_path.split("/").last
49
- dest_file = ["input/#{hdfs_path}", base_filename].join("/")
50
- run_command "mandy-put #{input_file_path} #{dest_file}"
51
- end
52
- end
53
-
54
- def run_mandy_hadoop(hdfs_path, script, options)
55
- mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
56
- param_args = "-j '#{mandy_job_params.to_json}'"
57
- param_args += " -p '#{options[:lib]}'" if options.include?(:lib)
58
-
59
- hdfs_output_path = "output/#{hdfs_path}"
60
- run_command "mandy-rm output/#{hdfs_path}"
61
- run_command "mandy-hadoop #{script} input/#{hdfs_path} output/#{hdfs_path} #{param_args}"
62
- end
63
-
64
- def get_file_from_hdfs(hdfs_path, options)
65
- output_file_path = options[:output_file] || generate_output_path
66
- hdfs_output_path = "output/#{hdfs_path}"
67
- run_command "mandy-get #{get_hdfs_output(hdfs_output_path)} #{output_file_path}"
68
- run_command "mandy-rm input/#{hdfs_path}"
69
- run_command "mandy-rm output/#{hdfs_path}"
70
- output_file_path
71
- end
72
-
73
- def run_command(command)
74
- command = "#{command} -c #{@@config_path}"
75
- respond_to?(:logger) ? logger.info(command) : p(command)
76
- @output = `#{command}`
77
- end
78
-
79
- def generate_output_path
80
- output_dir = "/tmp/mandy_output"
81
- FileUtils.mkdir_p(output_dir)
82
- file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
83
- "#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
84
- end
85
-
86
- def get_hdfs_output(hdfs_output_path)
87
- @output.each_line do |line|
88
- return line.chomp.strip if line.include?(hdfs_output_path)
89
- end
90
- end
91
- end
92
- end
93
-
94
- Object.send(:include, Mandy::Wrapper)