mandy 0.4.83 → 0.4.86

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,2 +1,3 @@
1
1
  source "http://gemcutter.org"
2
- gem 'mandy'
2
+ gem 'mandy'
3
+ gem 'json'
data/bin/mandy CHANGED
@@ -35,7 +35,6 @@ puts '------------------------'
35
35
  'mandy-install' => 'Installs the Mandy Rubygem on several hosts via ssh.',
36
36
  'mandy-local' => 'Run a Map/Reduce task locally without requiring hadoop',
37
37
  'mandy-hadoop' => 'Run a Map/Reduce task on hadoop using the provided cluster config',
38
- 'mandy-run' => 'Run an entire Map/Reduce workflow with one command',
39
38
  'mandy-rm' => 'remove a file or directory from HDFS',
40
39
  'mandy-put' => 'upload a file into HDFS',
41
40
  'mandy-map' => 'Run a map task reading on STDIN and writing to STDOUT',
data/lib/mandy.rb CHANGED
@@ -11,6 +11,7 @@ require "cgi"
11
11
  packer
12
12
  support/tuple
13
13
  support/array_serializer
14
+ support/hdfs_location
14
15
  mappers/base_mapper
15
16
  mappers/transpose_mapper
16
17
  mappers/pass_through_mapper
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mandy
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.83
4
+ version: 0.4.86
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andy Kent
@@ -49,7 +49,6 @@ executables:
49
49
  - mandy-mkdir
50
50
  - mandy-exists
51
51
  - mandy-install
52
- - mandy-run
53
52
  extensions: []
54
53
 
55
54
  extra_rdoc_files: []
@@ -61,7 +60,6 @@ files:
61
60
  - bin/mandy-get
62
61
  - bin/mandy-put
63
62
  - bin/mandy-reduce
64
- - bin/mandy-run
65
63
  - readme.md
66
64
  - Rakefile
67
65
  - bootstrap.rb
@@ -93,8 +91,6 @@ files:
93
91
  - lib/ruby-hbase/version.rb
94
92
  - lib/ruby-hbase/xml_decoder.rb
95
93
  - lib/test_runner.rb
96
- - lib/wrappers/mandy_wrapper.rb
97
- - lib/wrappers/mandy_local_wrapper.rb
98
94
  has_rdoc: true
99
95
  homepage: http://github.com/trafficbroker/mandy
100
96
  licenses: []
data/bin/mandy-run DELETED
@@ -1,58 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require "rubygems"
3
- require "mandy"
4
- require 'optparse'
5
- require 'ostruct'
6
- require 'json'
7
-
8
- options = OpenStruct.new
9
-
10
- OptionParser.new do |opts|
11
- opts.banner = "USAGE: mandy-run script input [options]"
12
-
13
- opts.on("-p", "--payload PAYLOAD", "Add a working directory to be sent to the cluster.") do |payload|
14
- options.payload = payload
15
- end
16
-
17
- opts.on("-o", "--output OUTPUT", "Specify output path for your results.") do |payload|
18
- options.payload = payload
19
- end
20
-
21
- opts.on("-c", "--conf HADOOP_CONF", "Use this cluster xml config file.") do |config|
22
- options.config = config
23
- end
24
-
25
- opts.on("-j", '--json "{\"key\":\"1 value\"}"', "Pass JSON encoded parameters to jobs") do |config|
26
- options.json = config
27
- end
28
-
29
- opts.on("-l", "--local", "Run script using mandy-local.") do
30
- options.local = true
31
- end
32
-
33
- opts.on_tail("-h", "--help", "Show this message") do
34
- puts opts
35
- exit
36
- end
37
- end.parse!
38
-
39
- exec('mandy-local -h') unless ARGV.size >= 2
40
-
41
- file = ARGV[0]
42
- input = File.new(ARGV[1])
43
-
44
- params = {}
45
- params[:output_file] = options.output if options.output
46
- params[:lib] = options.payload if options.payload
47
- params[:parameters] = JSON.parse(options.json) if options.json
48
-
49
- if options.local
50
- require 'wrappers/mandy_local_wrapper'
51
- else
52
- require 'wrappers/mandy_wrapper'
53
- set_mandy_config options.config || 'cluster.xml'
54
- end
55
-
56
- output_file = run_mandy(file, input, params)
57
-
58
- puts "Results stored in: #{output_file}"
data/lib/wrappers/mandy_local_wrapper.rb DELETED
@@ -1,58 +0,0 @@
1
- module Mandy
2
- module Local
3
- module Wrapper
4
- SESSION_ID = Process.pid
5
-
6
- def run_mandy(script, input_files, options = {})
7
- begin
8
- #doing this will load all the mandy jobs in memory which will be useful later on
9
- require script
10
-
11
- input_file = concat_input_files(input_files)
12
- output_file_path = run_mandy_local(script, input_file, options)
13
- return output_file_path unless block_given?
14
- #if a block is given then yield the output file path and then delete this file before returning
15
- yield output_file_path
16
- ensure
17
- File.delete(input_file) if File.exists?(input_file)
18
- File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
19
- end
20
- end
21
-
22
- private
23
- def concat_input_files(inputs)
24
- inputs = [inputs] unless inputs.is_a?(Array)
25
- base_dir = File.dirname(inputs.first.path)
26
- input_file = "#{base_dir}/#{SESSION_ID}.csv"
27
- `cat #{inputs.collect{|f| f.path}.join(' ')} > #{input_file}`
28
- input_file
29
- end
30
-
31
- def run_mandy_local(script, input, options)
32
- mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
33
- param_args = "export json='#{mandy_job_params.to_json}' &&"
34
-
35
- if options.include?(:lib)
36
- FileUtils.cp(script, options[:lib])
37
- script = File.join(options[:lib], File.basename(script))
38
- end
39
-
40
- output_path = options[:output_file] || generate_output_path
41
- output_file = `#{param_args} mandy-local #{script} #{input} #{output_path}`
42
- output_file = output_file.split("\n").last
43
- output_file
44
- ensure
45
- File.delete(script) if options.include?(:lib)
46
- end
47
-
48
- def generate_output_path
49
- output_dir = "/tmp/mandy_local_output"
50
- FileUtils.mkdir_p(output_dir)
51
- file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
52
- "#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
53
- end
54
- end
55
- end
56
- end
57
-
58
- Object.send(:include, Mandy::Local::Wrapper)
data/lib/wrappers/mandy_wrapper.rb DELETED
@@ -1,94 +0,0 @@
1
- module Mandy
2
- module Wrapper
3
- SESSION_ID = Process.pid
4
-
5
- def set_mandy_config(file_path)
6
- @@config_path = file_path
7
- end
8
-
9
- def run_mandy(script, inputs, options = {})
10
- begin
11
- #doing this will load all the mandy jobs in memory which will be useful later on
12
- require script
13
- inputs = [inputs] unless inputs.is_a?(Array)
14
-
15
- hdfs_input = inputs.all? {|i| i.is_a?(File)} ? process_files(inputs) : process_hdfs_locations(inputs)
16
-
17
- run_mandy_hadoop(hdfs_input, script, options)
18
-
19
- output_file_path = get_file_from_hdfs(hdfs_path, options)
20
- return output_file_path unless block_given?
21
- #if a block is given then yield the output file path and then delete this file before returning
22
- yield output_file_path
23
- ensure
24
- File.delete(output_file_path) if output_file_path && File.exists?(output_file_path) if block_given?
25
- end
26
- end
27
-
28
- private
29
- def process_files(input_files)
30
- hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
31
- put_files_on_hdfs(hdfs_path, input_files)
32
- hdfs_path
33
- end
34
-
35
- def process_locations(input_locations)
36
- return input_locations.first if input_locations.size == 1
37
-
38
- hdfs_path = "#{self.class.to_s.split('::').join('-').downcase}/#{SESSION_ID}"
39
- input_locations.each_with_index do |location, index|
40
- run_command "mandy-cp #{location} #{hdfs_path}/input#{index}"
41
- end
42
- hdfs_path
43
- end
44
-
45
- def put_files_on_hdfs(hdfs_path, input_files)
46
- input_files.each do |input_file|
47
- input_file_path = input_file.is_a?(File) ? File.expand_path(input_file.path) : input_file
48
- base_filename = input_file_path.split("/").last
49
- dest_file = ["input/#{hdfs_path}", base_filename].join("/")
50
- run_command "mandy-put #{input_file_path} #{dest_file}"
51
- end
52
- end
53
-
54
- def run_mandy_hadoop(hdfs_path, script, options)
55
- mandy_job_params = options.include?(:parameters) ? options[:parameters] : {}
56
- param_args = "-j '#{mandy_job_params.to_json}'"
57
- param_args += " -p '#{options[:lib]}'" if options.include?(:lib)
58
-
59
- hdfs_output_path = "output/#{hdfs_path}"
60
- run_command "mandy-rm output/#{hdfs_path}"
61
- run_command "mandy-hadoop #{script} input/#{hdfs_path} output/#{hdfs_path} #{param_args}"
62
- end
63
-
64
- def get_file_from_hdfs(hdfs_path, options)
65
- output_file_path = options[:output_file] || generate_output_path
66
- hdfs_output_path = "output/#{hdfs_path}"
67
- run_command "mandy-get #{get_hdfs_output(hdfs_output_path)} #{output_file_path}"
68
- run_command "mandy-rm input/#{hdfs_path}"
69
- run_command "mandy-rm output/#{hdfs_path}"
70
- output_file_path
71
- end
72
-
73
- def run_command(command)
74
- command = "#{command} -c #{@@config_path}"
75
- respond_to?(:logger) ? logger.info(command) : p(command)
76
- @output = `#{command}`
77
- end
78
-
79
- def generate_output_path
80
- output_dir = "/tmp/mandy_output"
81
- FileUtils.mkdir_p(output_dir)
82
- file_name = Mandy::Job.jobs.last.name.downcase.gsub(/\W/, '-')
83
- "#{output_dir}/#{file_name}_#{DateTime.now.strftime('%Y%m%d%H%M%S')}"
84
- end
85
-
86
- def get_hdfs_output(hdfs_output_path)
87
- @output.each_line do |line|
88
- return line.chomp.strip if line.include?(hdfs_output_path)
89
- end
90
- end
91
- end
92
- end
93
-
94
- Object.send(:include, Mandy::Wrapper)