mrtoolkit 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/regression.rb ADDED
@@ -0,0 +1,33 @@
1
+
2
class LinearRegression
  # Simple least-squares linear fit: y = slope * x + offset.
  #
  # Accepts either two arrays (dx, dy) of equal length, or a single
  # array of y values, in which case x defaults to 0...size.
  attr_accessor :slope, :offset

  # dx:: x values (or y values when dy is omitted)
  # dy:: y values, same length as dx
  # Raises RuntimeError if the two arrays differ in length.
  def initialize(dx, dy = nil)
    @size = dx.size
    dy, dx = dx, axis() unless dy # make 2D if given 1D
    raise "arguments not same length!" unless @size == dy.size
    # Accumulate in floats: integer sums caused integer division below,
    # truncating slope/offset for all-Integer inputs.
    sxx = sxy = sx = sy = 0.0
    dx.zip(dy).each do |x, y|
      sxy += x * y
      sxx += x * x
      sx += x
      sy += y
    end
    # Standard closed-form least-squares estimators.
    @slope = (@size * sxy - sx * sy) / (@size * sxx - sx * sx)
    @offset = (sy - @slope * sx) / @size
  end

  # Predicted y for each x in dx (defaults to the implicit 0...size axis).
  def fit(dx = nil)
    dx = axis unless dx
    dx.map { |data| predict(data) }
  end

  # Predicted y at a single x.
  def predict(x)
    @slope * x + @offset
  end

  # Implicit x axis [0, 1, ..., size-1] used when only y values are given.
  def axis
    (0...@size).to_a
  end
end
33
+
@@ -0,0 +1,100 @@
1
+
2
+ # StreamRunner
3
+ # This class is responsible for running stream jobs in hadoop.
4
+ #
5
+ # Streaming is a simplified programming model in which map and reduce
6
+ # processes that read from STDIN and write to STDOUT are given.
7
+ # StreamRunner runs *ruby programs* as the map and reduce steps.
8
+ #
9
+ # Additional services provided:
10
+ # * the number of reducers can be specified
11
+ # * extra files to include can be given
12
+ # * input can be one directory or an array of directories
13
+ # * collects the output and copies it to local file in "out" directory
14
+ # * deletes hadoop output directory before starting job -- BE CAREFUL
15
+ #
16
+ # Extra files are distributed to each cluster member, and are stored in the
17
+ # directory with the map or reduce programs. You must include any data files
18
+ # your program reads or library files it requires. You do not have to
19
+ # include the program itself -- this is done automatically.
20
+ #
21
+ # HADOOP_HOME must be set.
22
+ # It might be necessary to change HADOOP_STREAMING_VERSION if the version changes.
23
+
24
# Streaming jar version: override via HADOOP_STREAMING_VERSION, default 0.20.0.
streaming_version = ENV['HADOOP_STREAMING_VERSION'] || "0.20.0"

# Location of the hadoop install and its streaming jar.
HADOOP_HOME = ENV['HADOOP_HOME']
HADOOP_STREAMING = "#{HADOOP_HOME}/contrib/streaming/hadoop-#{streaming_version}-streaming.jar"
29
+
30
class StreamRunner
  # Resolve +file+ to a path that exists on disk.
  # Uses the file as given when it exists; otherwise searches every
  # directory in $RUBYLIB plus this file's own directory.
  # Raises RuntimeError if the file cannot be located.
  def expand_path(file)
    return file if File.exist?(file)
    rlib = ENV['RUBYLIB'] || File.dirname(__FILE__)
    raise "Cannot resolve path to #{file} -- no RUBYLIB" unless rlib
    (rlib.split(':') + [File.dirname(__FILE__)]).each do |rp|
      trial = "#{rp}/#{file}"
      # File.exist? -- File.exists? was removed in Ruby 3.2
      return trial if File.exist?(trial)
    end
    raise "Cannot resolve path to #{file}. Is it in RUBYLIB?"
  end

  # Resolve each entry of +extra+ with expand_path.
  # (Dropped an unused local accumulator from the original.)
  def expand_paths(extra)
    extra.collect { |e| expand_path(e) }
  end

  # Build and run the "hadoop jar .../streaming.jar ..." command line.
  # input:: one input directory or an array of them
  # out:: hadoop output directory
  # mapper/reducer:: ruby program command lines; reducer may be nil for a
  #   map-only job
  # reducers:: number of reduce tasks
  # extra:: extra files to ship to the cluster (-file)
  # map_opts/reduce_opts:: hashes turned into -jobconf name=value pairs
  # opts:: :verbose and :hadoop_opts supported
  def run_hadoop_stream(input, out, mapper, reducer, reducers, extra,
      map_opts, reduce_opts, opts)
    extras = ''
    # Always ship the map/reduce programs themselves.
    extra << mapper.split(' ')[0]
    # Guard: reducer may be nil (map-only job); the original crashed here,
    # making the map-only branch below unreachable.
    extra << reducer.split(' ')[0] if reducer
    expand_paths(extra.uniq).each {|e| extras += "-file #{e} "}
    map_opt = ''
    map_opts.each {|n, v| map_opt += "-jobconf #{n}=#{v} "}
    reduce_opt = ''
    reduce_opts.each {|n, v| reduce_opt += "-jobconf #{n}=#{v} "}
    if input.class == Array
      input = input.collect {|i| "-input #{i}"}.join(" ")
    else
      input = "-input #{input}"
    end

    if reducer.nil?
      # Map-only job: zero reduce tasks, output discarded.
      cmd = "hadoop jar #{HADOOP_STREAMING} " +
        "#{input} " +
        "-output NONE " +
        "-mapper \"ruby #{mapper}\" " +  # trailing space added: was fused with -jobconf
        "-jobconf mapred.reduce.tasks=0 " +
        map_opt +
        "#{extras}"
    else
      cmd = "hadoop jar #{HADOOP_STREAMING} " +
        "#{input} " +
        "-output #{out} " +
        "-mapper \"ruby #{mapper}\" " +
        map_opt +
        "-reducer \"ruby #{reducer}\" " +
        "-jobconf mapred.reduce.tasks=#{reducers} " +
        reduce_opt +
        "#{extras}"
    end
    cmd += " -verbose " if opts.has_key?(:verbose)
    cmd += " #{opts[:hadoop_opts]}" if opts.has_key?(:hadoop_opts)
    puts cmd if opts.has_key?(:verbose)
    system(cmd)
  end

  # Run a full map/reduce job, then copy each reducer's output from
  # hadoop into the local out/<out> directory.
  # BE CAREFUL: deletes the hadoop output directory before starting.
  def run_map_reduce(input, out, map, reduce, reducers, extra,
      map_opts = {}, reduce_opts = {}, opts = {})
    system("hadoop fs -rmr #{out}")
    system("rm -rf out/#{out}")
    system("mkdir -p out/#{out}")
    run_hadoop_stream(input, out, map, reduce, reducers, extra,
      map_opts, reduce_opts, opts)
    (0..reducers-1).each do |i|
      n = sprintf("%05d", i)
      system("hadoop fs -cat #{out}/part-#{n} >out/#{out}/part-#{n}")
    end
  end
end
100
+
data/mrtoolkit.gemspec ADDED
@@ -0,0 +1,79 @@
1
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# Gem specification for mrtoolkit: helpers for writing Hadoop streaming
# map/reduce jobs in Ruby.  Build with: gem build mrtoolkit.gemspec
Gem::Specification.new do |s|
  s.name = %q{mrtoolkit}
  s.version = "0.1.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["cchayden", "vadimj", "jashmenn"]
  s.date = %q{2010-05-17}
  s.email = %q{nate@natemurray.com}
  s.extra_rdoc_files = [
    "README.rdoc"
  ]
  # Every file shipped in the gem (library, examples, tests, standalone
  # hadoop simulator).
  s.files = [
    ".document",
    ".gitignore",
    "Makefile",
    "README.rdoc",
    "Rakefile",
    "VERSION.yml",
    "examples/Rakefile",
    "examples/Readme",
    "examples/hour.rb",
    "examples/import-logs",
    "examples/import.rb",
    "examples/ip-result.rb",
    "examples/ip-size.rb",
    "examples/ip-ua.rb",
    "examples/ip.rb",
    "examples/section.rb",
    "examples/top-file.rb",
    "lib/mrtoolkit.rb",
    "lib/regression.rb",
    "lib/stream_runner.rb",
    "mrtoolkit.gemspec",
    "standalone/hadoop",
    "test/Rakefile",
    "test/test-in/test1-in",
    "test/test-in/test2-in",
    "test/test-in/test3-in",
    "test/test-in/test4-in",
    "test/test-in/test5-in",
    "test/test-in/test6-in",
    "test/test-in/test7-in",
    "test/test-in/test8-in",
    "test/test-in/test9-in",
    "test/utest.rb"
  ]
  s.homepage = %q{http://github.com/jashmenn/mrtoolkit}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{Simplify the creation of Hadoop Map/Reduce jobs}
  s.test_files = [
    "test/utest.rb",
    "examples/hour.rb",
    "examples/import.rb",
    "examples/ip-result.rb",
    "examples/ip-size.rb",
    "examples/ip-ua.rb",
    "examples/ip.rb",
    "examples/section.rb",
    "examples/top-file.rb"
  ]

  # Dependency section emitted by jeweler.  The branches are empty because
  # the gem declares no runtime or development dependencies.
  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
79
+
data/standalone/hadoop ADDED
@@ -0,0 +1,104 @@
1
#!/usr/bin/ruby

# Simulate hadoop well enough to do local testing.
#
# Supports a subset of "hadoop fs" subcommands (mapped onto plain
# filesystem commands) and "hadoop jar" streaming jobs, which are run as
# a local mapper | sort | reducer pipeline.
#
# NOTE: the original used Ruby 1.8-only "when x:" colon syntax, which is
# a syntax error on Ruby 1.9+; rewritten without the colons.

case ARGV[0]
#
# Do hadoop fs commands
when 'fs'
  ARGV.shift
  case ARGV[0]
  when '-rmr'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("rm -rf #{dest}")
  when '-rm'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("rm -f #{dest}")
  when '-mkdir'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("mkdir -p #{dest}")
  # Both commands copy a local file into place.  In the original,
  # '-copyFromLocal' was an empty branch and silently did nothing.
  when '-copyFromLocal', '-put'
    src = ARGV[1]
    dest = ARGV[2]
    if src.nil? || dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("cp #{src} #{dest}")
  when '-cat'
    src = ARGV[1]
    if src.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    # File.exist? -- File.exists? was removed in Ruby 3.2
    if File.exist?(src)
      system("cat #{src}")
    end
  when '-ls'
    dest = ARGV[1]
    # NOTE(review): HADOOP_DIR is not defined anywhere in this file --
    # presumably supplied by whatever loads this script; confirm, else
    # this branch raises NameError.
    if dest.nil?
      system("cd #{HADOOP_DIR}; ls -l '--time-style=+%Y-%m-%d %H:%M'")
    else
      system("cd #{HADOOP_DIR};ls -l '--time-style=+%Y-%m-%d %H:%M' #{dest}")
    end
  when '-lsr'
    dest = ARGV[1]
    if dest == '/user'
      # special test data
      system("cat sample-ls-data")
    else
      system("ls -lR '--time-style=+%Y-%m-%d %H:%M' #{dest}")
    end
  else
    STDERR.puts "command not recognized #{ARGV[0]}"
    exit 1
  end
#
# Do hadoop jar commands (run map reduce)
when 'jar'
  ARGV.shift
  jar = ARGV[0]
  ARGV.shift
  file = []
  reducers = 1
  while ARGV.size > 0
    case ARGV[0]
    when '-input'
      indir = ARGV[1]; 2.times {ARGV.shift}
    when '-output'
      outdir = ARGV[1]; 2.times {ARGV.shift}
    when '-mapper'
      mapper = ARGV[1]; 2.times {ARGV.shift}
    when '-reducer'
      reducer = ARGV[1]; 2.times {ARGV.shift}
    when '-jobconf'
      jobconf = ARGV[1]; 2.times {ARGV.shift}
      if jobconf =~ /mapred.reduce.tasks=([\d]*)/
        reducers = $1.to_i
      end
    when '-file'
      file << ARGV[1]; 2.times {ARGV.shift}
    else
      # Skip unrecognized arguments; the original looped forever here
      # because nothing was ever shifted.
      ARGV.shift
    end
  end
  system("hadoop fs -mkdir #{outdir}")
  # Simulate hadoop by running mapper, then sort, then run reducer.
  cmd = "ls #{indir}|while read line;do #{mapper} < #{indir}/$line;done|LC_ALL=C sort|#{reducer} >#{outdir}/part-00000"
  #puts "running: #{cmd}"
  system cmd
else
  STDERR.puts "*** hadoop #{ARGV.join(' ')}"
  exit 1
end
data/test/Rakefile ADDED
@@ -0,0 +1,21 @@
1
# Make sure mrtoolkit's ../lib directory is part of $RUBYLIB.
rubylib = ENV['RUBYLIB']
ENV['RUBYLIB'] =
  if rubylib
    (rubylib.split(':') << '../lib').uniq.join(':')
  else
    '../lib'
  end

# Mirror every $RUBYLIB entry onto Ruby's load path.
ENV['RUBYLIB'].split(':').each do |dir|
  $:.concat([dir]) unless $:.include?(dir)
end

######################################################################
desc "clean up"
task "clean" do
  %w{test-out}.each { |target| system "rm -rf #{target}" }
end
######################################################################
desc "unit tests"
task :test do
  system "ruby utest.rb"
end
21
+
@@ -0,0 +1,2 @@
1
+ 2008-10-02 11:30:00 1.2.3.5
2
+ 2008-10-01 10:30:00 1.2.3.4
@@ -0,0 +1,4 @@
1
+ 1
2
+ 2
3
+ 10
4
+ 30
@@ -0,0 +1,5 @@
1
+ 2008-10-31 21:01:00
2
+ 2008-10-31 21:00:00
3
+ 2008-10-31 21:10:00
4
+ 2008-10-31 21:10:00
5
+ 2008-10-31 21:10:00
@@ -0,0 +1,6 @@
1
+ 1 1
2
+ 2 2
3
+ 3 3
4
+ 4 10
5
+ 5 1
6
+ 6 2
@@ -0,0 +1,12 @@
1
+ 100 1
2
+ 100 1
3
+ 100 1
4
+ 101 1
5
+ 101 1
6
+ 102 1
7
+ 102 1
8
+ 102 1
9
+ 102 1
10
+ 103 1
11
+ 104 1
12
+ 104 1
@@ -0,0 +1,3 @@
1
+ 1 1 1
2
+ 1 3 4
3
+ 10 5 3
@@ -0,0 +1,20 @@
1
+ 1
2
+ 2
3
+ 3
4
+ 4
5
+ 5
6
+ 6
7
+ 7
8
+ 8
9
+ 9
10
+ 10
11
+ 11
12
+ 12
13
+ 13
14
+ 14
15
+ 15
16
+ 16
17
+ 17
18
+ 18
19
+ 19
20
+ 20
@@ -0,0 +1,12 @@
1
+ 100 1000 1
2
+ 100 1001 1
3
+ 100 1000 1
4
+ 200 1000 1
5
+ 200 1001 1
6
+ 200 1000 1
7
+ 100 1000 1
8
+
9
+
10
+
11
+
12
+
@@ -0,0 +1,6 @@
1
+ 1000 a a a
2
+ 1000 b b b
3
+ 1000 c c c
4
+ 1001 x1 y1 z1
5
+ 1001 x2 y2 z2
6
+ 1000 d1 d2 d3