mrtoolkit 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/Makefile +6 -0
- data/README.rdoc +19 -0
- data/Rakefile +57 -0
- data/VERSION.yml +4 -0
- data/examples/Rakefile +80 -0
- data/examples/Readme +12 -0
- data/examples/hour.rb +57 -0
- data/examples/import-logs +14 -0
- data/examples/import.rb +22 -0
- data/examples/ip-result.rb +33 -0
- data/examples/ip-size.rb +33 -0
- data/examples/ip-ua.rb +36 -0
- data/examples/ip.rb +10 -0
- data/examples/section.rb +37 -0
- data/examples/top-file.rb +36 -0
- data/lib/mrtoolkit.rb +908 -0
- data/lib/regression.rb +33 -0
- data/lib/stream_runner.rb +100 -0
- data/mrtoolkit.gemspec +79 -0
- data/standalone/hadoop +104 -0
- data/test/Rakefile +21 -0
- data/test/test-in/test1-in +2 -0
- data/test/test-in/test2-in +4 -0
- data/test/test-in/test3-in +5 -0
- data/test/test-in/test4-in +6 -0
- data/test/test-in/test5-in +12 -0
- data/test/test-in/test6-in +3 -0
- data/test/test-in/test7-in +20 -0
- data/test/test-in/test8-in +12 -0
- data/test/test-in/test9-in +6 -0
- data/test/utest.rb +471 -0
- metadata +104 -0
data/lib/regression.rb
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
|
2
|
+
# Simple least-squares linear regression.
#
# Fits y = slope * x + offset to the given data points.
# If only one array is given, it is treated as the y values and the
# x values default to 0, 1, 2, ... (see #axis).
class LinearRegression
  attr_accessor :slope, :offset

  # dx - array of x values (or the y values when dy is nil)
  # dy - array of y values, same length as dx (optional)
  # Raises a RuntimeError if the two arrays differ in length.
  def initialize(dx, dy = nil)
    @size = dx.size
    dy, dx = dx, axis unless dy # make 2D if given 1D
    raise "arguments not same length!" unless @size == dy.size

    # Float accumulators: integer inputs previously caused integer
    # division when computing slope/offset, truncating the result.
    sxx = sxy = sx = sy = 0.0
    dx.zip(dy).each do |x, y|
      sxy += x * y
      sxx += x * x
      sx += x
      sy += y
    end
    @slope = (@size * sxy - sx * sy) / (@size * sxx - sx * sx)
    @offset = (sy - @slope * sx) / @size
  end

  # Returns the fitted y value for each x in dx (defaults to #axis).
  def fit(dx = nil)
    dx = axis unless dx
    dx.map { |data| predict(data) }
  end

  # Predicted y for a single x.
  def predict(x)
    @slope * x + @offset
  end

  # Default x axis: 0, 1, ..., size-1.
  def axis
    (0...@size).to_a
  end
end
|
33
|
+
|
@@ -0,0 +1,100 @@
|
|
1
|
+
|
2
|
+
# StreamRunner
|
3
|
+
# This class is responsible for running stream jobs in hadoop.
|
4
|
+
#
|
5
|
+
# Streaming is a simplified programming model in which map and reduce
|
6
|
+
# processes that read from STDIN and write to STDOUT are given.
|
7
|
+
# StreamRunner runs *ruby programs* as the map and reduce steps.
|
8
|
+
#
|
9
|
+
# Additional services provided:
|
10
|
+
# * the number of reducers can be specified
|
11
|
+
# * extra files to include can be given
|
12
|
+
# * input can be one directory or an array of directories
|
13
|
+
# * collects the output and copies it to local file in "out" directory
|
14
|
+
# * deletes hadoop output directory before starting job -- BE CAREFUL
|
15
|
+
#
|
16
|
+
# Extra files are distributed to each cluster member, and are stored in the
|
17
|
+
# directory with the map or reduce programs. You must include any data files
|
18
|
+
# your program reads or library files it requires. You do not have to
|
19
|
+
# include the program itself -- this is done automatically.
|
20
|
+
#
|
21
|
+
# HADOOP_HOME must be set.
|
22
|
+
# It might be necessary to change HADOOP_STREAMING_VERSION if the version changes.
|
23
|
+
|
24
|
+
# Locate the hadoop streaming jar under HADOOP_HOME.
# The version portion of the jar name can be overridden via the
# HADOOP_STREAMING_VERSION environment variable.
streaming_version = ENV['HADOOP_STREAMING_VERSION'] || "0.20.0"

HADOOP_HOME = ENV['HADOOP_HOME']
HADOOP_STREAMING = "#{HADOOP_HOME}/contrib/streaming/hadoop-#{streaming_version}-streaming.jar"
|
29
|
+
|
30
|
+
# Runs ruby map/reduce programs as hadoop streaming jobs.
# Handles shipping extra files to the cluster, multiple input
# directories, reducer counts, and collecting output locally.
class StreamRunner
  # Resolve +file+ to an existing path.
  # Uses the file as-is when it exists; otherwise searches every
  # directory on RUBYLIB plus this file's directory.
  # Raises a RuntimeError when the file cannot be located.
  def expand_path(file)
    return file if File.exist?(file)
    rlib = ENV['RUBYLIB'] || File.dirname(__FILE__)
    raise "Cannot resolve path to #{file} -- no RUBYLIB" unless rlib
    (rlib.split(':') + [File.dirname(__FILE__)]).each do |rp|
      trial = "#{rp}/#{file}"
      # File.exists? was removed in Ruby 3.2; use File.exist?
      return trial if File.exist?(trial)
    end
    raise "Cannot resolve path to #{file}. Is it in RUBYLIB?"
  end

  # Resolve each entry of +extra+ with #expand_path.
  def expand_paths(extra)
    extra.collect { |e| expand_path(e) }
  end

  # Build and run the "hadoop jar .../streaming" command line.
  # input    - input directory, or an array of input directories
  # out      - hadoop output directory
  # mapper   - map command ("prog.rb [args]"); run with ruby
  # reducer  - reduce command, or nil for a map-only job
  # reducers - number of reduce tasks
  # extra    - extra files to ship with the job (no longer mutated)
  # map_opts/reduce_opts - hashes of extra -jobconf name=value settings
  # opts     - may contain :verbose and/or :hadoop_opts
  def run_hadoop_stream(input, out, mapper, reducer, reducers, extra,
      map_opts, reduce_opts, opts)
    # Ship the map/reduce programs along with the user's extra files.
    # Work on a copy so the caller's array is not modified.
    ship = extra + [mapper.split(' ')[0]]
    # Guard: reducer may legitimately be nil (map-only job); the original
    # called reducer.split here unconditionally and raised NoMethodError.
    ship << reducer.split(' ')[0] if reducer
    extras = ''
    expand_paths(ship.uniq).each { |e| extras += "-file #{e} " }
    map_opt = ''
    map_opts.each { |n, v| map_opt += "-jobconf #{n}=#{v} " }
    reduce_opt = ''
    reduce_opts.each { |n, v| reduce_opt += "-jobconf #{n}=#{v} " }
    input = if input.is_a?(Array)
      input.collect { |i| "-input #{i}" }.join(" ")
    else
      "-input #{input}"
    end

    if reducer.nil?
      # Map-only job: zero reduce tasks, output discarded.
      cmd = "hadoop jar #{HADOOP_STREAMING} " +
            "#{input} " +
            "-output NONE " +
            "-mapper \"ruby #{mapper}\" " + # trailing space was missing here
            "-jobconf mapred.reduce.tasks=0 " +
            map_opt +
            "#{extras}"
    else
      cmd = "hadoop jar #{HADOOP_STREAMING} " +
            "#{input} " +
            "-output #{out} " +
            "-mapper \"ruby #{mapper}\" " +
            map_opt +
            "-reducer \"ruby #{reducer}\" " +
            "-jobconf mapred.reduce.tasks=#{reducers} " +
            reduce_opt +
            "#{extras}"
    end
    cmd += " -verbose " if opts.has_key?(:verbose)
    cmd += " #{opts[:hadoop_opts]}" if opts.has_key?(:hadoop_opts)
    puts cmd if opts.has_key?(:verbose)
    system(cmd)
  end

  # Run a complete job: clear the output locations, run the streaming
  # job, then copy each reducer's part file into the local out/<out>
  # directory. BE CAREFUL: deletes the hadoop output directory first.
  def run_map_reduce(input, out, map, reduce, reducers, extra,
      map_opts = {}, reduce_opts = {}, opts = {})
    system("hadoop fs -rmr #{out}")
    system("rm -rf out/#{out}")
    system("mkdir -p out/#{out}")
    run_hadoop_stream(input, out, map, reduce, reducers, extra,
      map_opts, reduce_opts, opts)
    (0..reducers - 1).each do |i|
      n = sprintf("%05d", i)
      system("hadoop fs -cat #{out}/part-#{n} >out/#{out}/part-#{n}")
    end
  end
end
|
100
|
+
|
data/mrtoolkit.gemspec
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
# -*- encoding: utf-8 -*-

# NOTE(review): the jeweler-generated dependency branches at the bottom of
# this file were empty no-ops and read the deprecated Gem::RubyGemsVersion
# constant; they have been removed. The spec is assigned to a local so the
# file still evaluates to the specification object.
spec = Gem::Specification.new do |s|
  s.name = %q{mrtoolkit}
  s.version = "0.1.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["cchayden", "vadimj", "jashmenn"]
  s.date = %q{2010-05-17}
  s.email = %q{nate@natemurray.com}
  s.extra_rdoc_files = [
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "Makefile",
    "README.rdoc",
    "Rakefile",
    "VERSION.yml",
    "examples/Rakefile",
    "examples/Readme",
    "examples/hour.rb",
    "examples/import-logs",
    "examples/import.rb",
    "examples/ip-result.rb",
    "examples/ip-size.rb",
    "examples/ip-ua.rb",
    "examples/ip.rb",
    "examples/section.rb",
    "examples/top-file.rb",
    "lib/mrtoolkit.rb",
    "lib/regression.rb",
    "lib/stream_runner.rb",
    "mrtoolkit.gemspec",
    "standalone/hadoop",
    "test/Rakefile",
    "test/test-in/test1-in",
    "test/test-in/test2-in",
    "test/test-in/test3-in",
    "test/test-in/test4-in",
    "test/test-in/test5-in",
    "test/test-in/test6-in",
    "test/test-in/test7-in",
    "test/test-in/test8-in",
    "test/test-in/test9-in",
    "test/utest.rb"
  ]
  s.homepage = %q{http://github.com/jashmenn/mrtoolkit}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.6}
  s.summary = %q{Simplify the creation of Hadoop Map/Reduce jobs}
  s.test_files = [
    "test/utest.rb",
    "examples/hour.rb",
    "examples/import.rb",
    "examples/ip-result.rb",
    "examples/ip-size.rb",
    "examples/ip-ua.rb",
    "examples/ip.rb",
    "examples/section.rb",
    "examples/top-file.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version
end
|
79
|
+
|
data/standalone/hadoop
ADDED
@@ -0,0 +1,104 @@
|
|
1
|
+
#!/usr/bin/ruby

# Simulate hadoop well enough to do local testing.
#
# NOTE(review): the original used Ruby 1.8 "when 'x':" colon syntax, which
# is a parse error on Ruby 1.9+; all branches now use plain "when".

# Simulated HDFS root used by "fs -ls".
# NOTE(review): the original referenced an undefined HADOOP_DIR constant
# (NameError at runtime); default to the current directory, overridable
# through the HADOOP_DIR environment variable -- confirm intended root.
HADOOP_DIR = ENV['HADOOP_DIR'] || '.'

case ARGV[0]
#
# Do hadoop fs commands
when 'fs'
  ARGV.shift
  case ARGV[0]
  when '-rmr'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("rm -rf #{dest}")
  when '-rm'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("rm -f #{dest}")
  when '-mkdir'
    dest = ARGV[1]
    if dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("mkdir -p #{dest}")
  when '-copyFromLocal', '-put'
    # NOTE(review): -copyFromLocal was an empty branch in the original
    # (Ruby "when" does not fall through), so it silently did nothing;
    # it now shares -put's copy behavior.
    src = ARGV[1]
    dest = ARGV[2]
    if src.nil? || dest.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    system("cp #{src} #{dest}")
  when '-cat'
    src = ARGV[1]
    if src.nil?
      STDERR.puts "missing argument"
      exit 1
    end
    # File.exists? was removed in Ruby 3.2; use File.exist?
    if File.exist?(src)
      system("cat #{src}")
    end
  when '-ls'
    dest = ARGV[1]
    if dest.nil?
      system("cd #{HADOOP_DIR}; ls -l '--time-style=+%Y-%m-%d %H:%M'")
    else
      system("cd #{HADOOP_DIR};ls -l '--time-style=+%Y-%m-%d %H:%M' #{dest}")
    end
  when '-lsr'
    dest = ARGV[1]
    if dest == '/user'
      # special test data
      system("cat sample-ls-data")
    else
      system("ls -lR '--time-style=+%Y-%m-%d %H:%M' #{dest}")
    end
  else
    STDERR.puts "command not recognized #{ARGV[0]}"
    exit 1
  end
#
# Do hadoop jar commands (run map reduce)
when 'jar'
  ARGV.shift
  jar = ARGV[0]
  ARGV.shift
  file = []
  reducers = 1
  while ARGV.size > 0
    case ARGV[0]
    when '-input'
      indir = ARGV[1]; 2.times { ARGV.shift }
    when '-output'
      outdir = ARGV[1]; 2.times { ARGV.shift }
    when '-mapper'
      mapper = ARGV[1]; 2.times { ARGV.shift }
    when '-reducer'
      reducer = ARGV[1]; 2.times { ARGV.shift }
    when '-jobconf'
      jobconf = ARGV[1]; 2.times { ARGV.shift }
      if jobconf =~ /mapred.reduce.tasks=([\d]*)/
        reducers = $1.to_i
      end
    when '-file'
      file << ARGV[1]; 2.times { ARGV.shift }
    else
      # Skip unrecognized tokens (e.g. -verbose); the original looped
      # forever here because nothing was shifted off ARGV.
      ARGV.shift
    end
  end
  system("hadoop fs -mkdir #{outdir}")
  # Simulate hadoop by running mapper, then sort, then run reducer.
  cmd = "ls #{indir}|while read line;do #{mapper} < #{indir}/$line;done|LC_ALL=C sort|#{reducer} >#{outdir}/part-00000"
  #puts "running: #{cmd}"
  system cmd
else
  STDERR.puts "*** hadoop #{ARGV.join(' ')}"
  exit 1
end
|
data/test/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# Ensure ../lib (mrtoolkit) is on $RUBYLIB so child processes see it.
rubylib = ENV['RUBYLIB']
ENV['RUBYLIB'] =
  if rubylib
    (rubylib.split(':') << '../lib').uniq.join(':')
  else
    '../lib'
  end
# Mirror every $RUBYLIB entry onto this process's load path.
ENV['RUBYLIB'].split(':').each do |dir|
  $: << dir unless $:.include?(dir)
end
|
9
|
+
|
10
|
+
######################################################################
desc "clean up"
task "clean" do
  # Remove generated test output.
  %w{test-out}.each { |dir| system "rm -rf #{dir}" }
end
######################################################################
desc "unit tests"
task :test do
  system "ruby utest.rb"
end
|
21
|
+
|