mrtoolkit 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/Makefile +6 -0
- data/README.rdoc +19 -0
- data/Rakefile +57 -0
- data/VERSION.yml +4 -0
- data/examples/Rakefile +80 -0
- data/examples/Readme +12 -0
- data/examples/hour.rb +57 -0
- data/examples/import-logs +14 -0
- data/examples/import.rb +22 -0
- data/examples/ip-result.rb +33 -0
- data/examples/ip-size.rb +33 -0
- data/examples/ip-ua.rb +36 -0
- data/examples/ip.rb +10 -0
- data/examples/section.rb +37 -0
- data/examples/top-file.rb +36 -0
- data/lib/mrtoolkit.rb +908 -0
- data/lib/regression.rb +33 -0
- data/lib/stream_runner.rb +100 -0
- data/mrtoolkit.gemspec +79 -0
- data/standalone/hadoop +104 -0
- data/test/Rakefile +21 -0
- data/test/test-in/test1-in +2 -0
- data/test/test-in/test2-in +4 -0
- data/test/test-in/test3-in +5 -0
- data/test/test-in/test4-in +6 -0
- data/test/test-in/test5-in +12 -0
- data/test/test-in/test6-in +3 -0
- data/test/test-in/test7-in +20 -0
- data/test/test-in/test8-in +12 -0
- data/test/test-in/test9-in +6 -0
- data/test/utest.rb +471 -0
- metadata +104 -0
data/.document
ADDED
data/.gitignore
ADDED
data/Makefile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= mrtoolkit
|
2
|
+
|
3
|
+
MRToolkit provides a framework for building simple Map/Reduce jobs in just a few lines of code. You provide only the map and reduce logic, the framework does the rest. Or use one of the provided map or reduce tools, and write even less.
|
4
|
+
|
5
|
+
Map and reduce jobs are written in Ruby. MRToolkit was inspired by Google's Sawzall.
|
6
|
+
|
7
|
+
== Acknowledgements
|
8
|
+
|
9
|
+
MRToolkit was inspired by Google's Sawzall. We wanted to make it even easier by making use of an existing language, rather than inventing a new one. Ruby was a perfect fit.
|
10
|
+
|
11
|
+
The initial development of this software was supported by the New York Times, with the support and encouragement of Vadim Jelezniakov and Ranjit Prabhu.
|
12
|
+
|
13
|
+
== This github repo
|
14
|
+
|
15
|
+
This github repo is a mirror + patches to the mrtoolkit that is hosted on code.google.com: http://code.google.com/p/mrtoolkit/wiki/Introduction
|
16
|
+
|
17
|
+
This repo adds, among other things, the ability to install mrtoolkit as a gem:
|
18
|
+
|
19
|
+
gem install jashmenn-mrtoolkit --source=http://gems.github.com
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
require 'rubygems'
require 'rake'

# Gem packaging via jeweler, when it is installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name = "mrtoolkit"
    spec.summary = %Q{Simplify the creation of Hadoop Map/Reduce jobs}
    spec.email = "nate@natemurray.com"
    spec.homepage = "http://github.com/jashmenn/mrtoolkit"
    spec.authors = ["cchayden", "vadimj", "jashmenn"]

    # spec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# Unit tests under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage via rcov, when it is installed; otherwise a stub task
# that explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# RDoc generation, titled with the version from VERSION.yml when present.
require 'yaml'
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version =
    if File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mrtoolkit #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION.yml
ADDED
data/examples/Rakefile
ADDED
@@ -0,0 +1,80 @@
require 'pp'

# Make ../lib visible to child ruby processes (via $RUBYLIB) and to
# this process's own load path.
lib_dirs = (ENV['RUBYLIB'] || '').split(':')
ENV['RUBYLIB'] = lib_dirs.concat(['../lib']).uniq.join(':')
ENV['RUBYLIB'].split(':').each { |dir| $:.concat([dir]) unless $:.include?(dir) }

require 'mrtoolkit'
require 'time'

# Glob matching the part files of a job's HDFS output directory.
def outfiles(base)
  "out/#{base}/part-*"
end

######################################################################
desc "clean up"
task "clean" do
  targets = %w{hour hours ip ips section sections top-file top-files ip-size ip-sizes ip-result ip-results ip-uas ip-ua out }
  targets.each { |target| system "rm -rf #{target}" }
end

######################################################################
desc "import data to hdfs"
task "import" do
  system "./import-logs"
end

######################################################################
desc "traffic by IP address"
file "ips" do
  system "ruby ip.rb"
  system "cat #{outfiles('ip')}|sort -nr -k2 >ips"
end

######################################################################
desc "returned result size by IP address"
file "ip-sizes" do
  system "ruby ip-size.rb"
  system "cat #{outfiles('ip-size')}|sort -nr -k2 >ip-sizes"
end

######################################################################
desc "show all combinations of ip and user agent"
file "ip-uas" do
  system "ruby ip-ua.rb"
  system "cat #{outfiles('ip-ua')}|cut -f 2-4|sort -k2 -k3 >ip-uas"
end

######################################################################
desc "show all combinations of ip and result"
file "ip-results" do
  system "ruby ip-result.rb"
  system "cat #{outfiles('ip-result')}|sort -k1 -k2 >ip-results"
end

######################################################################
desc "traffic by hour"
file "hours" do
  system "ruby hour.rb"
  system "cat #{outfiles('hour')}|sort -n >hours"
end

######################################################################
desc "traffic by section"
file "sections" do
  system "ruby section.rb"
  system "cat #{outfiles('section')}|sort -nr -k2 >sections"
end

######################################################################
desc "top 10 files"
file "top-files" do
  system "ruby top-file.rb"
  system "cat #{outfiles('top-file')} >top-files"
end
data/examples/Readme
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
This contains:
|
2
|
+
logs apache log files
|
3
|
+
import-logs imports log files to hdfs (uses hadoop fs)
|
4
|
+
import.rb does the actual parsing
|
5
|
+
|
6
|
+
It also contains some mrtoolkit programs that make use of the
|
7
|
+
imported log files:
|
8
|
+
ip.rb summarizes the usage by IP address
|
9
|
+
hour.rb counts hits per hour over multiple days
|
10
|
+
section.rb counts hits by section (top level directory)
|
11
|
+
top-file.rb shows hits for top 10 files
|
12
|
+
|
data/examples/hour.rb
ADDED
@@ -0,0 +1,57 @@
require 'mrtoolkit'

# Tallies hits per hour of the day (0-23) across all input records,
# emitting one (hour, count) row per hour when the map phase ends.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :hour
    emit :count
  end

  # Zero the per-hour counters before any record is seen.
  def process_begin(input, output)
    @hours = Array.new(24, 0)
    nil
  end

  # Accumulate into @hours; emits nothing per record.  The hour is the
  # second ':'-separated component of the log timestamp.
  def process(input, output)
    stamp = input.dt_tm
    return nil if stamp.nil?
    parts = stamp.split(':')
    return nil if parts.nil? || parts.size < 2 || parts[1].nil?
    hour = parts[1].to_i
    if hour >= 0 && hour < 24
      @hours[hour] += 1
    else
      STDERR.puts "bad hour: #{hour}"
    end
    nil
  end

  # Emit one output record per hour with its accumulated count.
  def process_end(input, output)
    rows = []
    @hours.each_with_index do |count, hour|
      row = new_output
      row.hour = hour
      row.count = count
      rows << row
    end
    rows
  end
end

# The reducer sums counts for identical hours across map tasks.
class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "hour"
  end
end
@@ -0,0 +1,14 @@
#! /bin/bash

# Creates log files in HDFS.
# All files in sample-data/raw-logs are processed and
# stored with the proper delimiters for use with streaming.

hadoop fs -rmr logs
hadoop fs -mkdir logs
for src in ../sample-data/raw-logs/*
do
  # Fix: quote expansions so paths containing whitespace do not
  # word-split or glob.
  ruby import.rb <"$src" >/tmp/import
  name=`basename "$src"`
  hadoop fs -put /tmp/import "logs/$name"
done
data/examples/import.rb
ADDED
@@ -0,0 +1,22 @@
# Import combined log format
#
# Parses each line of an Apache combined-format log, removing the
# delimiters and re-emitting the nine fields tab-separated (the
# unambiguous field separator used by the streaming jobs).
class ImportLogFile

  # Returns the nine captured fields of one log line as an array,
  # or nil when the line does not match the combined log format.
  def parse(line)
    patt = /(\S*)\s*(\S*)\s*(\S*)\s*\[([^\]]*)\]\s*"([^"]*)"\s*(\S*)\s*(\S*)\s*"([^"]*)"\s*"([^"]*)"/
    md = patt.match(line)
    md && md[1, 9]
  end

  # Reads every line from fp, printing the tab-joined fields of each
  # line that parses; unparseable lines are silently skipped.
  def parse_all(fp)
    fp.each_line do |line|
      fields = parse(line)
      puts fields.join("\t") if fields
    end
  end
end

ImportLogFile.new.parse_all(STDIN)
@@ -0,0 +1,33 @@
require 'mrtoolkit'

# Emits one (ip, result) pair per log record; the reducer counts the
# occurrences of each status code per IP address.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip
    emit :result
  end

  # Pass the IP and HTTP status straight through.
  def process(input, output)
    output.ip = input.ip
    output.result = input.status
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueIndexedCountReduce
    indir "logs"
    outdir "ip-result"
  end
end
data/examples/ip-size.rb
ADDED
@@ -0,0 +1,33 @@
require 'mrtoolkit'

# Emits one (ip, size) pair per log record; the reducer sums the
# returned result sizes per IP address.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip
    emit :size
  end

  # Pass the IP and result size straight through.
  def process(input, output)
    output.ip = input.ip
    output.size = input.result_size
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "ip-size"
  end
end
data/examples/ip-ua.rb
ADDED
@@ -0,0 +1,36 @@
require 'mrtoolkit'

# Emits the combination of IP and user-agent (first token only) for
# each log record, plus the two components separately; the reducer
# counts unique values over the two key columns.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip_ua
    emit :ip
    emit :ua
  end

  # Key on "ip|agent" where agent is the first whitespace-delimited
  # token of the user-agent string.
  def process(input, output)
    agent = input.ua.split(/\s/)[0]
    output.ip_ua = "#{input.ip}|#{agent}"
    output.ip = input.ip
    output.ua = agent
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueCountReduce, 2
    indir "logs"
    outdir "ip-ua"
  end
end
data/examples/ip.rb
ADDED
data/examples/section.rb
ADDED
@@ -0,0 +1,37 @@
require 'mrtoolkit'

# Counts hits per site section, where a section is the first path
# component of the request (the top-level directory).
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :section
    emit :count
  end

  # Emit (section, 1) when the request contains a /section/ component;
  # otherwise emit nothing for this record.
  def process(input, output)
    return nil unless input.request =~ /\/(\w+)\//
    output.section = $1
    output.count = 1
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "section"
  end
end
@@ -0,0 +1,36 @@
require 'mrtoolkit'

# Counts hits per requested path and keeps only the top entries
# (MaxUniqueSumReduce with a limit of 10).
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :path
    emit :count
  end

  # Emit (path, 1) for GET requests; other records emit nothing.
  def process(input, output)
    return nil unless input.request =~ /GET\s+(\S+)\s/
    output.path = $1
    output.count = 1
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer MaxUniqueSumReduce, 10
    indir "logs"
    outdir "top-file"
  end
end