mrtoolkit 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
6
+ .autosession.vim
data/Makefile ADDED
@@ -0,0 +1,6 @@
1
# Packaging helpers. Both archives are written to the parent directory.
#
# `tar` and `data` name actions, not files, so they are declared phony:
# without this, a file or directory called "tar" or "data" would make the
# target look up to date and its recipe would be silently skipped.
.PHONY: tar data

# Archive the mrtoolkit tree, leaving out .svn metadata and sample-data.
tar:
	tar cfv ../mrtoolkit.tar -C.. --exclude=\.svn --exclude=sample-data mrtoolkit

# Archive only mrtoolkit/sample-data, leaving out .svn metadata.
data:
	tar cfv ../sample-data.tar -C.. --exclude=\.svn mrtoolkit/sample-data
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = mrtoolkit
2
+
3
+ MRToolkit provides a framework for building simple Map/Reduce jobs in just a few lines of code. You provide only the map and reduce logic; the framework does the rest. Or use one of the provided map or reduce tools and write even less.
4
+
5
+ Map and reduce jobs are written in Ruby. MRToolkit was inspired by Google's Sawzall.
6
+
7
+ == Acknowledgements
8
+
9
+ MRToolkit was inspired by Google's Sawzall. We wanted to make it even easier by making use of an existing language, rather than inventing a new one. Ruby was a perfect fit.
10
+
11
+ The initial development of this software was supported by the New York Times, with the support and encouragement of Vadim Jelezniakov and Ranjit Prabhu.
12
+
13
+ == This github repo
14
+
15
+ This GitHub repo is a mirror of, plus patches to, the mrtoolkit project hosted on code.google.com: http://code.google.com/p/mrtoolkit/wiki/Introduction
16
+
17
+ This repo adds, among other things, the ability to install mrtoolkit as a gem:
18
+
19
+ gem install jashmenn-mrtoolkit --source=http://gems.github.com
data/Rakefile ADDED
@@ -0,0 +1,57 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. Jeweler is optional: when it cannot be loaded a hint
# is printed and the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name     = "mrtoolkit"
    spec.summary  = %Q{Simplify the creation of Hadoop Map/Reduce jobs}
    spec.email    = "nate@natemurray.com"
    spec.homepage = "http://github.com/jashmenn/mrtoolkit"
    spec.authors  = ["cchayden", "vadimj", "jashmenn"]

    # spec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# Unit tests: every test/**/*_test.rb file, with lib and test on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage run over the same test files. When rcov cannot be loaded, a stub
# :rcov task is defined that aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end


task :default => :test

# API documentation, titled with the version read from VERSION.yml when
# that file exists (empty version string otherwise).
require 'yaml'
require 'rake/rdoctask'
Rake::RDocTask.new do |doc|
  version =
    if File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      [config[:major], config[:minor], config[:patch]].map { |part| part.to_s }.join('.')
    else
      ""
    end

  doc.rdoc_dir = 'rdoc'
  doc.title = "mrtoolkit #{version}"
  doc.rdoc_files.include('README*')
  doc.rdoc_files.include('lib/**/*.rb')
end
57
+
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :major: 0
3
+ :minor: 1
4
+ :patch: 2
data/examples/Rakefile ADDED
@@ -0,0 +1,80 @@
1
+ require 'pp'
2
# Make sure ../lib is listed (once) in $RUBYLIB, keeping any entries that
# were already there.
lib_dirs = (ENV['RUBYLIB'] || '').split(':')
lib_dirs << '../lib'
ENV['RUBYLIB'] = lib_dirs.uniq.join(':')

# Mirror every $RUBYLIB entry onto this process's load path, skipping
# directories that are already present.
ENV['RUBYLIB'].split(':').each do |dir|
  $LOAD_PATH << dir unless $LOAD_PATH.include?(dir)
end
10
+
11
+ require 'mrtoolkit'
12
+ require 'time'
13
+
14
+
15
# Shell glob (as a String) matching the part-* files under out/<base>.
def outfiles(base)
  format("out/%s/part-*", base)
end
18
+
19
+
20
######################################################################
# The import and report tasks run their commands with rake's `sh` instead
# of `system`. `system` ignores the exit status, so a failed job would still
# go on to create an (empty) target file and rake would report success;
# because these are file tasks, that empty file would then count as built.
# `sh` fails the task as soon as a command exits non-zero.
# "clean" deliberately stays best-effort.
desc "clean up"
task "clean" do
  to_clean = %w{hour hours ip ips section sections top-file top-files ip-size ip-sizes ip-result ip-results ip-uas ip-ua out }
  to_clean.each {|f| system "rm -rf #{f}"}
end

######################################################################
desc "import data to hdfs"
task "import" do
  sh "./import-logs"
end

######################################################################
desc "traffic by IP address"
file "ips" do
  sh "ruby ip.rb"
  sh "cat #{outfiles('ip')}|sort -nr -k2 >ips"
end

######################################################################
desc "returned result size by IP address"
file "ip-sizes" do
  sh "ruby ip-size.rb"
  sh "cat #{outfiles('ip-size')}|sort -nr -k2 >ip-sizes"
end

######################################################################
desc "show all combinations of ip and user agent"
file "ip-uas" do
  sh "ruby ip-ua.rb"
  sh "cat #{outfiles('ip-ua')}|cut -f 2-4|sort -k2 -k3 >ip-uas"
end

######################################################################
desc "show all combinations of ip and result"
file "ip-results" do
  sh "ruby ip-result.rb"
  sh "cat #{outfiles('ip-result')}|sort -k1 -k2 >ip-results"
end

######################################################################
desc "traffic by hour"
file "hours" do
  sh "ruby hour.rb"
  sh "cat #{outfiles('hour')}|sort -n >hours"
end

######################################################################
desc "traffic by section"
file "sections" do
  sh "ruby section.rb"
  sh "cat #{outfiles('section')}|sort -nr -k2 >sections"
end

######################################################################
desc "top 10 files"
file "top-files" do
  sh "ruby top-file.rb"
  sh "cat #{outfiles('top-file')} >top-files"
end
data/examples/Readme ADDED
@@ -0,0 +1,12 @@
1
+ This contains:
2
+ logs apache log files
3
+ import-logs imports log files to hdfs (uses hadoop fs)
4
+ import.rb does the actual parsing
5
+
6
+ It also contains some mrtoolkit programs that make use of the
7
+ imported log files:
8
+ ip.rb summarizes the usage by IP address
9
+ hour.rb counts hits per hour over multiple days
10
+ section.rb counts hits by section (top level directory)
11
+ top-file.rb shows hits for top 10 files
12
+
data/examples/hour.rb ADDED
@@ -0,0 +1,57 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that tallies log records by hour of day.
#
# Counts are accumulated in memory while records are processed and are
# returned once, from process_end, as 24 (hour, count) records.
class MainMap < MapBase
  def declare
    # declare log fields
    field :ip
    field :client_id
    field :user_id
    field :dt_tm
    field :request
    field :status
    field :result_size
    field :referer
    field :ua

    emit :hour
    emit :count
  end

  # Start all 24 hour buckets at zero. Returns nil (no record).
  def process_begin(input, output)
    @hours = Array.new(24, 0)
    nil
  end

  # Count one record under the hour found in its dt_tm field.
  #
  # dt_tm is split on ':' and the second piece is taken as the hour
  # (presumably the "dd/Mon/yyyy:HH:MM:SS zone" layout - confirm against
  # the importer). Records without dt_tm, or without a second piece, are
  # ignored. Always returns nil; totals are produced by process_end.
  def process(input, output)
    dt_tm = input.dt_tm
    return nil if dt_tm.nil?
    hour_text = dt_tm.split(':')[1]
    return nil if hour_text.nil?

    # String#to_i turns non-numeric text into 0, which would silently
    # inflate the hour-0 bucket, so only all-digit text is converted.
    hour = nil
    hour = hour_text.to_i if hour_text =~ /\A\s*\d+\s*\z/

    if hour && hour < 24
      @hours[hour] += 1
    else
      STDERR.puts "bad hour: #{hour_text}"
    end
    nil
  end

  # Build one output record per hour bucket (0..23) and return them all
  # as an Array. Records come from new_output, which is not defined in
  # this file (presumably inherited from MapBase).
  def process_end(input, output)
    records = []
    @hours.each_with_index do |count, hr|
      record = new_output
      record.hour = hr
      record.count = count
      records << record
    end
    records
  end
end
48
+
49
# Job wiring for the per-hour report: MainMap as mapper, UniqueSumReduce as
# reducer, input directory "logs", output directory "hour".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(UniqueSumReduce)
    indir("logs")
    outdir("hour")
  end
end
57
+
@@ -0,0 +1,14 @@
1
#! /bin/bash

# Creates log files in HDFS.
# All files in sample-data/raw-logs are processed and
# stored with the proper delimiters for use with streaming.

# Use a private scratch file rather than the fixed /tmp/import, so that
# concurrent runs (or an existing file of that name) cannot clobber each
# other. It is removed when the script exits.
tmp=$(mktemp "${TMPDIR:-/tmp}/import.XXXXXX") || exit 1
trap 'rm -f "$tmp"' EXIT

hadoop fs -rmr logs
hadoop fs -mkdir logs
for i in ../sample-data/raw-logs/*
do
    # Expansions are quoted so file names containing spaces survive.
    ruby import.rb <"$i" >"$tmp"
    f=$(basename "$i")
    hadoop fs -put "$tmp" "logs/$f"
done
@@ -0,0 +1,22 @@
1
# Import combined log format
#
# Parse and remove delimiters.
# Add unambiguous field separators.
class ImportLogFile

  # Nine captures, in order: three whitespace-delimited tokens, the
  # [bracketed] field, one "quoted" field, two more tokens, and two further
  # "quoted" fields. The match is not anchored to the start of the line.
  LOG_PATTERN = /(\S*)\s*(\S*)\s*(\S*)\s*\[([^\]]*)\]\s*"([^"]*)"\s*(\S*)\s*(\S*)\s*"([^"]*)"\s*"([^"]*)"/

  # Split one log line into its nine fields.
  #
  # line:: a single log line (a trailing newline is fine).
  #
  # Returns an Array of nine Strings with the brackets and quotes removed,
  # or nil when the line does not match LOG_PATTERN.
  def parse(line)
    md = LOG_PATTERN.match(matchable(line))
    md && md.captures
  end

  # Parse every line of fp and write the fields of each matching line,
  # tab-separated, as one line on out. Non-matching lines are skipped.
  #
  # fp::  any object responding to each_line (e.g. STDIN or a File).
  # out:: destination responding to puts; defaults to $stdout, which keeps
  #       the original one-argument call working unchanged.
  def parse_all(fp, out = $stdout)
    fp.each_line do |line|
      fields = parse(line)
      out.puts(fields.join("\t")) if fields
    end
  end

  private

  # Matching a string whose bytes are invalid for its encoding can raise,
  # which would abort the whole import on one bad line. Such lines are
  # matched as raw bytes instead, so their content passes through unchanged.
  # Valid lines, and Rubies without String encodings, are returned untouched.
  def matchable(line)
    return line unless line.respond_to?(:valid_encoding?)
    return line if line.valid_encoding?
    line.dup.force_encoding(Encoding::BINARY)
  end
end
21
+
22
+ ImportLogFile.new.parse_all(STDIN)
@@ -0,0 +1,33 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that emits, for each log record, its ip together with its status
# (under the output name "result").
class MainMap < MapBase
  def declare
    # input columns, in order
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field(name) }

    # output columns, in order
    [:ip, :result].each { |name| emit(name) }
  end

  # Fill the supplied output record from the input record and return it.
  def process(input, output)
    record = output
    record.ip = input.ip
    record.result = input.status
    record
  end
end
25
+
26
# Job wiring for the ip/result report: MainMap as mapper,
# UniqueIndexedCountReduce as reducer, input directory "logs", output
# directory "ip-result".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(UniqueIndexedCountReduce)
    indir("logs")
    outdir("ip-result")
  end
end
@@ -0,0 +1,33 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that emits, for each log record, its ip together with its
# result_size (under the output name "size").
class MainMap < MapBase
  def declare
    # input columns, in order
    %w{ip client_id user_id dt_tm request status result_size referer ua}.each do |name|
      field name.to_sym
    end

    # output columns, in order
    emit :ip
    emit :size
  end

  # Fill the supplied output record from the input record and return it.
  def process(input, output)
    output.tap do |record|
      record.ip = input.ip
      record.size = input.result_size
    end
  end
end
25
+
26
# Job wiring for the ip/size report: MainMap as mapper, UniqueSumReduce as
# reducer, input directory "logs", output directory "ip-size".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(UniqueSumReduce)
    indir("logs")
    outdir("ip-size")
  end
end
data/examples/ip-ua.rb ADDED
@@ -0,0 +1,36 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that emits one record per log line: a combined "ip|ua" key, the
# ip, and the first whitespace-delimited token of the user agent.
class MainMap < MapBase
  def declare
    # declare log fields
    field :ip
    field :client_id
    field :user_id
    field :dt_tm
    field :request
    field :status
    field :result_size
    field :referer
    field :ua

    emit :ip_ua
    emit :ip
    emit :ua
  end

  # Fill and return the output record for one input record.
  #
  # Returns nil when the record has no ua field at all, instead of raising
  # NoMethodError on nil.split. (Presumably a nil return skips the record,
  # as the other example mappers rely on - confirm against MapBase. A nil
  # ua may arise when the last tab-separated field of a line is empty -
  # TODO confirm how input lines are split.)
  def process(input, output)
    raw_ua = input.ua
    return nil if raw_ua.nil?

    # Keep only the text before the first whitespace character.
    ua = raw_ua.split(/\s/)[0]
    output.ip_ua = "#{input.ip}|#{ua}"
    output.ip = input.ip
    output.ua = ua
    output
  end
end
28
+
29
# Job wiring for the ip/user-agent report: MainMap as mapper,
# UniqueCountReduce (with argument 2) as reducer, input directory "logs",
# output directory "ip-ua".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(UniqueCountReduce, 2)
    indir("logs")
    outdir("ip-ua")
  end
end
data/examples/ip.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'mrtoolkit'
2
+
3
# Job wiring for the per-ip report: the stock CopyMap as mapper,
# UniqueCountReduce as reducer, input directory "logs", output
# directory "ip".
class MainJob < JobBase
  def job
    mapper(CopyMap)
    reducer(UniqueCountReduce)
    indir("logs")
    outdir("ip")
  end
end
@@ -0,0 +1,37 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that emits (section, 1) for each request containing a
# "/word/" path segment; the first such segment is the section.
class MainMap < MapBase
  def declare
    # input columns, in order
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field(name) }

    # output columns, in order
    emit(:section)
    emit(:count)
  end

  # Returns the filled output record, or nil when the request has no
  # "/word/" segment.
  def process(input, output)
    return nil unless input.request =~ /\/(\w+)\//

    output.section = $1
    output.count = 1
    output
  end
end
28
+
29
# Job wiring for the per-section report: MainMap as mapper, UniqueSumReduce
# as reducer, input directory "logs", output directory "section".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(UniqueSumReduce)
    indir("logs")
    outdir("section")
  end
end
37
+
@@ -0,0 +1,36 @@
1
+ require 'mrtoolkit'
2
+
3
# Mapper that emits (path, 1) for each GET request, where path is the
# first non-whitespace token following "GET".
class MainMap < MapBase
  def declare
    # input columns, in order
    %w{ip client_id user_id dt_tm request status result_size referer ua}.each do |name|
      field name.to_sym
    end

    # output columns, in order
    emit :path
    emit :count
  end

  # Returns the filled output record, or nil when the request does not
  # match the GET pattern.
  def process(input, output)
    matched = input.request =~ /GET\s+(\S+)\s/
    return nil unless matched

    output.path = Regexp.last_match(1)
    output.count = 1
    output
  end
end
28
+
29
# Job wiring for the top-files report: MainMap as mapper, MaxUniqueSumReduce
# (with argument 10) as reducer, input directory "logs", output
# directory "top-file".
class MainJob < JobBase
  def job
    mapper(MainMap)
    reducer(MaxUniqueSumReduce, 10)
    indir("logs")
    outdir("top-file")
  end
end