mrtoolkit 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/Makefile +6 -0
- data/README.rdoc +19 -0
- data/Rakefile +57 -0
- data/VERSION.yml +4 -0
- data/examples/Rakefile +80 -0
- data/examples/Readme +12 -0
- data/examples/hour.rb +57 -0
- data/examples/import-logs +14 -0
- data/examples/import.rb +22 -0
- data/examples/ip-result.rb +33 -0
- data/examples/ip-size.rb +33 -0
- data/examples/ip-ua.rb +36 -0
- data/examples/ip.rb +10 -0
- data/examples/section.rb +37 -0
- data/examples/top-file.rb +36 -0
- data/lib/mrtoolkit.rb +908 -0
- data/lib/regression.rb +33 -0
- data/lib/stream_runner.rb +100 -0
- data/mrtoolkit.gemspec +79 -0
- data/standalone/hadoop +104 -0
- data/test/Rakefile +21 -0
- data/test/test-in/test1-in +2 -0
- data/test/test-in/test2-in +4 -0
- data/test/test-in/test3-in +5 -0
- data/test/test-in/test4-in +6 -0
- data/test/test-in/test5-in +12 -0
- data/test/test-in/test6-in +3 -0
- data/test/test-in/test7-in +20 -0
- data/test/test-in/test8-in +12 -0
- data/test/test-in/test9-in +6 -0
- data/test/utest.rb +471 -0
- metadata +104 -0
data/.document
ADDED
data/.gitignore
ADDED
data/Makefile
ADDED
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
= mrtoolkit
|
2
|
+
|
3
|
+
MRToolkit provides a framework for building simple Map/Reduce jobs in just a few lines of code. You provide only the map and reduce logic, the framework does the rest. Or use one of the provided map or reduce tools, and write even less.
|
4
|
+
|
5
|
+
Map and reduce jobs are written in Ruby. MRToolkit was inspired by Google's Sawzall.
|
6
|
+
|
7
|
+
== Acknowledgements
|
8
|
+
|
9
|
+
MRToolkit was inspired by Google's Sawzall. We wanted to make it even easier by making use of an existing language, rather than inventing a new one. Ruby was a perfect fit.
|
10
|
+
|
11
|
+
The initial development of this software was supported by the New York Times, with the support and encouragement of Vadim Jelezniakov and Ranjit Prabhu.
|
12
|
+
|
13
|
+
== This github repo
|
14
|
+
|
15
|
+
This github repo is a mirror + patches to the mrtoolkit that is hosted on code.google.com: http://code.google.com/p/mrtoolkit/wiki/Introduction
|
16
|
+
|
17
|
+
This repo adds, among other things, the ability to install mrtoolkit as a gem:
|
18
|
+
|
19
|
+
gem install jashmenn-mrtoolkit --source=http://gems.github.com
|
data/Rakefile
ADDED
@@ -0,0 +1,57 @@
require 'rubygems'
require 'rake'

# Gem packaging via jeweler, when it is installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |spec|
    spec.name = "mrtoolkit"
    spec.summary = %Q{Simplify the creation of Hadoop Map/Reduce jobs}
    spec.email = "nate@natemurray.com"
    spec.homepage = "http://github.com/jashmenn/mrtoolkit"
    spec.authors = ["cchayden", "vadimj", "jashmenn"]

    # spec is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# Unit tests under test/.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Coverage via rcov, when it is installed; otherwise a stub task
# that explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.pattern = 'test/**/*_test.rb'
    t.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# RDoc generation, titled with the version from VERSION.yml when present.
require 'yaml'
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version =
    if File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mrtoolkit #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION.yml
ADDED
data/examples/Rakefile
ADDED
@@ -0,0 +1,80 @@
require 'pp'

# Make ../lib visible to child ruby processes (via $RUBYLIB) and to
# this process's own load path.
lib_dirs = (ENV['RUBYLIB'] || '').split(':')
ENV['RUBYLIB'] = lib_dirs.concat(['../lib']).uniq.join(':')
ENV['RUBYLIB'].split(':').each { |dir| $:.concat([dir]) unless $:.include?(dir) }

require 'mrtoolkit'
require 'time'

# Glob matching the part files of a job's HDFS output directory.
def outfiles(base)
  "out/#{base}/part-*"
end

######################################################################
desc "clean up"
task "clean" do
  targets = %w{hour hours ip ips section sections top-file top-files ip-size ip-sizes ip-result ip-results ip-uas ip-ua out }
  targets.each { |target| system "rm -rf #{target}" }
end

######################################################################
desc "import data to hdfs"
task "import" do
  system "./import-logs"
end

######################################################################
desc "traffic by IP address"
file "ips" do
  system "ruby ip.rb"
  system "cat #{outfiles('ip')}|sort -nr -k2 >ips"
end

######################################################################
desc "returned result size by IP address"
file "ip-sizes" do
  system "ruby ip-size.rb"
  system "cat #{outfiles('ip-size')}|sort -nr -k2 >ip-sizes"
end

######################################################################
desc "show all combinations of ip and user agent"
file "ip-uas" do
  system "ruby ip-ua.rb"
  system "cat #{outfiles('ip-ua')}|cut -f 2-4|sort -k2 -k3 >ip-uas"
end

######################################################################
desc "show all combinations of ip and result"
file "ip-results" do
  system "ruby ip-result.rb"
  system "cat #{outfiles('ip-result')}|sort -k1 -k2 >ip-results"
end

######################################################################
desc "traffic by hour"
file "hours" do
  system "ruby hour.rb"
  system "cat #{outfiles('hour')}|sort -n >hours"
end

######################################################################
desc "traffic by section"
file "sections" do
  system "ruby section.rb"
  system "cat #{outfiles('section')}|sort -nr -k2 >sections"
end

######################################################################
desc "top 10 files"
file "top-files" do
  system "ruby top-file.rb"
  system "cat #{outfiles('top-file')} >top-files"
end
data/examples/Readme
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
This contains:
|
2
|
+
logs apache log files
|
3
|
+
import-logs imports log files to hdfs (uses hadoop fs)
|
4
|
+
import.rb does the actual parsing
|
5
|
+
|
6
|
+
It also contains some mrtoolkit programs that make use of the
|
7
|
+
imported log files:
|
8
|
+
ip.rb summarizes the usage by IP address
|
9
|
+
hour.rb counts hits per hour over multiple days
|
10
|
+
section.rb counts hits by section (top level directory)
|
11
|
+
top-file.rb shows hits for top 10 files
|
12
|
+
|
data/examples/hour.rb
ADDED
@@ -0,0 +1,57 @@
require 'mrtoolkit'

# Tallies hits per hour of the day (0-23) across all input records,
# emitting one (hour, count) row per hour when the map phase ends.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :hour
    emit :count
  end

  # Zero the per-hour counters before any record is seen.
  def process_begin(input, output)
    @hours = Array.new(24, 0)
    nil
  end

  # Accumulate into @hours; emits nothing per record.  The hour is the
  # second ':'-separated component of the log timestamp.
  def process(input, output)
    stamp = input.dt_tm
    return nil if stamp.nil?
    parts = stamp.split(':')
    return nil if parts.nil? || parts.size < 2 || parts[1].nil?
    hour = parts[1].to_i
    if hour >= 0 && hour < 24
      @hours[hour] += 1
    else
      STDERR.puts "bad hour: #{hour}"
    end
    nil
  end

  # Emit one output record per hour with its accumulated count.
  def process_end(input, output)
    rows = []
    @hours.each_with_index do |count, hour|
      row = new_output
      row.hour = hour
      row.count = count
      rows << row
    end
    rows
  end
end

# The reducer sums counts for identical hours across map tasks.
class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "hour"
  end
end
@@ -0,0 +1,14 @@
#! /bin/bash

# Creates log files in HDFS.
# All files in sample-data/raw-logs are processed and
# stored with the proper delimiters for use with streaming.

hadoop fs -rmr logs
hadoop fs -mkdir logs
for src in ../sample-data/raw-logs/*
do
  # Fix: quote expansions so paths containing whitespace do not
  # word-split or glob.
  ruby import.rb <"$src" >/tmp/import
  name=`basename "$src"`
  hadoop fs -put /tmp/import "logs/$name"
done
data/examples/import.rb
ADDED
@@ -0,0 +1,22 @@
# Import combined log format
#
# Parses each line of an Apache combined-format log, removing the
# delimiters and re-emitting the nine fields tab-separated (the
# unambiguous field separator used by the streaming jobs).
class ImportLogFile

  # Returns the nine captured fields of one log line as an array,
  # or nil when the line does not match the combined log format.
  def parse(line)
    patt = /(\S*)\s*(\S*)\s*(\S*)\s*\[([^\]]*)\]\s*"([^"]*)"\s*(\S*)\s*(\S*)\s*"([^"]*)"\s*"([^"]*)"/
    md = patt.match(line)
    md && md[1, 9]
  end

  # Reads every line from fp, printing the tab-joined fields of each
  # line that parses; unparseable lines are silently skipped.
  def parse_all(fp)
    fp.each_line do |line|
      fields = parse(line)
      puts fields.join("\t") if fields
    end
  end
end

ImportLogFile.new.parse_all(STDIN)
@@ -0,0 +1,33 @@
require 'mrtoolkit'

# Emits one (ip, result) pair per log record; the reducer counts the
# occurrences of each status code per IP address.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip
    emit :result
  end

  # Pass the IP and HTTP status straight through.
  def process(input, output)
    output.ip = input.ip
    output.result = input.status
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueIndexedCountReduce
    indir "logs"
    outdir "ip-result"
  end
end
data/examples/ip-size.rb
ADDED
@@ -0,0 +1,33 @@
require 'mrtoolkit'

# Emits one (ip, size) pair per log record; the reducer sums the
# returned result sizes per IP address.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip
    emit :size
  end

  # Pass the IP and result size straight through.
  def process(input, output)
    output.ip = input.ip
    output.size = input.result_size
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "ip-size"
  end
end
data/examples/ip-ua.rb
ADDED
@@ -0,0 +1,36 @@
require 'mrtoolkit'

# Emits the combination of IP and user-agent (first token only) for
# each log record, plus the two components separately; the reducer
# counts unique values over the two key columns.
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :ip_ua
    emit :ip
    emit :ua
  end

  # Key on "ip|agent" where agent is the first whitespace-delimited
  # token of the user-agent string.
  def process(input, output)
    agent = input.ua.split(/\s/)[0]
    output.ip_ua = "#{input.ip}|#{agent}"
    output.ip = input.ip
    output.ua = agent
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueCountReduce, 2
    indir "logs"
    outdir "ip-ua"
  end
end
data/examples/ip.rb
ADDED
data/examples/section.rb
ADDED
@@ -0,0 +1,37 @@
require 'mrtoolkit'

# Counts hits per site section, where a section is the first path
# component of the request (the top-level directory).
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :section
    emit :count
  end

  # Emit (section, 1) when the request contains a /section/ component;
  # otherwise emit nothing for this record.
  def process(input, output)
    return nil unless input.request =~ /\/(\w+)\//
    output.section = $1
    output.count = 1
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer UniqueSumReduce
    indir "logs"
    outdir "section"
  end
end
@@ -0,0 +1,36 @@
require 'mrtoolkit'

# Counts hits per requested path and keeps only the top entries
# (MaxUniqueSumReduce with a limit of 10).
class MainMap < MapBase
  def declare
    # Apache combined log format fields, in order.
    [:ip, :client_id, :user_id, :dt_tm, :request,
     :status, :result_size, :referer, :ua].each { |name| field name }

    emit :path
    emit :count
  end

  # Emit (path, 1) for GET requests; other records emit nothing.
  def process(input, output)
    return nil unless input.request =~ /GET\s+(\S+)\s/
    output.path = $1
    output.count = 1
    output
  end
end

class MainJob < JobBase
  def job
    mapper MainMap
    reducer MaxUniqueSumReduce, 10
    indir "logs"
    outdir "top-file"
  end
end