elasticrawl 1.0.0
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
data/.gitignore
ADDED
@@ -0,0 +1,21 @@
*.gem
*.rbc
.bundle
.config
.yardoc
Gemfile.lock
InstalledFiles
_yardoc
coverage
doc/
lib/bundler/man
pkg
rdoc
spec/reports
test/tmp
test/version_tmp
tmp

.vagrant
cookbooks
spec/fixtures/elasticrawl.sqlite3
data/.travis.yml
ADDED
data/Cheffile
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
#^syntax detection

site "http://community.opscode.com/api/v1"

cookbook "apt",
  :version => "1.7.0"
cookbook "build-essential"
cookbook "git"
cookbook "rbenv",
  :git => "https://github.com/fnichol/chef-rbenv.git",
  :ref => "v0.7.2"
cookbook "ruby_build"
cookbook "vim"
data/Cheffile.lock
ADDED
@@ -0,0 +1,37 @@
SITE
  remote: http://community.opscode.com/api/v1
  specs:
    apt (2.2.1)
    build-essential (1.4.2)
    chef_handler (1.1.4)
    dmg (2.0.4)
    git (2.7.0)
      build-essential (>= 0.0.0)
      dmg (>= 0.0.0)
      runit (>= 1.0.0)
      windows (>= 0.0.0)
      yum (>= 0.0.0)
    ruby_build (0.8.0)
    runit (1.3.0)
      build-essential (>= 0.0.0)
      yum (>= 0.0.0)
    vim (1.0.2)
    windows (1.11.0)
      chef_handler (>= 0.0.0)
    yum (2.3.4)

GIT
  remote: https://github.com/fnichol/chef-rbenv.git
  ref: v0.7.2
  sha: f2b53292e810dd2b43f6121f9958f5f29979dcb1
  specs:
    rbenv (0.7.2)

DEPENDENCIES
  apt (>= 0)
  build-essential (>= 0)
  git (>= 0)
  rbenv (>= 0)
  ruby_build (>= 0)
  vim (>= 0)
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2014 Ross Fairbanks

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,232 @@
# Elasticrawl

Launch AWS Elastic MapReduce jobs that process Common Crawl data.
Elasticrawl works with the latest Common Crawl data structure and file formats
([2013 data onwards](http://commoncrawl.org/new-crawl-data-available/)).
Ships with a default configuration that launches the
[elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples) jobs.
This is an implementation of the standard Hadoop Word Count example.

## Overview

Common Crawl have released 2 web crawls of 2013 data. Further crawls will be released
during 2014. Each crawl is split into multiple segments that contain 3 file types.

* WARC - WARC files with the HTTP request and response for each fetch
* WAT - WARC encoded files containing JSON metadata
* WET - WARC encoded text extractions of the HTTP responses

| Crawl Name      | Date     | Segments | Pages         | Size (uncompressed) |
| --------------- |:--------:|:--------:|:-------------:|:-------------------:|
| CC-MAIN-2013-48 | Nov 2013 | 517      | ~ 2.3 billion | 148 TB              |
| CC-MAIN-2013-20 | May 2013 | 316      | ~ 2.0 billion | 102 TB              |

Elasticrawl is a command line tool that automates launching Elastic MapReduce
jobs against this data.

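For orientation, the data for each crawl lives in Common Crawl's public S3
bucket, with one prefix per segment and a sub-prefix per file type. The
listing below is only an illustrative sketch; the bucket and prefix names are
assumptions based on the 2013 layout (the segment ID is taken from the
examples later in this README), so check the Common Crawl site for the
authoritative paths.

```bash
# Illustrative sketch of the assumed 2013-onwards layout (requires the AWS CLI).
aws s3 ls s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163036037/
#   warc/  full HTTP requests and responses
#   wat/   JSON metadata
#   wet/   extracted text (what the example parse job reads)
```
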
[![Code Climate](https://codeclimate.com/github/rossf7/elasticrawl.png)](https://codeclimate.com/github/rossf7/elasticrawl)
[![Build Status](https://travis-ci.org/rossf7/elasticrawl.png?branch=master)](https://travis-ci.org/rossf7/elasticrawl) Ruby 1.9.3, 2.0.0, 2.1.0

## Installation

### Dependencies

Elasticrawl is developed in Ruby and requires Ruby 1.9.3 or later.
Installing using [rbenv](https://github.com/sstephenson/rbenv#installation)
and the ruby-build plugin is recommended.

### Install elasticrawl

```bash
~$ gem install elasticrawl --no-rdoc --no-ri
```

If you're using rbenv, you need to do a rehash to add the elasticrawl executable
to your path.

```bash
~$ rbenv rehash
```

## Quick Start

In this example you'll launch 2 EMR jobs against a small portion of the Nov
2013 crawl. Each job will take around 20 minutes to run. Most of this is setup
time while your EC2 spot instances are provisioned and your Hadoop cluster is
configured.

You'll need an [AWS account](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html)
to use elasticrawl. The total cost of the 2 EMR jobs will be under $1 USD.

### Setup

You'll need to choose an S3 bucket name and enter your AWS access key and
secret key. The S3 bucket will be used for storing data and logs. S3 bucket
names must be unique; using hyphens rather than underscores is recommended.

```bash
~$ elasticrawl init your-s3-bucket

Enter AWS Access Key ID: ************
Enter AWS Secret Access Key: ************

...

Bucket s3://elasticrawl-test created
Config dir /Users/ross/.elasticrawl created
Config complete
```

### Parse Job

For this example you'll parse the first 2 WET files in the first 2 segments
of the Nov 2013 crawl.

```bash
~$ elasticrawl parse CC-MAIN-2013-48 --max-segments 2 --max-files 2

Job configuration
Crawl: CC-MAIN-2013-48 Segments: 2 Parsing: 2 files per segment

Cluster configuration
Master: 1 m1.medium (Spot: 0.12)
Core: 2 m1.medium (Spot: 0.12)
Task: --
Launch job? (y/n)

y
Job Name: 1391458746774 Job Flow ID: j-2X9JVDC1UKEQ1
```

You can monitor the progress of your job in the Elastic MapReduce section
of the AWS web console.

### Combine Job

The combine job will aggregate the word count results from both segments into
a single set of files.

```bash
~$ elasticrawl combine --input-jobs 1391458746774

Job configuration
Combining: 2 segments

Cluster configuration
Master: 1 m1.medium (Spot: 0.12)
Core: 2 m1.medium (Spot: 0.12)
Task: --
Launch job? (y/n)

y
Job Name: 1391459918730 Job Flow ID: j-GTJ2M7D1TXO6
```

Once the combine job is complete you can download your results from the
S3 section of the AWS web console. Your data will be stored in

[your S3 bucket]/data/2-combine/[job name]

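If you prefer the command line to the web console, the results can also be
fetched with any S3 client. This is a sketch that assumes the AWS CLI is
installed; the bucket and job names are placeholders.

```bash
# Download the combined output locally (placeholder bucket and job name).
aws s3 sync s3://your-s3-bucket/data/2-combine/1391459918730 ./combine-results
```
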
### Cleaning Up

You'll be charged by AWS for any data stored in your S3 bucket. The destroy
command deletes your S3 bucket and the ~/.elasticrawl/ directory.

```bash
~$ elasticrawl destroy

WARNING:
Bucket s3://elasticrawl-test and its data will be deleted
Config dir /home/vagrant/.elasticrawl will be deleted
Delete? (y/n)
y

Bucket s3://elasticrawl-test deleted
Config dir /home/vagrant/.elasticrawl deleted
Config deleted
```

## Configuring Elasticrawl

The elasticrawl init command creates the ~/.elasticrawl/ directory (shown
below), which contains

* [aws.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/aws.yml) -
stores your AWS access credentials. Or you can set the environment
variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY

* [cluster.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/cluster.yml) -
configures the EC2 instances that are launched to form your EMR cluster

* [jobs.yml](https://github.com/rossf7/elasticrawl/blob/master/templates/jobs.yml) -
stores your S3 bucket name and the config for the parse and combine jobs

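A quick way to check the setup is to list the directory. This is a sketch of
the expected contents based on the files named above.

```bash
~$ ls ~/.elasticrawl
aws.yml  cluster.yml  jobs.yml
# Job state is also kept in a local sqlite3 database (see the sqlite3
# dependency in the gemspec); its exact filename is not documented in
# this README, so it is omitted here.
```
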
## Managing Segments

Each Common Crawl segment is parsed as a separate EMR job step. This avoids
overloading the job tracker and means that if a job fails, only data from the
current segment is lost. However, an EMR job flow can only contain 256 steps,
so to process an entire crawl multiple parse jobs must be combined.

```bash
~$ elasticrawl combine --input-jobs 1391430796774 1391458746774 1391498046704
```

You can use the status command to see details of crawls and jobs.

```bash
~$ elasticrawl status

Crawl Status
CC-MAIN-2013-48 Segments: to parse 517, parsed 2, total 519

Job History (last 10)
1391459918730 Launched: 2014-02-04 13:58:12 Combining: 2 segments
1391458746774 Launched: 2014-02-04 13:55:50 Crawl: CC-MAIN-2013-48 Segments: 2 Parsing: 2 files per segment
```

You can use the reset command to parse a crawl again.

```bash
~$ elasticrawl reset CC-MAIN-2013-48

Reset crawl? (y/n)
y
CC-MAIN-2013-48 Segments: to parse 519, parsed 0, total 519
```

To parse the same segments multiple times, pass in a list of segment names.

```bash
~$ elasticrawl parse CC-MAIN-2013-48 --segment-list 1386163036037 1386163035819 --max-files 2
```

## Running your own Jobs

1. Fork the [elasticrawl-examples](https://github.com/rossf7/elasticrawl-examples)
2. Make your changes
3. Compile your changes into a JAR using Maven
4. Upload your JAR to your own S3 bucket (see the sketch below)
5. Edit ~/.elasticrawl/jobs.yml with your JAR and class names

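A minimal sketch of steps 3 and 4, assuming a standard Maven project and an
installed AWS CLI; the JAR and bucket names are placeholders rather than
values taken from this gem.

```bash
# Build the job JAR (artifact name is a placeholder).
mvn clean package

# Upload it to your own bucket so EMR can fetch it.
aws s3 cp target/elasticrawl-examples-1.0.jar \
  s3://your-s3-bucket/jars/elasticrawl-examples-1.0.jar

# Then edit ~/.elasticrawl/jobs.yml to point at the uploaded JAR and your classes.
```
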
## TODO

* Add support for Streaming and Pig jobs

## Thanks

* Thanks to everyone at Common Crawl for making this awesome dataset available.
* Thanks to Robert Slifka for the [elasticity](https://github.com/rslifka/elasticity)
gem which provides a nice Ruby wrapper for the EMR REST API.

## Contributing

1. Fork it
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create new Pull Request

## License

This code is licensed under the MIT license.
data/Rakefile
ADDED
data/Vagrantfile
ADDED
@@ -0,0 +1,58 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :

Vagrant.configure("2") do |config|
  # All Vagrant configuration is done here. The most common configuration
  # options are documented and commented below. For a complete reference,
  # please see the online documentation at vagrantup.com.

  # Fix DNS issues with Ubuntu 12.04 by always using host's resolver
  config.vm.provider "virtualbox" do |vbox|
    vbox.customize ["modifyvm", :id, "--natdnshostresolver1", "on"]
  end

  # Elasticrawl launches Hadoop jobs for the CommonCrawl dataset using the AWS EMR service.
  config.vm.define :elasticrawl do |elasticrawl|
    elasticrawl.vm.box = "elasticrawl"

    # Ubuntu Server 12.04 LTS
    elasticrawl.vm.box_url = "http://files.vagrantup.com/precise64.box"

    # Network config
    elasticrawl.vm.network :public_network

    # Provision using Chef Solo
    elasticrawl.vm.provision "chef_solo" do |chef|
      chef.cookbooks_path = "cookbooks"
      chef.add_recipe "apt"
      chef.add_recipe "build-essential"
      chef.add_recipe "ruby_build"
      chef.add_recipe "rbenv::user"
      chef.add_recipe "git"
      chef.add_recipe "vim"

      chef.json = {
        "rbenv" => {
          "user_installs" => [
            {
              "user" => "vagrant",
              "rubies" => ["1.9.3-p484", "2.0.0-p353", "2.1.0"],
              "global" => "1.9.3-p484",
              "gems" => {
                "1.9.3-p484" => [
                  { "name" => "bundler" }
                ],
                "2.0.0-p353" => [
                  { "name" => "bundler" }
                ],
                "2.1.0" => [
                  { "name" => "bundler" }
                ]
              }
            }
          ]
        }
      }
    end
  end
end
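For development, the Vagrantfile above provisions with Chef Solo from a local
./cookbooks directory, which is git-ignored and populated from the Cheffile.
A typical workflow, assuming Vagrant and librarian-chef are installed, might
look like this:

```bash
# Vendor the cookbooks pinned in Cheffile/Cheffile.lock into ./cookbooks.
gem install librarian-chef
librarian-chef install

# Boot and provision the :elasticrawl machine defined above.
vagrant up elasticrawl
```
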
data/bin/elasticrawl
ADDED
@@ -0,0 +1,141 @@
#!/usr/bin/env ruby
require 'elasticrawl'

module Elasticrawl
  class Cli < Thor
    desc 'init S3_BUCKET_NAME', 'Creates S3 bucket and config directory'
    method_option :access_key_id, :type => :string, :desc => 'AWS Access Key ID'
    method_option :secret_access_key, :type => :string, :desc => 'AWS Secret Access Key'
    def init(s3_bucket_name)
      key = options[:access_key_id]
      secret = options[:secret_access_key]

      if key.nil? || secret.nil?
        config = Config.new

        # Prompt for credentials showing the current values.
        key = ask(config.access_key_prompt)
        secret = ask(config.secret_key_prompt)

        # Use current values if user has selected them.
        key = config.access_key_id if key.blank?
        secret = config.secret_access_key if secret.blank?
      end

      # Create new config object with updated credentials.
      config = Config.new(key, secret)

      if config.bucket_exists?(s3_bucket_name)
        puts('ERROR: S3 bucket already exists')
      else
        if config.dir_exists?
          puts("WARNING: Config dir #{config.config_dir} already exists")
          overwrite = agree('Overwrite? (y/n)', true)
        end

        puts(config.create(s3_bucket_name)) if !config.dir_exists? || overwrite == true
      end
    end

    desc 'parse CRAWL_NAME', 'Launches parse job against Common Crawl corpus'
    method_option :max_segments, :type => :numeric, :desc => 'number of crawl segments to parse'
    method_option :max_files, :type => :numeric, :desc => 'number of files to parse per segment'
    method_option :segment_list, :type => :array, :desc => 'list of segment names to parse'
    def parse(crawl_name)
      load_database

      crawl = find_crawl(crawl_name)
      if crawl.has_segments?
        segment_list = options[:segment_list]

        if segment_list.present?
          segments = crawl.select_segments(segment_list)
        else
          segments = crawl.next_segments(options[:max_segments])
        end

        if segments.count == 0
          puts('ERROR: No segments matched for parsing')
        else
          job = ParseJob.new
          job.set_segments(segments, options[:max_files])
          puts(job.confirm_message)

          launch = agree('Launch job? (y/n)', true)
          puts(job.run) if launch == true
        end
      else
        puts('ERROR: Crawl does not exist')
      end
    end

    desc 'combine', 'Launches combine job against parse job results'
    method_option :input_jobs, :type => :array, :required => true,
      :desc => 'list of input jobs to combine'
    def combine
      load_database

      job = CombineJob.new
      job.set_input_jobs(options[:input_jobs])
      puts(job.confirm_message)

      launch = agree('Launch job? (y/n)', true)
      puts(job.run) if launch == true
    end

    desc 'status', 'Shows crawl status and lists jobs'
    method_option :show_all, :type => :boolean, :desc => 'list all jobs'
    def status
      load_database
      puts(Crawl.status(options[:show_all]))
    end

    desc 'reset CRAWL_NAME', 'Resets a crawl so its segments are parsed again'
    def reset(crawl_name)
      load_database

      crawl = find_crawl(crawl_name)
      if crawl.has_segments?
        reset = agree('Reset crawl? (y/n)', true)
        puts(crawl.reset) if reset == true
      else
        puts('ERROR: Crawl does not exist')
      end
    end

    desc 'destroy', 'Deletes S3 bucket and config directory'
    def destroy
      config = Config.new

      if config.dir_exists?
        puts(config.delete_warning)
        delete = agree('Delete? (y/n)', true)
        puts(config.delete) if delete == true
      else
        puts('No config dir. Nothing to do')
      end
    end

    private
    # Find a crawl record in the database.
    def find_crawl(crawl_name)
      Crawl.where(:crawl_name => crawl_name).first_or_initialize
    end

    # Load sqlite database.
    def load_database
      config = Config.new
      config.load_database
    end
  end
end

begin
  Elasticrawl::Cli.start(ARGV)
# Show errors parsing command line arguments.
rescue Thor::Error => e
  puts(e.message)
# Show elasticrawl errors.
rescue Elasticrawl::Error => e
  puts("ERROR: #{e.message}")
end
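As the Thor method_options above show, init can also take the AWS credentials
as command-line flags instead of interactive prompts. A short usage sketch
(the key values are placeholders):

```bash
# Non-interactive setup; option names map to the Thor options defined above.
elasticrawl init your-s3-bucket \
  --access-key-id AKIAXXXXXXXXXXXXXXXX \
  --secret-access-key xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```
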
data/db/migrate/201401051855_create_crawl_segments.rb
ADDED
@@ -0,0 +1,14 @@
class CreateCrawlSegments < ActiveRecord::Migration
  def change
    create_table :crawl_segments do |t|
      t.references :crawl
      t.string :segment_name
      t.string :segment_s3_uri
      t.datetime :parse_time
      t.timestamps
    end

    add_index(:crawl_segments, :segment_name, :unique => true)
    add_index(:crawl_segments, :segment_s3_uri, :unique => true)
  end
end
data/db/migrate/201401101723_create_jobs.rb
ADDED
@@ -0,0 +1,14 @@
class CreateJobs < ActiveRecord::Migration
  def change
    create_table :jobs do |t|
      t.string :type
      t.string :job_name
      t.string :job_desc
      t.integer :max_files
      t.string :job_flow_id
      t.timestamps
    end

    add_index(:jobs, :job_name, :unique => true)
  end
end
data/elasticrawl.gemspec
ADDED
@@ -0,0 +1,35 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'elasticrawl/version'

Gem::Specification.new do |spec|
  spec.name          = 'elasticrawl'
  spec.version       = Elasticrawl::VERSION
  spec.authors       = ['Ross Fairbanks']
  spec.email         = ['ross@rossfairbanks.com']
  spec.summary       = %q{Launch AWS Elastic MapReduce jobs that process Common Crawl data.}
  spec.description   = %q{Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process Common Crawl data.}
  spec.homepage      = 'https://github.com/rossf7/elasticrawl'
  spec.license       = 'MIT'

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_dependency 'activerecord', '~> 4.0.2'
  spec.add_dependency 'activesupport', '~> 4.0.2'
  spec.add_dependency 'aws-sdk', '~> 1.0'
  spec.add_dependency 'elasticity', '~> 2.7'
  spec.add_dependency 'highline', '~> 1.6.20'
  spec.add_dependency 'sqlite3', '~> 1.3.8'
  spec.add_dependency 'thor', '~> 0.18.1'

  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'bundler', '~> 1.3'
  spec.add_development_dependency 'rspec', '~> 2.14.1'
  spec.add_development_dependency 'mocha', '~> 1.0.0'
  spec.add_development_dependency 'database_cleaner', '~> 1.2.0'
  spec.add_development_dependency 'shoulda-matchers', '~> 2.4.0'
end