RubyGems - elasticrawl - Versions diffs - 1.0.0 - Mend

elasticrawl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

data/.gitignore +21 -0
data/.travis.yml +5 -0
data/Cheffile +14 -0
data/Cheffile.lock +37 -0
data/Gemfile +3 -0
data/LICENSE +22 -0
data/README.md +232 -0
data/Rakefile +11 -0
data/Vagrantfile +58 -0
data/bin/elasticrawl +141 -0
data/db/migrate/201401051536_create_crawls.rb +10 -0
data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
data/db/migrate/201401101723_create_jobs.rb +14 -0
data/db/migrate/201401141606_create_job_steps.rb +11 -0
data/elasticrawl.gemspec +35 -0
data/lib/elasticrawl/cluster.rb +128 -0
data/lib/elasticrawl/combine_job.rb +86 -0
data/lib/elasticrawl/config.rb +242 -0
data/lib/elasticrawl/crawl.rb +114 -0
data/lib/elasticrawl/crawl_segment.rb +8 -0
data/lib/elasticrawl/error.rb +22 -0
data/lib/elasticrawl/job.rb +68 -0
data/lib/elasticrawl/job_step.rb +46 -0
data/lib/elasticrawl/parse_job.rb +84 -0
data/lib/elasticrawl/version.rb +3 -0
data/lib/elasticrawl.rb +21 -0
data/spec/fixtures/aws.yml +4 -0
data/spec/fixtures/cluster.yml +44 -0
data/spec/fixtures/jobs.yml +31 -0
data/spec/spec_helper.rb +35 -0
data/spec/unit/cluster_spec.rb +54 -0
data/spec/unit/combine_job_spec.rb +97 -0
data/spec/unit/config_spec.rb +17 -0
data/spec/unit/crawl_segment_spec.rb +27 -0
data/spec/unit/crawl_spec.rb +137 -0
data/spec/unit/job_spec.rb +10 -0
data/spec/unit/job_step_spec.rb +60 -0
data/spec/unit/parse_job_spec.rb +130 -0
data/templates/aws.yml +7 -0
data/templates/cluster.yml +44 -0
data/templates/jobs.yml +31 -0
metadata +315 -0

data/spec/unit/parse_job_spec.rb ADDED Viewed

@@ -0,0 +1,130 @@
+require 'spec_helper'
+describe Elasticrawl::ParseJob do
+  describe '#set_segments' do
+    let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
+    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:max_files) { 5 }
+    let(:parse_job) { Elasticrawl::ParseJob.new }
+    before do
+      crawl.create_segments
+      parse_job.set_segments(crawl.crawl_segments[0..1], max_files)
+    end
+    it 'should have a job name based on current time' do
+      expect(parse_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
+    end
+    it 'should have a job desc' do
+      expect(parse_job.job_desc).to eq job_desc
+    end
+    it 'should create 2 job steps' do
+      expect(parse_job.job_steps.count).to eq 2
+    end
+    it 'should set steps input path to segment uri' do
+      input_path = parse_job.job_steps[0].input_paths
+      segment_uri = crawl.crawl_segments[0].segment_s3_uri
+      expect(input_path.starts_with?(segment_uri)).to eq true
+    end
+    it 'should set output path' do
+      output_path = parse_job.job_steps[0].output_path
+      segment_name = crawl.crawl_segments[0].segment_name
+      expect(output_path.include?(parse_job.job_name)).to eq true
+      expect(output_path.include?(segment_name)).to eq true
+    end
+  end
+  describe '#confirm_message' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
+    let(:cluster_desc) {
+      cluster_desc = <<-HERE
+Cluster configuration
+Master: 1 m1.medium  (Spot: 0.12)
+Core:   2 m1.medium  (Spot: 0.12)
+Task:   --
+      HERE
+    }
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments[0..2], 5)
+    end
+    it 'should display message including job desc' do
+      expect(job.confirm_message.include?(job_desc)).to eq true
+    end
+    it 'should display message including cluster desc' do
+      expect(job.confirm_message.include?(cluster_desc)).to eq true
+    end
+  end
+  describe '#run' do
+    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments[0..1], 5)
+      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      job.run
+    end
+    it 'should set a job flow id' do
+      expect(job.job_flow_id).to eq job_flow_id
+    end
+    it 'should set parse time for parsed segments' do
+      expect(crawl.crawl_segments[0].parse_time.present?).to eq true
+      expect(crawl.crawl_segments[1].parse_time.present?).to eq true
+      expect(crawl.crawl_segments[2].parse_time.present?).to eq false
+    end
+  end
+  describe '#log_uri' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:job) { Elasticrawl::ParseJob.new }
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments)
+    end
+    it 'should set a log uri including the job name' do
+      expect(job.log_uri).to eq "s3://elasticrawl/logs/1-parse/#{job.job_name}/"
+    end
+  end
+  describe '#history' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
+    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments)
+      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      job.run
+    end
+    it 'should return the job name, history and launch time' do
+      expect(job.history.include?(job.job_name)).to eq true
+      expect(job.history.include?(job.job_desc)).to eq true
+      expect(job.history.include?(job.created_at.strftime('%Y-%m-%d %H:%M:%S'))).to eq true
+    end
+  end
+end

data/templates/aws.yml ADDED Viewed

@@ -0,0 +1,7 @@
+# Configures the AWS access credentials used when calling the AWS
+# Elastic MapReduce and S3 APIs.  This file is populated by the init command.
+#
+# Instead of configuring this file you can set the environment variables
+# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
+access_key_id: 'ACCESS_KEY_ID'
+secret_access_key: 'SECRET_ACCESS_KEY'

data/templates/cluster.yml ADDED Viewed

@@ -0,0 +1,44 @@
+# Configures the Elastic MapReduce cluster that is launched to run parse and
+# combine jobs. The list of EC2 instance types can be found at
+# http://aws.amazon.com/ec2/instance-types/#instance-details
+# Using spot instances is recommended to reduce costs. However if the spot
+# price rises above your bid price the cluster may be terminated. Elasticrawl
+# tries to reduce the effect of this by parsing each Common Crawl segment
+# in a separate job flow step.
+# The master node manages the cluster.
+master_instance_group:
+  instance_type: m1.medium
+  use_spot_instances: true
+  bid_price: 0.120
+# Core nodes run map and reduce tasks and store data using HDFS.
+core_instance_group:
+  instance_type: m1.medium
+  instance_count: 2
+  use_spot_instances: true
+  bid_price: 0.120
+# Task nodes are optional and only run map and reduce tasks.
+task_instance_group:
+  instance_type: m1.small
+  instance_count: 0
+  use_spot_instances: true
+  bid_price: 0.080
+# Array of bootstrap scripts that will be applied when the cluster nodes are
+# initialized. The example installs the Ganglia distributed monitoring system.
+bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
+# Specifying an EC2 key pair allows SSH access to the master node. This also
+# allows accessing the Hadoop Web UI over an SSH tunnel.
+ec2_key_name: # 'key-pair-name'
+# Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
+# recommended since the Common Crawl corpus is stored there. Otherwise
+# inter-region data transfer charges will apply.
+placement: 'us-east-1a'
+# The AMI version to use when launching instances.
+emr_ami_version: 'latest'

data/templates/jobs.yml ADDED Viewed

@@ -0,0 +1,31 @@
+# Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
+# corpus.
+# An S3 bucket is created by the init command and is used to store data and logs.
+s3_bucket_name: 'BUCKET_NAME'
+# A parse step is created per Common Crawl segment.  A combine step takes the
+# results from multiple segments to create a single set of output files.
+# The parse input filter is used to specify the Common Crawl file type.
+# WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
+# WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
+# WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
+# The EMR config is an XML file that sets Hadoop properties.  If a config file
+# is specified then a bootstrap action is run on each node to apply it.
+steps:
+  # Parse step for the Example Elasticrawl JAR.  This does a word count
+  # against the text extractions of the corpus.
+  parse:
+    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+    class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
+    input_filter: 'wet/*.warc.wet.gz'
+    emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
+  # Combine step for the Example Elasticrawl JAR.
+  combine:
+    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+    class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
+    input_filter: 'part-*'
+    emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'

metadata ADDED Viewed

@@ -0,0 +1,315 @@
+--- !ruby/object:Gem::Specification
+name: elasticrawl
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+  prerelease:
+platform: ruby
+authors:
+- Ross Fairbanks
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-02-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: activerecord
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 4.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 4.0.2
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 4.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 4.0.2
+- !ruby/object:Gem::Dependency
+  name: aws-sdk
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.0'
+- !ruby/object:Gem::Dependency
+  name: elasticity
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2.7'
+- !ruby/object:Gem::Dependency
+  name: highline
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.6.20
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.6.20
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.3.8
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.3.8
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.18.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 0.18.1
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.14.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.14.1
+- !ruby/object:Gem::Dependency
+  name: mocha
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.0
+- !ruby/object:Gem::Dependency
+  name: database_cleaner
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.2.0
+- !ruby/object:Gem::Dependency
+  name: shoulda-matchers
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.4.0
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 2.4.0
+description: Elasticrawl is a tool for launching AWS Elastic MapReduce jobs that process
+  Common Crawl data.
+email:
+- ross@rossfairbanks.com
+executables:
+- elasticrawl
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .travis.yml
+- Cheffile
+- Cheffile.lock
+- Gemfile
+- LICENSE
+- README.md
+- Rakefile
+- Vagrantfile
+- bin/elasticrawl
+- db/migrate/201401051536_create_crawls.rb
+- db/migrate/201401051855_create_crawl_segments.rb
+- db/migrate/201401101723_create_jobs.rb
+- db/migrate/201401141606_create_job_steps.rb
+- elasticrawl.gemspec
+- lib/elasticrawl.rb
+- lib/elasticrawl/cluster.rb
+- lib/elasticrawl/combine_job.rb
+- lib/elasticrawl/config.rb
+- lib/elasticrawl/crawl.rb
+- lib/elasticrawl/crawl_segment.rb
+- lib/elasticrawl/error.rb
+- lib/elasticrawl/job.rb
+- lib/elasticrawl/job_step.rb
+- lib/elasticrawl/parse_job.rb
+- lib/elasticrawl/version.rb
+- spec/fixtures/aws.yml
+- spec/fixtures/cluster.yml
+- spec/fixtures/jobs.yml
+- spec/spec_helper.rb
+- spec/unit/cluster_spec.rb
+- spec/unit/combine_job_spec.rb
+- spec/unit/config_spec.rb
+- spec/unit/crawl_segment_spec.rb
+- spec/unit/crawl_spec.rb
+- spec/unit/job_spec.rb
+- spec/unit/job_step_spec.rb
+- spec/unit/parse_job_spec.rb
+- templates/aws.yml
+- templates/cluster.yml
+- templates/jobs.yml
+homepage: https://github.com/rossf7/elasticrawl
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -3344138865650739079
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: -3344138865650739079
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Launch AWS Elastic MapReduce jobs that process Common Crawl data.
+test_files:
+- spec/fixtures/aws.yml
+- spec/fixtures/cluster.yml
+- spec/fixtures/jobs.yml
+- spec/spec_helper.rb
+- spec/unit/cluster_spec.rb
+- spec/unit/combine_job_spec.rb
+- spec/unit/config_spec.rb
+- spec/unit/crawl_segment_spec.rb
+- spec/unit/crawl_spec.rb
+- spec/unit/job_spec.rb
+- spec/unit/job_step_spec.rb
+- spec/unit/parse_job_spec.rb