RubyGems - elasticrawl - Versions diffs - 1.0.0 → 1.1.0 - Mend

elasticrawl 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +7 -0
data/.travis.yml +1 -1
data/README.md +77 -108
data/Vagrantfile +5 -5
data/db/migrate/201401051536_create_crawls.rb +1 -1
data/db/migrate/201401051855_create_crawl_segments.rb +1 -1
data/db/migrate/201401101723_create_jobs.rb +1 -1
data/db/migrate/201401141606_create_job_steps.rb +1 -1
data/db/migrate/201412311554_add_file_count_to_crawl_segments.rb +5 -0
data/elasticrawl.gemspec +10 -11
data/lib/elasticrawl.rb +7 -0
data/lib/elasticrawl/cluster.rb +1 -1
data/lib/elasticrawl/crawl.rb +49 -31
data/lib/elasticrawl/crawl_segment.rb +30 -0
data/lib/elasticrawl/job.rb +13 -6
data/lib/elasticrawl/job_step.rb +5 -3
data/lib/elasticrawl/parse_job.rb +14 -0
data/lib/elasticrawl/version.rb +1 -1
data/spec/fixtures/warc.paths +6 -0
data/spec/spec_helper.rb +8 -14
data/spec/unit/cluster_spec.rb +2 -2
data/spec/unit/combine_job_spec.rb +4 -4
data/spec/unit/crawl_segment_spec.rb +19 -10
data/spec/unit/crawl_spec.rb +21 -16
data/spec/unit/job_step_spec.rb +4 -4
data/spec/unit/parse_job_spec.rb +20 -14
metadata +56 -101

data/lib/elasticrawl/crawl_segment.rb CHANGED Viewed

@@ -4,5 +4,35 @@ module Elasticrawl
   class CrawlSegment < ActiveRecord::Base
     belongs_to :crawl
     has_many :job_steps
+    # Description shows name and number of files in the segment.
+    def segment_desc
+      "Segment: #{segment_name} Files: #{file_count}"
+    end
+    # Creates a crawl segment based on its S3 path if it does not exist.
+    def self.create_segment(crawl, segment_name, file_count)
+      s3_uri = build_s3_uri(crawl.crawl_name, segment_name)
+      segment = CrawlSegment.where(:crawl_id => crawl.id,
+                                  :segment_name => segment_name,
+                                  :segment_s3_uri => s3_uri,
+                                  :file_count => file_count).first_or_create
+    end
+private
+    # Generates the S3 location where this segment is stored.
+    def self.build_s3_uri(crawl_name, segment_name)
+      s3_path = ['',
+                 Elasticrawl::COMMON_CRAWL_PATH,
+                 crawl_name,
+                 Elasticrawl::SEGMENTS_PATH,
+                 segment_name,
+                 '']
+      URI::Generic.build(:scheme => 's3',
+                         :host => Elasticrawl::COMMON_CRAWL_BUCKET,
+                         :path => s3_path.join('/'))
+    end
   end
 end

data/lib/elasticrawl/job.rb CHANGED Viewed

@@ -7,19 +7,26 @@ module Elasticrawl
     # Elastic MapReduce job flow and cluster.
     def confirm_message
       cluster = Cluster.new
-      message = []
-      message[0] = 'Job configuration'
-      message[1] = self.job_desc
-      message[2] = ''
-      message[3] = cluster.cluster_desc
+      case self.type
+      when 'Elasticrawl::ParseJob'
+        message = segment_list
+      else
+        message = []
+      end
+      message.push('Job configuration')
+      message.push(self.job_desc)
+      message.push('')
+      message.push(cluster.cluster_desc)
       message.join("\n")
     end
     # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
     # launched successfully.
     def result_message
-      "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
+      "\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
     end
     # Displays the history of the current job. Called by the status command.

data/lib/elasticrawl/job_step.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Elasticrawl
     # data files to process before the job exits.
     def job_flow_step(job_config)
       jar = job_config['jar']
-      max_files = self.job.max_files
+      max_files = self.job.max_files
       step_args = []
       step_args[0] = job_config['class']
@@ -35,8 +35,10 @@ module Elasticrawl
     def set_step_name
       case self.job.type
         when 'Elasticrawl::ParseJob'
-          segment =self.crawl_segment.segment_name if self.crawl_segment.present?
-          "Segment: #{segment}"
+          if self.crawl_segment.present?
+            max_files = self.job.max_files || 'all'
+            "#{self.crawl_segment.segment_desc} Parsing: #{max_files}"
+          end
         when 'Elasticrawl::CombineJob'
           paths = self.input_paths.split(',')
           "Combining #{paths.count} jobs"

data/lib/elasticrawl/parse_job.rb CHANGED Viewed

@@ -41,6 +41,20 @@ module Elasticrawl
       build_s3_uri(s3_path)
     end
+    # Return list of segment descriptions.
+    def segment_list
+      segments = ['Segments']
+      job_steps.each do |job_step|
+        if job_step.crawl_segment.present?
+          segment = job_step.crawl_segment
+          segments.push(segment.segment_desc)
+        end
+      end
+      segments.push('')
+    end
   private
     # Creates a job step for the crawl segment.
     def create_job_step(segment)

data/lib/elasticrawl/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Elasticrawl
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 end

data/spec/fixtures/warc.paths ADDED Viewed

@@ -0,0 +1,6 @@
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
+common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,31 +1,25 @@
 require 'elasticrawl'
 require 'rspec'
-require 'mocha'
 require 'database_cleaner'
 require 'shoulda-matchers'
 RSpec.configure do |config|
-  config.before(:suite) do
-    # Return S3 paths that are used to create a crawl object with 3 crawl segments.
-    segment_paths = []
-    segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
-    segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
-    segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
-    Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)
+  # Run each test in a transaction and rollback data on completion.
+  DatabaseCleaner.strategy = :transaction
+  config.before(:each) do
+    # Stub S3 call to get WARC file paths
+    warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
+    allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths)
     # Load config from spec/fixtures/ rather than ~/.elasticrawl/
     config_dir = File.join(File.dirname(__FILE__), 'fixtures')
-    Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)
+    allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir)
     # Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
     config = Elasticrawl::Config.new
     config.load_database
-  end
-  # Run each test in a transaction and rollback data on completion.
-  DatabaseCleaner.strategy = :transaction
-  config.before(:each) do
     DatabaseCleaner.start
   end

data/spec/unit/cluster_spec.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require 'spec_helper'
 describe Elasticrawl::Cluster do
   describe '#create_job_flow' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:job) { Elasticrawl::ParseJob.new }
     let(:cluster) { Elasticrawl::Cluster.new }
     subject { cluster.create_job_flow(job) }
@@ -16,7 +16,7 @@ describe Elasticrawl::Cluster do
     end
     it 'should have a job flow name' do
-      expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
+      expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}"
     end
     it 'should have a log uri' do

data/spec/unit/combine_job_spec.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'spec_helper'
 describe Elasticrawl::CombineJob do
   describe '#set_input_jobs' do
     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:segment_list_1) { crawl.crawl_segments[0..1] }
     let(:segment_list_2) { [crawl.crawl_segments[2]]}
@@ -55,7 +55,7 @@ describe Elasticrawl::CombineJob do
   end
   describe '#run' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:parse_job_1) { Elasticrawl::ParseJob.new }
     let(:parse_job_2) { Elasticrawl::ParseJob.new }
     let(:combine_job) { Elasticrawl::CombineJob.new }
@@ -71,7 +71,7 @@ describe Elasticrawl::CombineJob do
     end
     it 'should set a job flow id' do
-      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
       combine_job.run
       expect(combine_job.job_flow_id).to eq job_flow_id
@@ -79,7 +79,7 @@ describe Elasticrawl::CombineJob do
   end
   describe '#log_uri' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:parse_job) { Elasticrawl::ParseJob.new }
     let(:job) { Elasticrawl::CombineJob.new }

data/spec/unit/crawl_segment_spec.rb CHANGED Viewed

@@ -6,22 +6,31 @@ describe Elasticrawl::CrawlSegment do
   it { should have_db_column(:segment_name).of_type(:string) }
   it { should have_db_column(:segment_s3_uri).of_type(:string) }
   it { should have_db_column(:parse_time).of_type(:datetime) }
+  it { should have_db_column(:file_count).of_type(:integer) }
-  describe '#initialize' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
-    subject { crawl.crawl_segments[0]}
-    before do
-      crawl.create_segments
-    end
+  describe '.create_segment' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
+    let(:segment_name) { '1416400372202.67' }
+    let(:file_count) { 3 }
+    let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
+    subject { Elasticrawl::CrawlSegment.create_segment(crawl,
+                                                       segment_name,
+                                                       file_count) }
     it 'should have a segment name' do
-      expect(subject.segment_name).to eq '1368696381249'
+      expect(subject.segment_name).to eq segment_name
     end
     it 'should have an s3 uri' do
       expect(subject.segment_s3_uri).to eq \
-        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+        "s3://aws-publicdatasets/common-crawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
+    end
+    it 'should have a file count' do
+      expect(subject.file_count).to eq file_count
+    end
+    it 'should have a segment description' do
+      expect(subject.segment_desc).to eq segment_desc
     end
   end
 end

data/spec/unit/crawl_spec.rb CHANGED Viewed

@@ -5,7 +5,7 @@ describe Elasticrawl::Crawl do
   it { should have_db_column(:crawl_name).of_type(:string) }
   describe '#has_segments?' do
-    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl_name) { 'CC-MAIN-2014-49' }
     subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
     it 'should have segments' do
@@ -14,7 +14,7 @@ describe Elasticrawl::Crawl do
   end
   describe '#create_segments' do
-    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl_name) { 'CC-MAIN-2014-49' }
     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
     before do
@@ -30,17 +30,21 @@ describe Elasticrawl::Crawl do
     end
     it 'should create segment names' do
-      expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
+      expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67'
     end
     it 'should create segment s3 uris' do
       expect(subject.crawl_segments[0].segment_s3_uri).to eq \
-        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
+    end
+    it 'should set file counts' do
+      expect(subject.crawl_segments[0].file_count).to eq 3
     end
   end
   describe '#next_segments' do
-    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl_name) { 'CC-MAIN-2014-49' }
     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
     before do
@@ -52,7 +56,7 @@ describe Elasticrawl::Crawl do
       expect(crawl_segments.count).to eq 3
       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
-      expect(crawl_segments[0].segment_name).to eq '1368696381249'
+      expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
     end
     it 'should return first # segments' do
@@ -60,12 +64,12 @@ describe Elasticrawl::Crawl do
       expect(crawl_segments.count).to eq 2
       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
-      expect(crawl_segments[0].segment_name).to eq '1368696381249'
+      expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
     end
   end
   describe '#select_segments' do
-    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl_name) { 'CC-MAIN-2014-49' }
     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
     before do
@@ -80,7 +84,7 @@ describe Elasticrawl::Crawl do
     end
     it 'should select only segments in list' do
-      segments_list = ['1368696381249', '1368696382185']
+      segments_list = ['1416400372202.67', '1416400372490.23']
       crawl_segments = subject.select_segments(segments_list)
       expect(crawl_segments.count).to eq 2
@@ -88,7 +92,7 @@ describe Elasticrawl::Crawl do
   end
   describe '#reset' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:job) { Elasticrawl::ParseJob.new }
     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
@@ -96,7 +100,7 @@ describe Elasticrawl::Crawl do
       crawl.create_segments
       job.set_segments(crawl.crawl_segments[0..1])
-      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
       job.run
       crawl.reset
@@ -109,9 +113,9 @@ describe Elasticrawl::Crawl do
   end
   describe '.status' do
-    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
-    let(:max_files) { 5 }
+    let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
+    let(:max_files) { 3 }
     let(:job) { Elasticrawl::ParseJob.new }
     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
@@ -119,17 +123,18 @@ describe Elasticrawl::Crawl do
       crawl.create_segments
       job.set_segments(crawl.crawl_segments[0..1], max_files)
-      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
       job.run
     end
     it 'should display status of crawl segments' do
       expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
-        'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
+        'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3'
     end
     it 'should display parse job desc' do
       crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
       expect(crawl_status.include?(job.job_name)).to eq true
       expect(crawl_status.include?(job.job_desc)).to eq true
     end

data/spec/unit/job_step_spec.rb CHANGED Viewed

@@ -8,11 +8,11 @@ describe Elasticrawl::JobStep do
   describe '#job_flow_step' do
     let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
-                                              :max_files => 5) }
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+                                              :max_files => 3) }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:crawl_segment) { crawl.crawl_segments[0] }
     let(:input_paths) {
-      's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
+      's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz'
     }
     let(:output_path) {
       's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
@@ -54,7 +54,7 @@ describe Elasticrawl::JobStep do
     end
     it 'should have a max files arg' do
-      expect(subject.arguments[3]).to eq '5'
+      expect(subject.arguments[3]).to eq '3'
     end
   end
 end

data/spec/unit/parse_job_spec.rb CHANGED Viewed

@@ -3,9 +3,9 @@ require 'spec_helper'
 describe Elasticrawl::ParseJob do
   describe '#set_segments' do
     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
-    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
-    let(:max_files) { 5 }
+    let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
+    let(:max_files) { 3 }
     let(:parse_job) { Elasticrawl::ParseJob.new }
     before do
@@ -42,34 +42,40 @@ describe Elasticrawl::ParseJob do
   end
   describe '#confirm_message' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:job) { Elasticrawl::ParseJob.new }
-    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
+    let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
+    let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
     let(:cluster_desc) {
-      cluster_desc = <<-HERE
+      cluster_desc = <<-CLUSTER_DESC
 Cluster configuration
 Master: 1 m1.medium  (Spot: 0.12)
 Core:   2 m1.medium  (Spot: 0.12)
 Task:   --
-      HERE
+      CLUSTER_DESC
     }
     before do
       crawl.create_segments
-      job.set_segments(crawl.crawl_segments[0..2], 5)
+      job.set_segments(crawl.crawl_segments[0..2], 3)
     end
     it 'should display message including job desc' do
       expect(job.confirm_message.include?(job_desc)).to eq true
     end
+    it 'should display message including segment desc' do
+      expect(job.confirm_message.include?(segment_desc)).to eq true
+    end
     it 'should display message including cluster desc' do
       expect(job.confirm_message.include?(cluster_desc)).to eq true
     end
   end
   describe '#run' do
-    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    let(:crawl_name) { 'CC-MAIN-2014-49' }
     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
     let(:job) { Elasticrawl::ParseJob.new }
     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
@@ -78,7 +84,7 @@ Task:   --
       crawl.create_segments
       job.set_segments(crawl.crawl_segments[0..1], 5)
-      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
       job.run
     end
@@ -94,7 +100,7 @@ Task:   --
   end
   describe '#log_uri' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:job) { Elasticrawl::ParseJob.new }
     before do
@@ -108,16 +114,16 @@ Task:   --
   end
   describe '#history' do
-    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
     let(:job) { Elasticrawl::ParseJob.new }
-    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
+    let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
     before do
       crawl.create_segments
       job.set_segments(crawl.crawl_segments)
-      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
       job.run
     end