elasticrawl 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,5 +4,35 @@ module Elasticrawl
4
4
  class CrawlSegment < ActiveRecord::Base
5
5
  belongs_to :crawl
6
6
  has_many :job_steps
7
+
8
+ # Description shows name and number of files in the segment.
9
+ def segment_desc
10
+ "Segment: #{segment_name} Files: #{file_count}"
11
+ end
12
+
13
+ # Creates a crawl segment based on its S3 path if it does not exist.
14
+ def self.create_segment(crawl, segment_name, file_count)
15
+ s3_uri = build_s3_uri(crawl.crawl_name, segment_name)
16
+
17
+ segment = CrawlSegment.where(:crawl_id => crawl.id,
18
+ :segment_name => segment_name,
19
+ :segment_s3_uri => s3_uri,
20
+ :file_count => file_count).first_or_create
21
+ end
22
+
23
+ private
24
+ # Generates the S3 location where this segment is stored.
25
+ def self.build_s3_uri(crawl_name, segment_name)
26
+ s3_path = ['',
27
+ Elasticrawl::COMMON_CRAWL_PATH,
28
+ crawl_name,
29
+ Elasticrawl::SEGMENTS_PATH,
30
+ segment_name,
31
+ '']
32
+
33
+ URI::Generic.build(:scheme => 's3',
34
+ :host => Elasticrawl::COMMON_CRAWL_BUCKET,
35
+ :path => s3_path.join('/'))
36
+ end
7
37
  end
8
38
  end
@@ -7,19 +7,26 @@ module Elasticrawl
7
7
  # Elastic MapReduce job flow and cluster.
8
8
  def confirm_message
9
9
  cluster = Cluster.new
10
- message = []
11
10
 
12
- message[0] = 'Job configuration'
13
- message[1] = self.job_desc
14
- message[2] = ''
15
- message[3] = cluster.cluster_desc
11
+ case self.type
12
+ when 'Elasticrawl::ParseJob'
13
+ message = segment_list
14
+ else
15
+ message = []
16
+ end
17
+
18
+ message.push('Job configuration')
19
+ message.push(self.job_desc)
20
+ message.push('')
21
+ message.push(cluster.cluster_desc)
22
+
16
23
  message.join("\n")
17
24
  end
18
25
 
19
26
  # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
20
27
  # launched successfully.
21
28
  def result_message
22
- "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
29
+ "\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
23
30
  end
24
31
 
25
32
  # Displays the history of the current job. Called by the status command.
@@ -13,7 +13,7 @@ module Elasticrawl
13
13
  # data files to process before the job exits.
14
14
  def job_flow_step(job_config)
15
15
  jar = job_config['jar']
16
- max_files = self.job.max_files
16
+ max_files = self.job.max_files
17
17
 
18
18
  step_args = []
19
19
  step_args[0] = job_config['class']
@@ -35,8 +35,10 @@ module Elasticrawl
35
35
  def set_step_name
36
36
  case self.job.type
37
37
  when 'Elasticrawl::ParseJob'
38
- segment =self.crawl_segment.segment_name if self.crawl_segment.present?
39
- "Segment: #{segment}"
38
+ if self.crawl_segment.present?
39
+ max_files = self.job.max_files || 'all'
40
+ "#{self.crawl_segment.segment_desc} Parsing: #{max_files}"
41
+ end
40
42
  when 'Elasticrawl::CombineJob'
41
43
  paths = self.input_paths.split(',')
42
44
  "Combining #{paths.count} jobs"
@@ -41,6 +41,20 @@ module Elasticrawl
41
41
  build_s3_uri(s3_path)
42
42
  end
43
43
 
44
+ # Return list of segment descriptions.
45
+ def segment_list
46
+ segments = ['Segments']
47
+
48
+ job_steps.each do |job_step|
49
+ if job_step.crawl_segment.present?
50
+ segment = job_step.crawl_segment
51
+ segments.push(segment.segment_desc)
52
+ end
53
+ end
54
+
55
+ segments.push('')
56
+ end
57
+
44
58
  private
45
59
  # Creates a job step for the crawl segment.
46
60
  def create_job_step(segment)
@@ -1,3 +1,3 @@
1
1
  module Elasticrawl
2
- VERSION = "1.0.0"
2
+ VERSION = "1.1.0"
3
3
  end
@@ -0,0 +1,6 @@
1
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz
2
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
3
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz
4
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
5
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
6
+ common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
data/spec/spec_helper.rb CHANGED
@@ -1,31 +1,25 @@
1
1
  require 'elasticrawl'
2
2
  require 'rspec'
3
- require 'mocha'
4
3
  require 'database_cleaner'
5
4
  require 'shoulda-matchers'
6
5
 
7
6
  RSpec.configure do |config|
8
- config.before(:suite) do
9
- # Return S3 paths that are used to create a crawl object with 3 crawl segments.
10
- segment_paths = []
11
- segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
12
- segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
13
- segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
14
- Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)
7
+ # Run each test in a transaction and rollback data on completion.
8
+ DatabaseCleaner.strategy = :transaction
9
+
10
+ config.before(:each) do
11
+ # Stub S3 call to get WARC file paths
12
+ warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
13
+ allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths)
15
14
 
16
15
  # Load config from spec/fixtures/ rather than ~/.elasticrawl/
17
16
  config_dir = File.join(File.dirname(__FILE__), 'fixtures')
18
- Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)
17
+ allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir)
19
18
 
20
19
  # Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
21
20
  config = Elasticrawl::Config.new
22
21
  config.load_database
23
- end
24
22
 
25
- # Run each test in a transaction and rollback data on completion.
26
- DatabaseCleaner.strategy = :transaction
27
-
28
- config.before(:each) do
29
23
  DatabaseCleaner.start
30
24
  end
31
25
 
@@ -2,7 +2,7 @@ require 'spec_helper'
2
2
 
3
3
  describe Elasticrawl::Cluster do
4
4
  describe '#create_job_flow' do
5
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
5
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
6
6
  let(:job) { Elasticrawl::ParseJob.new }
7
7
  let(:cluster) { Elasticrawl::Cluster.new }
8
8
  subject { cluster.create_job_flow(job) }
@@ -16,7 +16,7 @@ describe Elasticrawl::Cluster do
16
16
  end
17
17
 
18
18
  it 'should have a job flow name' do
19
- expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
19
+ expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}"
20
20
  end
21
21
 
22
22
  it 'should have a log uri' do
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Elasticrawl::CombineJob do
4
4
  describe '#set_input_jobs' do
5
5
  let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
6
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
6
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
7
7
  let(:segment_list_1) { crawl.crawl_segments[0..1] }
8
8
  let(:segment_list_2) { [crawl.crawl_segments[2]]}
9
9
 
@@ -55,7 +55,7 @@ describe Elasticrawl::CombineJob do
55
55
  end
56
56
 
57
57
  describe '#run' do
58
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
58
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
59
59
  let(:parse_job_1) { Elasticrawl::ParseJob.new }
60
60
  let(:parse_job_2) { Elasticrawl::ParseJob.new }
61
61
  let(:combine_job) { Elasticrawl::CombineJob.new }
@@ -71,7 +71,7 @@ describe Elasticrawl::CombineJob do
71
71
  end
72
72
 
73
73
  it 'should set a job flow id' do
74
- Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
74
+ allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
75
75
  combine_job.run
76
76
 
77
77
  expect(combine_job.job_flow_id).to eq job_flow_id
@@ -79,7 +79,7 @@ describe Elasticrawl::CombineJob do
79
79
  end
80
80
 
81
81
  describe '#log_uri' do
82
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
82
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
83
83
  let(:parse_job) { Elasticrawl::ParseJob.new }
84
84
  let(:job) { Elasticrawl::CombineJob.new }
85
85
 
@@ -6,22 +6,31 @@ describe Elasticrawl::CrawlSegment do
6
6
  it { should have_db_column(:segment_name).of_type(:string) }
7
7
  it { should have_db_column(:segment_s3_uri).of_type(:string) }
8
8
  it { should have_db_column(:parse_time).of_type(:datetime) }
9
+ it { should have_db_column(:file_count).of_type(:integer) }
9
10
 
10
- describe '#initialize' do
11
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
12
- subject { crawl.crawl_segments[0]}
13
-
14
- before do
15
- crawl.create_segments
16
- end
17
-
11
+ describe '.create_segment' do
12
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
13
+ let(:segment_name) { '1416400372202.67' }
14
+ let(:file_count) { 3 }
15
+ let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
16
+ subject { Elasticrawl::CrawlSegment.create_segment(crawl,
17
+ segment_name,
18
+ file_count) }
18
19
  it 'should have a segment name' do
19
- expect(subject.segment_name).to eq '1368696381249'
20
+ expect(subject.segment_name).to eq segment_name
20
21
  end
21
22
 
22
23
  it 'should have an s3 uri' do
23
24
  expect(subject.segment_s3_uri).to eq \
24
- 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
25
+ "s3://aws-publicdatasets/common-crawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
26
+ end
27
+
28
+ it 'should have a file count' do
29
+ expect(subject.file_count).to eq file_count
30
+ end
31
+
32
+ it 'should have a segment description' do
33
+ expect(subject.segment_desc).to eq segment_desc
25
34
  end
26
35
  end
27
36
  end
@@ -5,7 +5,7 @@ describe Elasticrawl::Crawl do
5
5
  it { should have_db_column(:crawl_name).of_type(:string) }
6
6
 
7
7
  describe '#has_segments?' do
8
- let(:crawl_name) { 'CC-MAIN-2013-20' }
8
+ let(:crawl_name) { 'CC-MAIN-2014-49' }
9
9
  subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
10
10
 
11
11
  it 'should have segments' do
@@ -14,7 +14,7 @@ describe Elasticrawl::Crawl do
14
14
  end
15
15
 
16
16
  describe '#create_segments' do
17
- let(:crawl_name) { 'CC-MAIN-2013-20' }
17
+ let(:crawl_name) { 'CC-MAIN-2014-49' }
18
18
  subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
19
19
 
20
20
  before do
@@ -30,17 +30,21 @@ describe Elasticrawl::Crawl do
30
30
  end
31
31
 
32
32
  it 'should create segment names' do
33
- expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
33
+ expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67'
34
34
  end
35
35
 
36
36
  it 'should create segment s3 uris' do
37
37
  expect(subject.crawl_segments[0].segment_s3_uri).to eq \
38
- 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
38
+ 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
39
+ end
40
+
41
+ it 'should set file counts' do
42
+ expect(subject.crawl_segments[0].file_count).to eq 3
39
43
  end
40
44
  end
41
45
 
42
46
  describe '#next_segments' do
43
- let(:crawl_name) { 'CC-MAIN-2013-20' }
47
+ let(:crawl_name) { 'CC-MAIN-2014-49' }
44
48
  subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
45
49
 
46
50
  before do
@@ -52,7 +56,7 @@ describe Elasticrawl::Crawl do
52
56
 
53
57
  expect(crawl_segments.count).to eq 3
54
58
  expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
55
- expect(crawl_segments[0].segment_name).to eq '1368696381249'
59
+ expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
56
60
  end
57
61
 
58
62
  it 'should return first # segments' do
@@ -60,12 +64,12 @@ describe Elasticrawl::Crawl do
60
64
 
61
65
  expect(crawl_segments.count).to eq 2
62
66
  expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
63
- expect(crawl_segments[0].segment_name).to eq '1368696381249'
67
+ expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
64
68
  end
65
69
  end
66
70
 
67
71
  describe '#select_segments' do
68
- let(:crawl_name) { 'CC-MAIN-2013-20' }
72
+ let(:crawl_name) { 'CC-MAIN-2014-49' }
69
73
  subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
70
74
 
71
75
  before do
@@ -80,7 +84,7 @@ describe Elasticrawl::Crawl do
80
84
  end
81
85
 
82
86
  it 'should select only segments in list' do
83
- segments_list = ['1368696381249', '1368696382185']
87
+ segments_list = ['1416400372202.67', '1416400372490.23']
84
88
  crawl_segments = subject.select_segments(segments_list)
85
89
 
86
90
  expect(crawl_segments.count).to eq 2
@@ -88,7 +92,7 @@ describe Elasticrawl::Crawl do
88
92
  end
89
93
 
90
94
  describe '#reset' do
91
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
95
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
92
96
  let(:job) { Elasticrawl::ParseJob.new }
93
97
  let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
94
98
 
@@ -96,7 +100,7 @@ describe Elasticrawl::Crawl do
96
100
  crawl.create_segments
97
101
  job.set_segments(crawl.crawl_segments[0..1])
98
102
 
99
- Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
103
+ allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
100
104
  job.run
101
105
 
102
106
  crawl.reset
@@ -109,9 +113,9 @@ describe Elasticrawl::Crawl do
109
113
  end
110
114
 
111
115
  describe '.status' do
112
- let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
113
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
114
- let(:max_files) { 5 }
116
+ let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
117
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
118
+ let(:max_files) { 3 }
115
119
  let(:job) { Elasticrawl::ParseJob.new }
116
120
  let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
117
121
 
@@ -119,17 +123,18 @@ describe Elasticrawl::Crawl do
119
123
  crawl.create_segments
120
124
  job.set_segments(crawl.crawl_segments[0..1], max_files)
121
125
 
122
- Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
126
+ allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
123
127
  job.run
124
128
  end
125
129
 
126
130
  it 'should display status of crawl segments' do
127
131
  expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
128
- 'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
132
+ 'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3'
129
133
  end
130
134
 
131
135
  it 'should display parse job desc' do
132
136
  crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
137
+
133
138
  expect(crawl_status.include?(job.job_name)).to eq true
134
139
  expect(crawl_status.include?(job.job_desc)).to eq true
135
140
  end
@@ -8,11 +8,11 @@ describe Elasticrawl::JobStep do
8
8
 
9
9
  describe '#job_flow_step' do
10
10
  let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
11
- :max_files => 5) }
12
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
11
+ :max_files => 3) }
12
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
13
13
  let(:crawl_segment) { crawl.crawl_segments[0] }
14
14
  let(:input_paths) {
15
- 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
15
+ 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz'
16
16
  }
17
17
  let(:output_path) {
18
18
  's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
@@ -54,7 +54,7 @@ describe Elasticrawl::JobStep do
54
54
  end
55
55
 
56
56
  it 'should have a max files arg' do
57
- expect(subject.arguments[3]).to eq '5'
57
+ expect(subject.arguments[3]).to eq '3'
58
58
  end
59
59
  end
60
60
  end
@@ -3,9 +3,9 @@ require 'spec_helper'
3
3
  describe Elasticrawl::ParseJob do
4
4
  describe '#set_segments' do
5
5
  let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
6
- let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
7
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
8
- let(:max_files) { 5 }
6
+ let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
7
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
8
+ let(:max_files) { 3 }
9
9
  let(:parse_job) { Elasticrawl::ParseJob.new }
10
10
 
11
11
  before do
@@ -42,34 +42,40 @@ describe Elasticrawl::ParseJob do
42
42
  end
43
43
 
44
44
  describe '#confirm_message' do
45
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
45
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
46
46
  let(:job) { Elasticrawl::ParseJob.new }
47
- let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: 5 files per segment' }
47
+ let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
48
+ let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
49
+
48
50
  let(:cluster_desc) {
49
- cluster_desc = <<-HERE
51
+ cluster_desc = <<-CLUSTER_DESC
50
52
  Cluster configuration
51
53
  Master: 1 m1.medium (Spot: 0.12)
52
54
  Core: 2 m1.medium (Spot: 0.12)
53
55
  Task: --
54
- HERE
56
+ CLUSTER_DESC
55
57
  }
56
58
 
57
59
  before do
58
60
  crawl.create_segments
59
- job.set_segments(crawl.crawl_segments[0..2], 5)
61
+ job.set_segments(crawl.crawl_segments[0..2], 3)
60
62
  end
61
63
 
62
64
  it 'should display message including job desc' do
63
65
  expect(job.confirm_message.include?(job_desc)).to eq true
64
66
  end
65
67
 
68
+ it 'should display message including segment desc' do
69
+ expect(job.confirm_message.include?(segment_desc)).to eq true
70
+ end
71
+
66
72
  it 'should display message including cluster desc' do
67
73
  expect(job.confirm_message.include?(cluster_desc)).to eq true
68
74
  end
69
75
  end
70
76
 
71
77
  describe '#run' do
72
- let(:crawl_name) { 'CC-MAIN-2013-20' }
78
+ let(:crawl_name) { 'CC-MAIN-2014-49' }
73
79
  let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
74
80
  let(:job) { Elasticrawl::ParseJob.new }
75
81
  let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
@@ -78,7 +84,7 @@ Task: --
78
84
  crawl.create_segments
79
85
  job.set_segments(crawl.crawl_segments[0..1], 5)
80
86
 
81
- Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
87
+ allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
82
88
  job.run
83
89
  end
84
90
 
@@ -94,7 +100,7 @@ Task: --
94
100
  end
95
101
 
96
102
  describe '#log_uri' do
97
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
103
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
98
104
  let(:job) { Elasticrawl::ParseJob.new }
99
105
 
100
106
  before do
@@ -108,16 +114,16 @@ Task: --
108
114
  end
109
115
 
110
116
  describe '#history' do
111
- let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
117
+ let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
112
118
  let(:job) { Elasticrawl::ParseJob.new }
113
- let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 3 Parsing: all files' }
119
+ let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
114
120
  let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
115
121
 
116
122
  before do
117
123
  crawl.create_segments
118
124
  job.set_segments(crawl.crawl_segments)
119
125
 
120
- Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
126
+ allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
121
127
  job.run
122
128
  end
123
129