elasticrawl 1.0.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.travis.yml +1 -1
- data/README.md +77 -108
- data/Vagrantfile +5 -5
- data/db/migrate/201401051536_create_crawls.rb +1 -1
- data/db/migrate/201401051855_create_crawl_segments.rb +1 -1
- data/db/migrate/201401101723_create_jobs.rb +1 -1
- data/db/migrate/201401141606_create_job_steps.rb +1 -1
- data/db/migrate/201412311554_add_file_count_to_crawl_segments.rb +5 -0
- data/elasticrawl.gemspec +10 -11
- data/lib/elasticrawl.rb +7 -0
- data/lib/elasticrawl/cluster.rb +1 -1
- data/lib/elasticrawl/crawl.rb +49 -31
- data/lib/elasticrawl/crawl_segment.rb +30 -0
- data/lib/elasticrawl/job.rb +13 -6
- data/lib/elasticrawl/job_step.rb +5 -3
- data/lib/elasticrawl/parse_job.rb +14 -0
- data/lib/elasticrawl/version.rb +1 -1
- data/spec/fixtures/warc.paths +6 -0
- data/spec/spec_helper.rb +8 -14
- data/spec/unit/cluster_spec.rb +2 -2
- data/spec/unit/combine_job_spec.rb +4 -4
- data/spec/unit/crawl_segment_spec.rb +19 -10
- data/spec/unit/crawl_spec.rb +21 -16
- data/spec/unit/job_step_spec.rb +4 -4
- data/spec/unit/parse_job_spec.rb +20 -14
- metadata +56 -101
@@ -4,5 +4,35 @@ module Elasticrawl
|
|
4
4
|
class CrawlSegment < ActiveRecord::Base
|
5
5
|
belongs_to :crawl
|
6
6
|
has_many :job_steps
|
7
|
+
|
8
|
+
# Description shows name and number of files in the segment.
|
9
|
+
def segment_desc
|
10
|
+
"Segment: #{segment_name} Files: #{file_count}"
|
11
|
+
end
|
12
|
+
|
13
|
+
# Creates a crawl segment based on its S3 path if it does not exist.
|
14
|
+
def self.create_segment(crawl, segment_name, file_count)
|
15
|
+
s3_uri = build_s3_uri(crawl.crawl_name, segment_name)
|
16
|
+
|
17
|
+
segment = CrawlSegment.where(:crawl_id => crawl.id,
|
18
|
+
:segment_name => segment_name,
|
19
|
+
:segment_s3_uri => s3_uri,
|
20
|
+
:file_count => file_count).first_or_create
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
# Generates the S3 location where this segment is stored.
|
25
|
+
def self.build_s3_uri(crawl_name, segment_name)
|
26
|
+
s3_path = ['',
|
27
|
+
Elasticrawl::COMMON_CRAWL_PATH,
|
28
|
+
crawl_name,
|
29
|
+
Elasticrawl::SEGMENTS_PATH,
|
30
|
+
segment_name,
|
31
|
+
'']
|
32
|
+
|
33
|
+
URI::Generic.build(:scheme => 's3',
|
34
|
+
:host => Elasticrawl::COMMON_CRAWL_BUCKET,
|
35
|
+
:path => s3_path.join('/'))
|
36
|
+
end
|
7
37
|
end
|
8
38
|
end
|
data/lib/elasticrawl/job.rb
CHANGED
@@ -7,19 +7,26 @@ module Elasticrawl
|
|
7
7
|
# Elastic MapReduce job flow and cluster.
|
8
8
|
def confirm_message
|
9
9
|
cluster = Cluster.new
|
10
|
-
message = []
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
case self.type
|
12
|
+
when 'Elasticrawl::ParseJob'
|
13
|
+
message = segment_list
|
14
|
+
else
|
15
|
+
message = []
|
16
|
+
end
|
17
|
+
|
18
|
+
message.push('Job configuration')
|
19
|
+
message.push(self.job_desc)
|
20
|
+
message.push('')
|
21
|
+
message.push(cluster.cluster_desc)
|
22
|
+
|
16
23
|
message.join("\n")
|
17
24
|
end
|
18
25
|
|
19
26
|
# Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
|
20
27
|
# launched successfully.
|
21
28
|
def result_message
|
22
|
-
"\nJob
|
29
|
+
"\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
|
23
30
|
end
|
24
31
|
|
25
32
|
# Displays the history of the current job. Called by the status command.
|
data/lib/elasticrawl/job_step.rb
CHANGED
@@ -13,7 +13,7 @@ module Elasticrawl
|
|
13
13
|
# data files to process before the job exits.
|
14
14
|
def job_flow_step(job_config)
|
15
15
|
jar = job_config['jar']
|
16
|
-
max_files = self.job.max_files
|
16
|
+
max_files = self.job.max_files
|
17
17
|
|
18
18
|
step_args = []
|
19
19
|
step_args[0] = job_config['class']
|
@@ -35,8 +35,10 @@ module Elasticrawl
|
|
35
35
|
def set_step_name
|
36
36
|
case self.job.type
|
37
37
|
when 'Elasticrawl::ParseJob'
|
38
|
-
|
39
|
-
|
38
|
+
if self.crawl_segment.present?
|
39
|
+
max_files = self.job.max_files || 'all'
|
40
|
+
"#{self.crawl_segment.segment_desc} Parsing: #{max_files}"
|
41
|
+
end
|
40
42
|
when 'Elasticrawl::CombineJob'
|
41
43
|
paths = self.input_paths.split(',')
|
42
44
|
"Combining #{paths.count} jobs"
|
@@ -41,6 +41,20 @@ module Elasticrawl
|
|
41
41
|
build_s3_uri(s3_path)
|
42
42
|
end
|
43
43
|
|
44
|
+
# Return list of segment descriptions.
|
45
|
+
def segment_list
|
46
|
+
segments = ['Segments']
|
47
|
+
|
48
|
+
job_steps.each do |job_step|
|
49
|
+
if job_step.crawl_segment.present?
|
50
|
+
segment = job_step.crawl_segment
|
51
|
+
segments.push(segment.segment_desc)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
segments.push('')
|
56
|
+
end
|
57
|
+
|
44
58
|
private
|
45
59
|
# Creates a job step for the crawl segment.
|
46
60
|
def create_job_step(segment)
|
data/lib/elasticrawl/version.rb
CHANGED
@@ -0,0 +1,6 @@
|
|
1
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz
|
2
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
|
3
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz
|
4
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
|
5
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
|
6
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
|
data/spec/spec_helper.rb
CHANGED
@@ -1,31 +1,25 @@
|
|
1
1
|
require 'elasticrawl'
|
2
2
|
require 'rspec'
|
3
|
-
require 'mocha'
|
4
3
|
require 'database_cleaner'
|
5
4
|
require 'shoulda-matchers'
|
6
5
|
|
7
6
|
RSpec.configure do |config|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
Elasticrawl::Crawl.
|
7
|
+
# Run each test in a transaction and rollback data on completion.
|
8
|
+
DatabaseCleaner.strategy = :transaction
|
9
|
+
|
10
|
+
config.before(:each) do
|
11
|
+
# Stub S3 call to get WARC file paths
|
12
|
+
warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
|
13
|
+
allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths)
|
15
14
|
|
16
15
|
# Load config from spec/fixtures/ rather than ~/.elasticrawl/
|
17
16
|
config_dir = File.join(File.dirname(__FILE__), 'fixtures')
|
18
|
-
Elasticrawl::Config.
|
17
|
+
allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir)
|
19
18
|
|
20
19
|
# Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
|
21
20
|
config = Elasticrawl::Config.new
|
22
21
|
config.load_database
|
23
|
-
end
|
24
22
|
|
25
|
-
# Run each test in a transaction and rollback data on completion.
|
26
|
-
DatabaseCleaner.strategy = :transaction
|
27
|
-
|
28
|
-
config.before(:each) do
|
29
23
|
DatabaseCleaner.start
|
30
24
|
end
|
31
25
|
|
data/spec/unit/cluster_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Elasticrawl::Cluster do
|
4
4
|
describe '#create_job_flow' do
|
5
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
5
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
6
6
|
let(:job) { Elasticrawl::ParseJob.new }
|
7
7
|
let(:cluster) { Elasticrawl::Cluster.new }
|
8
8
|
subject { cluster.create_job_flow(job) }
|
@@ -16,7 +16,7 @@ describe Elasticrawl::Cluster do
|
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'should have a job flow name' do
|
19
|
-
expect(subject.name).to eq "Job
|
19
|
+
expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}"
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'should have a log uri' do
|
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticrawl::CombineJob do
|
4
4
|
describe '#set_input_jobs' do
|
5
5
|
let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
|
6
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
6
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
7
7
|
let(:segment_list_1) { crawl.crawl_segments[0..1] }
|
8
8
|
let(:segment_list_2) { [crawl.crawl_segments[2]]}
|
9
9
|
|
@@ -55,7 +55,7 @@ describe Elasticrawl::CombineJob do
|
|
55
55
|
end
|
56
56
|
|
57
57
|
describe '#run' do
|
58
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
58
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
59
59
|
let(:parse_job_1) { Elasticrawl::ParseJob.new }
|
60
60
|
let(:parse_job_2) { Elasticrawl::ParseJob.new }
|
61
61
|
let(:combine_job) { Elasticrawl::CombineJob.new }
|
@@ -71,7 +71,7 @@ describe Elasticrawl::CombineJob do
|
|
71
71
|
end
|
72
72
|
|
73
73
|
it 'should set a job flow id' do
|
74
|
-
Elasticity::JobFlow.
|
74
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
75
75
|
combine_job.run
|
76
76
|
|
77
77
|
expect(combine_job.job_flow_id).to eq job_flow_id
|
@@ -79,7 +79,7 @@ describe Elasticrawl::CombineJob do
|
|
79
79
|
end
|
80
80
|
|
81
81
|
describe '#log_uri' do
|
82
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
82
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
83
83
|
let(:parse_job) { Elasticrawl::ParseJob.new }
|
84
84
|
let(:job) { Elasticrawl::CombineJob.new }
|
85
85
|
|
@@ -6,22 +6,31 @@ describe Elasticrawl::CrawlSegment do
|
|
6
6
|
it { should have_db_column(:segment_name).of_type(:string) }
|
7
7
|
it { should have_db_column(:segment_s3_uri).of_type(:string) }
|
8
8
|
it { should have_db_column(:parse_time).of_type(:datetime) }
|
9
|
+
it { should have_db_column(:file_count).of_type(:integer) }
|
9
10
|
|
10
|
-
describe '
|
11
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
11
|
+
describe '.create_segment' do
|
12
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
13
|
+
let(:segment_name) { '1416400372202.67' }
|
14
|
+
let(:file_count) { 3 }
|
15
|
+
let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
|
16
|
+
subject { Elasticrawl::CrawlSegment.create_segment(crawl,
|
17
|
+
segment_name,
|
18
|
+
file_count) }
|
18
19
|
it 'should have a segment name' do
|
19
|
-
expect(subject.segment_name).to eq
|
20
|
+
expect(subject.segment_name).to eq segment_name
|
20
21
|
end
|
21
22
|
|
22
23
|
it 'should have an s3 uri' do
|
23
24
|
expect(subject.segment_s3_uri).to eq \
|
24
|
-
|
25
|
+
"s3://aws-publicdatasets/common-crawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should have a file count' do
|
29
|
+
expect(subject.file_count).to eq file_count
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should have a segment description' do
|
33
|
+
expect(subject.segment_desc).to eq segment_desc
|
25
34
|
end
|
26
35
|
end
|
27
36
|
end
|
data/spec/unit/crawl_spec.rb
CHANGED
@@ -5,7 +5,7 @@ describe Elasticrawl::Crawl do
|
|
5
5
|
it { should have_db_column(:crawl_name).of_type(:string) }
|
6
6
|
|
7
7
|
describe '#has_segments?' do
|
8
|
-
let(:crawl_name) { 'CC-MAIN-
|
8
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
9
9
|
subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
|
10
10
|
|
11
11
|
it 'should have segments' do
|
@@ -14,7 +14,7 @@ describe Elasticrawl::Crawl do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
describe '#create_segments' do
|
17
|
-
let(:crawl_name) { 'CC-MAIN-
|
17
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
18
18
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
19
19
|
|
20
20
|
before do
|
@@ -30,17 +30,21 @@ describe Elasticrawl::Crawl do
|
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'should create segment names' do
|
33
|
-
expect(subject.crawl_segments[0].segment_name).to eq '
|
33
|
+
expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67'
|
34
34
|
end
|
35
35
|
|
36
36
|
it 'should create segment s3 uris' do
|
37
37
|
expect(subject.crawl_segments[0].segment_s3_uri).to eq \
|
38
|
-
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-
|
38
|
+
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'should set file counts' do
|
42
|
+
expect(subject.crawl_segments[0].file_count).to eq 3
|
39
43
|
end
|
40
44
|
end
|
41
45
|
|
42
46
|
describe '#next_segments' do
|
43
|
-
let(:crawl_name) { 'CC-MAIN-
|
47
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
44
48
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
45
49
|
|
46
50
|
before do
|
@@ -52,7 +56,7 @@ describe Elasticrawl::Crawl do
|
|
52
56
|
|
53
57
|
expect(crawl_segments.count).to eq 3
|
54
58
|
expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
|
55
|
-
expect(crawl_segments[0].segment_name).to eq '
|
59
|
+
expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
|
56
60
|
end
|
57
61
|
|
58
62
|
it 'should return first # segments' do
|
@@ -60,12 +64,12 @@ describe Elasticrawl::Crawl do
|
|
60
64
|
|
61
65
|
expect(crawl_segments.count).to eq 2
|
62
66
|
expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
|
63
|
-
expect(crawl_segments[0].segment_name).to eq '
|
67
|
+
expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
|
64
68
|
end
|
65
69
|
end
|
66
70
|
|
67
71
|
describe '#select_segments' do
|
68
|
-
let(:crawl_name) { 'CC-MAIN-
|
72
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
69
73
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
70
74
|
|
71
75
|
before do
|
@@ -80,7 +84,7 @@ describe Elasticrawl::Crawl do
|
|
80
84
|
end
|
81
85
|
|
82
86
|
it 'should select only segments in list' do
|
83
|
-
segments_list = ['
|
87
|
+
segments_list = ['1416400372202.67', '1416400372490.23']
|
84
88
|
crawl_segments = subject.select_segments(segments_list)
|
85
89
|
|
86
90
|
expect(crawl_segments.count).to eq 2
|
@@ -88,7 +92,7 @@ describe Elasticrawl::Crawl do
|
|
88
92
|
end
|
89
93
|
|
90
94
|
describe '#reset' do
|
91
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
95
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
92
96
|
let(:job) { Elasticrawl::ParseJob.new }
|
93
97
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
94
98
|
|
@@ -96,7 +100,7 @@ describe Elasticrawl::Crawl do
|
|
96
100
|
crawl.create_segments
|
97
101
|
job.set_segments(crawl.crawl_segments[0..1])
|
98
102
|
|
99
|
-
Elasticity::JobFlow.
|
103
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
100
104
|
job.run
|
101
105
|
|
102
106
|
crawl.reset
|
@@ -109,9 +113,9 @@ describe Elasticrawl::Crawl do
|
|
109
113
|
end
|
110
114
|
|
111
115
|
describe '.status' do
|
112
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
113
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
114
|
-
let(:max_files) {
|
116
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
|
117
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
118
|
+
let(:max_files) { 3 }
|
115
119
|
let(:job) { Elasticrawl::ParseJob.new }
|
116
120
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
117
121
|
|
@@ -119,17 +123,18 @@ describe Elasticrawl::Crawl do
|
|
119
123
|
crawl.create_segments
|
120
124
|
job.set_segments(crawl.crawl_segments[0..1], max_files)
|
121
125
|
|
122
|
-
Elasticity::JobFlow.
|
126
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
123
127
|
job.run
|
124
128
|
end
|
125
129
|
|
126
130
|
it 'should display status of crawl segments' do
|
127
131
|
expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
|
128
|
-
'CC-MAIN-
|
132
|
+
'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3'
|
129
133
|
end
|
130
134
|
|
131
135
|
it 'should display parse job desc' do
|
132
136
|
crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
|
137
|
+
|
133
138
|
expect(crawl_status.include?(job.job_name)).to eq true
|
134
139
|
expect(crawl_status.include?(job.job_desc)).to eq true
|
135
140
|
end
|
data/spec/unit/job_step_spec.rb
CHANGED
@@ -8,11 +8,11 @@ describe Elasticrawl::JobStep do
|
|
8
8
|
|
9
9
|
describe '#job_flow_step' do
|
10
10
|
let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
|
11
|
-
:max_files =>
|
12
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
11
|
+
:max_files => 3) }
|
12
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
13
13
|
let(:crawl_segment) { crawl.crawl_segments[0] }
|
14
14
|
let(:input_paths) {
|
15
|
-
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-
|
15
|
+
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz'
|
16
16
|
}
|
17
17
|
let(:output_path) {
|
18
18
|
's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
|
@@ -54,7 +54,7 @@ describe Elasticrawl::JobStep do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it 'should have a max files arg' do
|
57
|
-
expect(subject.arguments[3]).to eq '
|
57
|
+
expect(subject.arguments[3]).to eq '3'
|
58
58
|
end
|
59
59
|
end
|
60
60
|
end
|
data/spec/unit/parse_job_spec.rb
CHANGED
@@ -3,9 +3,9 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticrawl::ParseJob do
|
4
4
|
describe '#set_segments' do
|
5
5
|
let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
|
6
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
7
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
8
|
-
let(:max_files) {
|
6
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
|
7
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
8
|
+
let(:max_files) { 3 }
|
9
9
|
let(:parse_job) { Elasticrawl::ParseJob.new }
|
10
10
|
|
11
11
|
before do
|
@@ -42,34 +42,40 @@ describe Elasticrawl::ParseJob do
|
|
42
42
|
end
|
43
43
|
|
44
44
|
describe '#confirm_message' do
|
45
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
45
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
46
46
|
let(:job) { Elasticrawl::ParseJob.new }
|
47
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
47
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
|
48
|
+
let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
|
49
|
+
|
48
50
|
let(:cluster_desc) {
|
49
|
-
cluster_desc = <<-
|
51
|
+
cluster_desc = <<-CLUSTER_DESC
|
50
52
|
Cluster configuration
|
51
53
|
Master: 1 m1.medium (Spot: 0.12)
|
52
54
|
Core: 2 m1.medium (Spot: 0.12)
|
53
55
|
Task: --
|
54
|
-
|
56
|
+
CLUSTER_DESC
|
55
57
|
}
|
56
58
|
|
57
59
|
before do
|
58
60
|
crawl.create_segments
|
59
|
-
job.set_segments(crawl.crawl_segments[0..2],
|
61
|
+
job.set_segments(crawl.crawl_segments[0..2], 3)
|
60
62
|
end
|
61
63
|
|
62
64
|
it 'should display message including job desc' do
|
63
65
|
expect(job.confirm_message.include?(job_desc)).to eq true
|
64
66
|
end
|
65
67
|
|
68
|
+
it 'should display message including segment desc' do
|
69
|
+
expect(job.confirm_message.include?(segment_desc)).to eq true
|
70
|
+
end
|
71
|
+
|
66
72
|
it 'should display message including cluster desc' do
|
67
73
|
expect(job.confirm_message.include?(cluster_desc)).to eq true
|
68
74
|
end
|
69
75
|
end
|
70
76
|
|
71
77
|
describe '#run' do
|
72
|
-
let(:crawl_name) { 'CC-MAIN-
|
78
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
73
79
|
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
74
80
|
let(:job) { Elasticrawl::ParseJob.new }
|
75
81
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
@@ -78,7 +84,7 @@ Task: --
|
|
78
84
|
crawl.create_segments
|
79
85
|
job.set_segments(crawl.crawl_segments[0..1], 5)
|
80
86
|
|
81
|
-
Elasticity::JobFlow.
|
87
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
82
88
|
job.run
|
83
89
|
end
|
84
90
|
|
@@ -94,7 +100,7 @@ Task: --
|
|
94
100
|
end
|
95
101
|
|
96
102
|
describe '#log_uri' do
|
97
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
103
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
98
104
|
let(:job) { Elasticrawl::ParseJob.new }
|
99
105
|
|
100
106
|
before do
|
@@ -108,16 +114,16 @@ Task: --
|
|
108
114
|
end
|
109
115
|
|
110
116
|
describe '#history' do
|
111
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
117
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
112
118
|
let(:job) { Elasticrawl::ParseJob.new }
|
113
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
119
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
|
114
120
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
115
121
|
|
116
122
|
before do
|
117
123
|
crawl.create_segments
|
118
124
|
job.set_segments(crawl.crawl_segments)
|
119
125
|
|
120
|
-
Elasticity::JobFlow.
|
126
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
121
127
|
job.run
|
122
128
|
end
|
123
129
|
|