elasticrawl 1.0.0 → 1.1.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +1 -1
- data/README.md +77 -108
- data/Vagrantfile +5 -5
- data/db/migrate/201401051536_create_crawls.rb +1 -1
- data/db/migrate/201401051855_create_crawl_segments.rb +1 -1
- data/db/migrate/201401101723_create_jobs.rb +1 -1
- data/db/migrate/201401141606_create_job_steps.rb +1 -1
- data/db/migrate/201412311554_add_file_count_to_crawl_segments.rb +5 -0
- data/elasticrawl.gemspec +10 -11
- data/lib/elasticrawl.rb +7 -0
- data/lib/elasticrawl/cluster.rb +1 -1
- data/lib/elasticrawl/crawl.rb +49 -31
- data/lib/elasticrawl/crawl_segment.rb +30 -0
- data/lib/elasticrawl/job.rb +13 -6
- data/lib/elasticrawl/job_step.rb +5 -3
- data/lib/elasticrawl/parse_job.rb +14 -0
- data/lib/elasticrawl/version.rb +1 -1
- data/spec/fixtures/warc.paths +6 -0
- data/spec/spec_helper.rb +8 -14
- data/spec/unit/cluster_spec.rb +2 -2
- data/spec/unit/combine_job_spec.rb +4 -4
- data/spec/unit/crawl_segment_spec.rb +19 -10
- data/spec/unit/crawl_spec.rb +21 -16
- data/spec/unit/job_step_spec.rb +4 -4
- data/spec/unit/parse_job_spec.rb +20 -14
- metadata +56 -101
data/lib/elasticrawl/crawl_segment.rb
CHANGED
@@ -4,5 +4,35 @@ module Elasticrawl
|
|
4
4
|
class CrawlSegment < ActiveRecord::Base
|
5
5
|
belongs_to :crawl
|
6
6
|
has_many :job_steps
|
7
|
+
|
8
|
+
# Description shows name and number of files in the segment.
|
9
|
+
def segment_desc
|
10
|
+
"Segment: #{segment_name} Files: #{file_count}"
|
11
|
+
end
|
12
|
+
|
13
|
+
# Creates a crawl segment based on its S3 path if it does not exist.
|
14
|
+
def self.create_segment(crawl, segment_name, file_count)
|
15
|
+
s3_uri = build_s3_uri(crawl.crawl_name, segment_name)
|
16
|
+
|
17
|
+
segment = CrawlSegment.where(:crawl_id => crawl.id,
|
18
|
+
:segment_name => segment_name,
|
19
|
+
:segment_s3_uri => s3_uri,
|
20
|
+
:file_count => file_count).first_or_create
|
21
|
+
end
|
22
|
+
|
23
|
+
private
|
24
|
+
# Generates the S3 location where this segment is stored.
|
25
|
+
def self.build_s3_uri(crawl_name, segment_name)
|
26
|
+
s3_path = ['',
|
27
|
+
Elasticrawl::COMMON_CRAWL_PATH,
|
28
|
+
crawl_name,
|
29
|
+
Elasticrawl::SEGMENTS_PATH,
|
30
|
+
segment_name,
|
31
|
+
'']
|
32
|
+
|
33
|
+
URI::Generic.build(:scheme => 's3',
|
34
|
+
:host => Elasticrawl::COMMON_CRAWL_BUCKET,
|
35
|
+
:path => s3_path.join('/'))
|
36
|
+
end
|
7
37
|
end
|
8
38
|
end
|
data/lib/elasticrawl/job.rb
CHANGED
@@ -7,19 +7,26 @@ module Elasticrawl
|
|
7
7
|
# Elastic MapReduce job flow and cluster.
|
8
8
|
def confirm_message
|
9
9
|
cluster = Cluster.new
|
10
|
-
message = []
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
11
|
+
case self.type
|
12
|
+
when 'Elasticrawl::ParseJob'
|
13
|
+
message = segment_list
|
14
|
+
else
|
15
|
+
message = []
|
16
|
+
end
|
17
|
+
|
18
|
+
message.push('Job configuration')
|
19
|
+
message.push(self.job_desc)
|
20
|
+
message.push('')
|
21
|
+
message.push(cluster.cluster_desc)
|
22
|
+
|
16
23
|
message.join("\n")
|
17
24
|
end
|
18
25
|
|
19
26
|
# Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
|
20
27
|
# launched successfully.
|
21
28
|
def result_message
|
22
|
-
"\nJob
|
29
|
+
"\nJob: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
|
23
30
|
end
|
24
31
|
|
25
32
|
# Displays the history of the current job. Called by the status command.
|
data/lib/elasticrawl/job_step.rb
CHANGED
@@ -13,7 +13,7 @@ module Elasticrawl
|
|
13
13
|
# data files to process before the job exits.
|
14
14
|
def job_flow_step(job_config)
|
15
15
|
jar = job_config['jar']
|
16
|
-
max_files = self.job.max_files
|
16
|
+
max_files = self.job.max_files
|
17
17
|
|
18
18
|
step_args = []
|
19
19
|
step_args[0] = job_config['class']
|
@@ -35,8 +35,10 @@ module Elasticrawl
|
|
35
35
|
def set_step_name
|
36
36
|
case self.job.type
|
37
37
|
when 'Elasticrawl::ParseJob'
|
38
|
-
|
39
|
-
|
38
|
+
if self.crawl_segment.present?
|
39
|
+
max_files = self.job.max_files || 'all'
|
40
|
+
"#{self.crawl_segment.segment_desc} Parsing: #{max_files}"
|
41
|
+
end
|
40
42
|
when 'Elasticrawl::CombineJob'
|
41
43
|
paths = self.input_paths.split(',')
|
42
44
|
"Combining #{paths.count} jobs"
|
data/lib/elasticrawl/parse_job.rb
CHANGED
@@ -41,6 +41,20 @@ module Elasticrawl
|
|
41
41
|
build_s3_uri(s3_path)
|
42
42
|
end
|
43
43
|
|
44
|
+
# Return list of segment descriptions.
|
45
|
+
def segment_list
|
46
|
+
segments = ['Segments']
|
47
|
+
|
48
|
+
job_steps.each do |job_step|
|
49
|
+
if job_step.crawl_segment.present?
|
50
|
+
segment = job_step.crawl_segment
|
51
|
+
segments.push(segment.segment_desc)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
segments.push('')
|
56
|
+
end
|
57
|
+
|
44
58
|
private
|
45
59
|
# Creates a job step for the crawl segment.
|
46
60
|
def create_job_step(segment)
|
data/lib/elasticrawl/version.rb
CHANGED
data/spec/fixtures/warc.paths
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00001-ip-10-235-23-156.ec2.internal.warc.gz
|
2
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
|
3
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/warc/CC-MAIN-20141119123252-00003-ip-10-235-23-156.ec2.internal.warc.gz
|
4
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
|
5
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372490.23/warc/CC-MAIN-20141119123252-00002-ip-10-235-23-156.ec2.internal.warc.gz
|
6
|
+
common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372542.20/warc/CC-MAIN-20141119123252-00000-ip-10-235-23-156.ec2.internal.warc.gz
|
data/spec/spec_helper.rb
CHANGED
@@ -1,31 +1,25 @@
|
|
1
1
|
require 'elasticrawl'
|
2
2
|
require 'rspec'
|
3
|
-
require 'mocha'
|
4
3
|
require 'database_cleaner'
|
5
4
|
require 'shoulda-matchers'
|
6
5
|
|
7
6
|
RSpec.configure do |config|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
Elasticrawl::Crawl.
|
7
|
+
# Run each test in a transaction and rollback data on completion.
|
8
|
+
DatabaseCleaner.strategy = :transaction
|
9
|
+
|
10
|
+
config.before(:each) do
|
11
|
+
# Stub S3 call to get WARC file paths
|
12
|
+
warc_paths = IO.read(File.join(File.dirname(__FILE__), 'fixtures', 'warc.paths'))
|
13
|
+
allow_any_instance_of(Elasticrawl::Crawl).to receive(:warc_paths).and_return(warc_paths)
|
15
14
|
|
16
15
|
# Load config from spec/fixtures/ rather than ~/.elasticrawl/
|
17
16
|
config_dir = File.join(File.dirname(__FILE__), 'fixtures')
|
18
|
-
Elasticrawl::Config.
|
17
|
+
allow_any_instance_of(Elasticrawl::Config).to receive(:config_dir).and_return(config_dir)
|
19
18
|
|
20
19
|
# Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
|
21
20
|
config = Elasticrawl::Config.new
|
22
21
|
config.load_database
|
23
|
-
end
|
24
22
|
|
25
|
-
# Run each test in a transaction and rollback data on completion.
|
26
|
-
DatabaseCleaner.strategy = :transaction
|
27
|
-
|
28
|
-
config.before(:each) do
|
29
23
|
DatabaseCleaner.start
|
30
24
|
end
|
31
25
|
|
data/spec/unit/cluster_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Elasticrawl::Cluster do
|
4
4
|
describe '#create_job_flow' do
|
5
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
5
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
6
6
|
let(:job) { Elasticrawl::ParseJob.new }
|
7
7
|
let(:cluster) { Elasticrawl::Cluster.new }
|
8
8
|
subject { cluster.create_job_flow(job) }
|
@@ -16,7 +16,7 @@ describe Elasticrawl::Cluster do
|
|
16
16
|
end
|
17
17
|
|
18
18
|
it 'should have a job flow name' do
|
19
|
-
expect(subject.name).to eq "Job
|
19
|
+
expect(subject.name).to eq "Job: #{job.job_name} #{job.job_desc}"
|
20
20
|
end
|
21
21
|
|
22
22
|
it 'should have a log uri' do
|
data/spec/unit/combine_job_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticrawl::CombineJob do
|
4
4
|
describe '#set_input_jobs' do
|
5
5
|
let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
|
6
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
6
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
7
7
|
let(:segment_list_1) { crawl.crawl_segments[0..1] }
|
8
8
|
let(:segment_list_2) { [crawl.crawl_segments[2]]}
|
9
9
|
|
@@ -55,7 +55,7 @@ describe Elasticrawl::CombineJob do
|
|
55
55
|
end
|
56
56
|
|
57
57
|
describe '#run' do
|
58
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
58
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
59
59
|
let(:parse_job_1) { Elasticrawl::ParseJob.new }
|
60
60
|
let(:parse_job_2) { Elasticrawl::ParseJob.new }
|
61
61
|
let(:combine_job) { Elasticrawl::CombineJob.new }
|
@@ -71,7 +71,7 @@ describe Elasticrawl::CombineJob do
|
|
71
71
|
end
|
72
72
|
|
73
73
|
it 'should set a job flow id' do
|
74
|
-
Elasticity::JobFlow.
|
74
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
75
75
|
combine_job.run
|
76
76
|
|
77
77
|
expect(combine_job.job_flow_id).to eq job_flow_id
|
@@ -79,7 +79,7 @@ describe Elasticrawl::CombineJob do
|
|
79
79
|
end
|
80
80
|
|
81
81
|
describe '#log_uri' do
|
82
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
82
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
83
83
|
let(:parse_job) { Elasticrawl::ParseJob.new }
|
84
84
|
let(:job) { Elasticrawl::CombineJob.new }
|
85
85
|
|
data/spec/unit/crawl_segment_spec.rb
CHANGED
@@ -6,22 +6,31 @@ describe Elasticrawl::CrawlSegment do
|
|
6
6
|
it { should have_db_column(:segment_name).of_type(:string) }
|
7
7
|
it { should have_db_column(:segment_s3_uri).of_type(:string) }
|
8
8
|
it { should have_db_column(:parse_time).of_type(:datetime) }
|
9
|
+
it { should have_db_column(:file_count).of_type(:integer) }
|
9
10
|
|
10
|
-
describe '
|
11
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
11
|
+
describe '.create_segment' do
|
12
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
13
|
+
let(:segment_name) { '1416400372202.67' }
|
14
|
+
let(:file_count) { 3 }
|
15
|
+
let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
|
16
|
+
subject { Elasticrawl::CrawlSegment.create_segment(crawl,
|
17
|
+
segment_name,
|
18
|
+
file_count) }
|
18
19
|
it 'should have a segment name' do
|
19
|
-
expect(subject.segment_name).to eq
|
20
|
+
expect(subject.segment_name).to eq segment_name
|
20
21
|
end
|
21
22
|
|
22
23
|
it 'should have an s3 uri' do
|
23
24
|
expect(subject.segment_s3_uri).to eq \
|
24
|
-
|
25
|
+
"s3://aws-publicdatasets/common-crawl/crawl-data/#{crawl.crawl_name}/segments/#{segment_name}/"
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'should have a file count' do
|
29
|
+
expect(subject.file_count).to eq file_count
|
30
|
+
end
|
31
|
+
|
32
|
+
it 'should have a segment description' do
|
33
|
+
expect(subject.segment_desc).to eq segment_desc
|
25
34
|
end
|
26
35
|
end
|
27
36
|
end
|
data/spec/unit/crawl_spec.rb
CHANGED
@@ -5,7 +5,7 @@ describe Elasticrawl::Crawl do
|
|
5
5
|
it { should have_db_column(:crawl_name).of_type(:string) }
|
6
6
|
|
7
7
|
describe '#has_segments?' do
|
8
|
-
let(:crawl_name) { 'CC-MAIN-
|
8
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
9
9
|
subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
|
10
10
|
|
11
11
|
it 'should have segments' do
|
@@ -14,7 +14,7 @@ describe Elasticrawl::Crawl do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
describe '#create_segments' do
|
17
|
-
let(:crawl_name) { 'CC-MAIN-
|
17
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
18
18
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
19
19
|
|
20
20
|
before do
|
@@ -30,17 +30,21 @@ describe Elasticrawl::Crawl do
|
|
30
30
|
end
|
31
31
|
|
32
32
|
it 'should create segment names' do
|
33
|
-
expect(subject.crawl_segments[0].segment_name).to eq '
|
33
|
+
expect(subject.crawl_segments[0].segment_name).to eq '1416400372202.67'
|
34
34
|
end
|
35
35
|
|
36
36
|
it 'should create segment s3 uris' do
|
37
37
|
expect(subject.crawl_segments[0].segment_s3_uri).to eq \
|
38
|
-
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-
|
38
|
+
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1416400372202.67/'
|
39
|
+
end
|
40
|
+
|
41
|
+
it 'should set file counts' do
|
42
|
+
expect(subject.crawl_segments[0].file_count).to eq 3
|
39
43
|
end
|
40
44
|
end
|
41
45
|
|
42
46
|
describe '#next_segments' do
|
43
|
-
let(:crawl_name) { 'CC-MAIN-
|
47
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
44
48
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
45
49
|
|
46
50
|
before do
|
@@ -52,7 +56,7 @@ describe Elasticrawl::Crawl do
|
|
52
56
|
|
53
57
|
expect(crawl_segments.count).to eq 3
|
54
58
|
expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
|
55
|
-
expect(crawl_segments[0].segment_name).to eq '
|
59
|
+
expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
|
56
60
|
end
|
57
61
|
|
58
62
|
it 'should return first # segments' do
|
@@ -60,12 +64,12 @@ describe Elasticrawl::Crawl do
|
|
60
64
|
|
61
65
|
expect(crawl_segments.count).to eq 2
|
62
66
|
expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
|
63
|
-
expect(crawl_segments[0].segment_name).to eq '
|
67
|
+
expect(crawl_segments[0].segment_name).to eq '1416400372202.67'
|
64
68
|
end
|
65
69
|
end
|
66
70
|
|
67
71
|
describe '#select_segments' do
|
68
|
-
let(:crawl_name) { 'CC-MAIN-
|
72
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
69
73
|
subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
70
74
|
|
71
75
|
before do
|
@@ -80,7 +84,7 @@ describe Elasticrawl::Crawl do
|
|
80
84
|
end
|
81
85
|
|
82
86
|
it 'should select only segments in list' do
|
83
|
-
segments_list = ['
|
87
|
+
segments_list = ['1416400372202.67', '1416400372490.23']
|
84
88
|
crawl_segments = subject.select_segments(segments_list)
|
85
89
|
|
86
90
|
expect(crawl_segments.count).to eq 2
|
@@ -88,7 +92,7 @@ describe Elasticrawl::Crawl do
|
|
88
92
|
end
|
89
93
|
|
90
94
|
describe '#reset' do
|
91
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
95
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
92
96
|
let(:job) { Elasticrawl::ParseJob.new }
|
93
97
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
94
98
|
|
@@ -96,7 +100,7 @@ describe Elasticrawl::Crawl do
|
|
96
100
|
crawl.create_segments
|
97
101
|
job.set_segments(crawl.crawl_segments[0..1])
|
98
102
|
|
99
|
-
Elasticity::JobFlow.
|
103
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
100
104
|
job.run
|
101
105
|
|
102
106
|
crawl.reset
|
@@ -109,9 +113,9 @@ describe Elasticrawl::Crawl do
|
|
109
113
|
end
|
110
114
|
|
111
115
|
describe '.status' do
|
112
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
113
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
114
|
-
let(:max_files) {
|
116
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
|
117
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
118
|
+
let(:max_files) { 3 }
|
115
119
|
let(:job) { Elasticrawl::ParseJob.new }
|
116
120
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
117
121
|
|
@@ -119,17 +123,18 @@ describe Elasticrawl::Crawl do
|
|
119
123
|
crawl.create_segments
|
120
124
|
job.set_segments(crawl.crawl_segments[0..1], max_files)
|
121
125
|
|
122
|
-
Elasticity::JobFlow.
|
126
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
123
127
|
job.run
|
124
128
|
end
|
125
129
|
|
126
130
|
it 'should display status of crawl segments' do
|
127
131
|
expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
|
128
|
-
'CC-MAIN-
|
132
|
+
'CC-MAIN-2014-49 Segments: to parse 1, parsed 2, total 3'
|
129
133
|
end
|
130
134
|
|
131
135
|
it 'should display parse job desc' do
|
132
136
|
crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
|
137
|
+
|
133
138
|
expect(crawl_status.include?(job.job_name)).to eq true
|
134
139
|
expect(crawl_status.include?(job.job_desc)).to eq true
|
135
140
|
end
|
data/spec/unit/job_step_spec.rb
CHANGED
@@ -8,11 +8,11 @@ describe Elasticrawl::JobStep do
|
|
8
8
|
|
9
9
|
describe '#job_flow_step' do
|
10
10
|
let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
|
11
|
-
:max_files =>
|
12
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
11
|
+
:max_files => 3) }
|
12
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
13
13
|
let(:crawl_segment) { crawl.crawl_segments[0] }
|
14
14
|
let(:input_paths) {
|
15
|
-
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-
|
15
|
+
's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2014-49/segments/1368696381249/wet/*.warc.wet.gz'
|
16
16
|
}
|
17
17
|
let(:output_path) {
|
18
18
|
's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
|
@@ -54,7 +54,7 @@ describe Elasticrawl::JobStep do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
it 'should have a max files arg' do
|
57
|
-
expect(subject.arguments[3]).to eq '
|
57
|
+
expect(subject.arguments[3]).to eq '3'
|
58
58
|
end
|
59
59
|
end
|
60
60
|
end
|
data/spec/unit/parse_job_spec.rb
CHANGED
@@ -3,9 +3,9 @@ require 'spec_helper'
|
|
3
3
|
describe Elasticrawl::ParseJob do
|
4
4
|
describe '#set_segments' do
|
5
5
|
let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
|
6
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
7
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
8
|
-
let(:max_files) {
|
6
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 2 Parsing: 3 files per segment' }
|
7
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
8
|
+
let(:max_files) { 3 }
|
9
9
|
let(:parse_job) { Elasticrawl::ParseJob.new }
|
10
10
|
|
11
11
|
before do
|
@@ -42,34 +42,40 @@ describe Elasticrawl::ParseJob do
|
|
42
42
|
end
|
43
43
|
|
44
44
|
describe '#confirm_message' do
|
45
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
45
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
46
46
|
let(:job) { Elasticrawl::ParseJob.new }
|
47
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
47
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: 3 files per segment' }
|
48
|
+
let(:segment_desc) { 'Segment: 1416400372202.67 Files: 3' }
|
49
|
+
|
48
50
|
let(:cluster_desc) {
|
49
|
-
cluster_desc = <<-
|
51
|
+
cluster_desc = <<-CLUSTER_DESC
|
50
52
|
Cluster configuration
|
51
53
|
Master: 1 m1.medium (Spot: 0.12)
|
52
54
|
Core: 2 m1.medium (Spot: 0.12)
|
53
55
|
Task: --
|
54
|
-
|
56
|
+
CLUSTER_DESC
|
55
57
|
}
|
56
58
|
|
57
59
|
before do
|
58
60
|
crawl.create_segments
|
59
|
-
job.set_segments(crawl.crawl_segments[0..2],
|
61
|
+
job.set_segments(crawl.crawl_segments[0..2], 3)
|
60
62
|
end
|
61
63
|
|
62
64
|
it 'should display message including job desc' do
|
63
65
|
expect(job.confirm_message.include?(job_desc)).to eq true
|
64
66
|
end
|
65
67
|
|
68
|
+
it 'should display message including segment desc' do
|
69
|
+
expect(job.confirm_message.include?(segment_desc)).to eq true
|
70
|
+
end
|
71
|
+
|
66
72
|
it 'should display message including cluster desc' do
|
67
73
|
expect(job.confirm_message.include?(cluster_desc)).to eq true
|
68
74
|
end
|
69
75
|
end
|
70
76
|
|
71
77
|
describe '#run' do
|
72
|
-
let(:crawl_name) { 'CC-MAIN-
|
78
|
+
let(:crawl_name) { 'CC-MAIN-2014-49' }
|
73
79
|
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
|
74
80
|
let(:job) { Elasticrawl::ParseJob.new }
|
75
81
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
@@ -78,7 +84,7 @@ Task: --
|
|
78
84
|
crawl.create_segments
|
79
85
|
job.set_segments(crawl.crawl_segments[0..1], 5)
|
80
86
|
|
81
|
-
Elasticity::JobFlow.
|
87
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
82
88
|
job.run
|
83
89
|
end
|
84
90
|
|
@@ -94,7 +100,7 @@ Task: --
|
|
94
100
|
end
|
95
101
|
|
96
102
|
describe '#log_uri' do
|
97
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
103
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
98
104
|
let(:job) { Elasticrawl::ParseJob.new }
|
99
105
|
|
100
106
|
before do
|
@@ -108,16 +114,16 @@ Task: --
|
|
108
114
|
end
|
109
115
|
|
110
116
|
describe '#history' do
|
111
|
-
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-
|
117
|
+
let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2014-49') }
|
112
118
|
let(:job) { Elasticrawl::ParseJob.new }
|
113
|
-
let(:job_desc) { 'Crawl: CC-MAIN-
|
119
|
+
let(:job_desc) { 'Crawl: CC-MAIN-2014-49 Segments: 3 Parsing: all files' }
|
114
120
|
let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
|
115
121
|
|
116
122
|
before do
|
117
123
|
crawl.create_segments
|
118
124
|
job.set_segments(crawl.crawl_segments)
|
119
125
|
|
120
|
-
Elasticity::JobFlow.
|
126
|
+
allow_any_instance_of(Elasticity::JobFlow).to receive(:run).and_return(job_flow_id)
|
121
127
|
job.run
|
122
128
|
end
|
123
129
|
|