elasticrawl 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
@@ -0,0 +1,46 @@
|
|
1
|
+
module Elasticrawl
  # Represents an Elastic MapReduce job flow step. For a parse job this will
  # process a single Common Crawl segment. For a combine job a single step
  # will aggregate the results of multiple parse jobs.
  class JobStep < ActiveRecord::Base
    belongs_to :job
    belongs_to :crawl_segment

    # Returns a custom jar step that is configured with the jar location,
    # class name and input and output paths.
    #
    # For parse jobs optionally specifies the maximum # of Common Crawl
    # data files to process before the job exits.
    #
    # job_config - hash with 'jar' (S3 jar location) and 'class' (main class) keys.
    # Returns an Elasticity::CustomJarStep ready to be added to a job flow.
    def job_flow_step(job_config)
      jar = job_config['jar']
      max_files = self.job.max_files

      step_args = []
      step_args[0] = job_config['class']
      step_args[1] = self.input_paths
      step_args[2] = self.output_path
      # All arguments must be strings.
      step_args[3] = max_files.to_s if max_files.present?

      step = Elasticity::CustomJarStep.new(jar)
      step.name = set_step_name
      step.arguments = step_args

      step
    end

    private

    # Returns the Elastic MapReduce job flow step name based on the type of
    # job it belongs to (segment name for parse jobs, job count for combine
    # jobs).
    def set_step_name
      case self.job.type
      when 'Elasticrawl::ParseJob'
        # Fixed missing space after '=' in the original assignment.
        segment = self.crawl_segment.segment_name if self.crawl_segment.present?
        "Segment: #{segment}"
      when 'Elasticrawl::CombineJob'
        paths = self.input_paths.split(',')
        "Combining #{paths.count} jobs"
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
module Elasticrawl
  # Represents an Elastic MapReduce job flow that parses segments of
  # Common Crawl data. A job step is created per segment.
  #
  # Inherits from Job which is the ActiveRecord model class.
  class ParseJob < Job
    # Populates the job (name, description, max files and one step per
    # segment) from the list of segments to be parsed.
    def set_segments(crawl_segments, max_files = nil)
      self.job_name = set_job_name
      self.job_desc = set_job_desc(crawl_segments, max_files)
      self.max_files = max_files

      crawl_segments.each do |segment|
        self.job_steps.push(create_job_step(segment))
      end
    end

    # Runs the job by calling the Elastic MapReduce API. If successful the
    # parse time is recorded against each segment and the job is saved.
    def run
      emr_config = job_config['emr_config']
      job_flow_id = run_job_flow(emr_config)

      if job_flow_id.present?
        self.job_flow_id = job_flow_id

        self.job_steps.each do |step|
          segment = step.crawl_segment
          segment.parse_time = DateTime.now
          segment.save
        end

        self.save
        self.result_message
      end
    end

    # Returns the S3 location for storing Elastic MapReduce job logs.
    def log_uri
      build_s3_uri("/logs/1-parse/#{self.job_name}/")
    end

    private

    # Creates a job step for the crawl segment.
    def create_job_step(segment)
      JobStep.create(:job => self,
                     :crawl_segment => segment,
                     :input_paths => segment_input(segment),
                     :output_path => segment_output(segment))
    end

    # Returns the S3 location for reading a crawl segment. The input filter
    # determines which type of Common Crawl data files are parsed.
    def segment_input(segment)
      segment.segment_s3_uri + job_config['input_filter']
    end

    # Returns the S3 location for storing the step results. This includes
    # the segment name.
    def segment_output(segment)
      job_path = "/data/1-parse/#{self.job_name}"
      build_s3_uri("#{job_path}/segments/#{segment.segment_name}/")
    end

    # Builds the job description which forms part of the Elastic MapReduce
    # job flow name. When no segments are given the crawl name and file
    # description interpolate as empty strings.
    def set_job_desc(segments, max_files)
      if segments.count > 0
        crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present?
        file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment"
      end

      "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}"
    end

    # Returns the parse step configuration loaded from the user's jobs.yml
    # config file.
    def job_config
      config = Config.new
      config.load_config('jobs')['steps']['parse']
    end
  end
end
|
data/lib/elasticrawl.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# External gem dependencies.
require 'aws-sdk'
require 'active_record'
require 'active_support'
require 'elasticity'
require 'highline/import'
require 'thor'

module Elasticrawl
  require 'elasticrawl/version'

  # Configuration and error handling.
  require 'elasticrawl/config'
  require 'elasticrawl/error'

  # Models and job classes. Job must load before its subclasses.
  require 'elasticrawl/cluster'
  require 'elasticrawl/crawl'
  require 'elasticrawl/crawl_segment'
  require 'elasticrawl/job'
  require 'elasticrawl/combine_job'
  require 'elasticrawl/parse_job'
  require 'elasticrawl/job_step'
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Configures the Elastic MapReduce cluster that is launched to run parse and
# combine jobs. The list of EC2 instance types can be found at
# http://aws.amazon.com/ec2/instance-types/#instance-details

# Using spot instances is recommended to reduce costs. However if the spot
# price rises above your bid price the cluster may be terminated. Elasticrawl
# tries to reduce the effect of this by parsing each Common Crawl segment
# in a separate job flow step.

# The master node manages the cluster.
master_instance_group:
  instance_type: m1.medium
  use_spot_instances: true
  bid_price: 0.120

# Core nodes run map and reduce tasks and store data using HDFS.
core_instance_group:
  instance_type: m1.medium
  instance_count: 2
  use_spot_instances: true
  bid_price: 0.120

# Task nodes are optional and only run map and reduce tasks.
task_instance_group:
  instance_type: m1.small
  instance_count: 0
  use_spot_instances: true
  bid_price: 0.080

# Array of bootstrap scripts that will be applied when the cluster nodes are
# initialized. The example installs the Ganglia distributed monitoring system.
bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']

# Specifying an EC2 key pair allows SSH access to the master node. This also
# allows accessing the Hadoop Web UI over an SSH tunnel.
ec2_key_name: 'elasticrawl'

# Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
# recommended since the Common Crawl corpus is stored there. Otherwise inter
# region data transfer charges will apply.
placement: 'us-east-1c'

# The AMI version to use when launching instances.
emr_ami_version: 'latest'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
# corpus.

# An S3 bucket is created by the init command and is used to store data and logs.
s3_bucket_name: 'elasticrawl'

# A parse step is created per Common Crawl segment. A combine step takes the
# results from multiple segments to create a single set of output files.

# The parse input filter is used to specify the Common Crawl file type.

# WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
# WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
# WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.

# The EMR config is an XML file that sets Hadoop properties. If a config file
# is specified then a bootstrap action is run on each node to apply it.
steps:
  # Parse step for the Example Elasticrawl JAR. This does a word count
  # against the text extractions of the corpus.
  parse:
    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
    class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
    input_filter: 'wet/*.warc.wet.gz'
    emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'

  # Combine step for the Example Elasticrawl JAR.
  combine:
    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
    class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
    input_filter: 'part-*'
    emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'elasticrawl'
require 'rspec'
require 'mocha'
require 'database_cleaner'
require 'shoulda-matchers'

RSpec.configure do |config|
  config.before(:suite) do
    # Return S3 paths that are used to create a crawl object with 3 crawl segments.
    segment_paths = []
    segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
    segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
    segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
    Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)

    # Load config from spec/fixtures/ rather than ~/.elasticrawl/
    config_dir = File.join(File.dirname(__FILE__), 'fixtures')
    Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)

    # Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
    #
    # NOTE: named app_config (not config) so this hook does not reassign the
    # captured RSpec configuration block parameter when it runs.
    app_config = Elasticrawl::Config.new
    app_config.load_database
  end

  # Run each test in a transaction and rollback data on completion.
  DatabaseCleaner.strategy = :transaction

  config.before(:each) do
    DatabaseCleaner.start
  end

  config.after(:each) do
    DatabaseCleaner.clean
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::Cluster do
  describe '#create_job_flow' do
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:job) { Elasticrawl::ParseJob.new }
    let(:cluster) { Elasticrawl::Cluster.new }
    subject { cluster.create_job_flow(job) }

    before do
      job.set_segments(crawl.crawl_segments)
    end

    it 'should be an Elasticity::JobFlow' do
      expect(subject).to be_a Elasticity::JobFlow
    end

    it 'should have a job flow name' do
      expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
    end

    it 'should have a log uri' do
      expect(subject.log_uri).to eq job.log_uri
    end

    it 'should have an ec2 key name' do
      expect(subject.ec2_key_name).to eq 'elasticrawl'
    end

    it 'should have a placement az name' do
      expect(subject.placement).to eq 'us-east-1c'
    end

    it 'should have an ami version' do
      expect(subject.ami_version).to eq 'latest'
    end
  end

  describe '#cluster_desc' do
    # NOTE(review): heredoc indentation was lost in extraction — confirm the
    # expected whitespace against Cluster#cluster_desc output.
    let(:cluster_desc) {
      cluster_desc = <<-HERE
Cluster configuration
Master: 1 m1.medium (Spot: 0.12)
Core: 2 m1.medium (Spot: 0.12)
Task: --
HERE
    }
    subject { Elasticrawl::Cluster.new }

    it 'should describe configured instance groups' do
      expect(subject.cluster_desc).to eq cluster_desc
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::CombineJob do
  describe '#set_input_jobs' do
    let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:segment_list_1) { crawl.crawl_segments[0..1] }
    let(:segment_list_2) { [crawl.crawl_segments[2]] }

    let(:parse_job_1) { Elasticrawl::ParseJob.new }
    let(:parse_job_2) { Elasticrawl::ParseJob.new }
    let(:combine_job) { Elasticrawl::CombineJob.new }

    before do
      crawl.create_segments
      parse_job_1.set_segments(segment_list_1)
      parse_job_2.set_segments(segment_list_2)

      input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
      combine_job.set_input_jobs(input_jobs)
    end

    it 'should have a job name based on current time' do
      expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
    end

    it 'should have a job desc' do
      expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true
    end

    it 'should create 1 job step' do
      expect(combine_job.job_steps.count).to eq 1
    end

    it 'should set 1 input path per parse job' do
      input_paths = combine_job.job_steps[0].input_paths
      expect(input_paths.split(',').count).to eq 2
    end

    it 'should set input path including parse job name' do
      input_paths = combine_job.job_steps[0].input_paths
      expect(input_paths.include?(parse_job_1.job_name)).to eq true
    end

    it 'should set input path without segment names' do
      input_paths = combine_job.job_steps[0].input_paths
      segment_name = segment_list_1[0].segment_name
      expect(input_paths.include?(segment_name)).to eq false
    end

    it 'should set output path including job name' do
      output_path = combine_job.job_steps[0].output_path
      expect(output_path.include?(combine_job.job_name)).to eq true
    end
  end

  describe '#run' do
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:parse_job_1) { Elasticrawl::ParseJob.new }
    let(:parse_job_2) { Elasticrawl::ParseJob.new }
    let(:combine_job) { Elasticrawl::CombineJob.new }
    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }

    before do
      crawl.create_segments
      parse_job_1.set_segments(crawl.crawl_segments[0..1])
      parse_job_2.set_segments([crawl.crawl_segments[2]])

      input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
      combine_job.set_input_jobs(input_jobs)
    end

    it 'should set a job flow id' do
      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
      combine_job.run

      expect(combine_job.job_flow_id).to eq job_flow_id
    end
  end

  describe '#log_uri' do
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:parse_job) { Elasticrawl::ParseJob.new }
    let(:job) { Elasticrawl::CombineJob.new }

    before do
      crawl.create_segments
      parse_job.set_segments(crawl.crawl_segments)

      job.set_input_jobs([parse_job.job_name])
    end

    it 'should set a log uri including the job name' do
      expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/"
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::Config do
  describe '#load_config' do
    subject { Elasticrawl::Config.new }

    it 'should return a hash of config data' do
      config_data = subject.load_config('jobs')
      expect(config_data).to be_a Hash
    end

    it 'should load yaml config file' do
      config_data = subject.load_config('jobs')
      expect(config_data['s3_bucket_name']).to eq 'elasticrawl'
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::CrawlSegment do
  it { should belong_to(:crawl) }
  it { should have_many(:job_steps) }
  it { should have_db_column(:segment_name).of_type(:string) }
  it { should have_db_column(:segment_s3_uri).of_type(:string) }
  it { should have_db_column(:parse_time).of_type(:datetime) }

  describe '#initialize' do
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    subject { crawl.crawl_segments[0] }

    before do
      crawl.create_segments
    end

    it 'should have a segment name' do
      expect(subject.segment_name).to eq '1368696381249'
    end

    it 'should have an s3 uri' do
      expect(subject.segment_s3_uri).to eq \
        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::Crawl do
  it { should have_many(:crawl_segments) }
  it { should have_db_column(:crawl_name).of_type(:string) }

  describe '#has_segments?' do
    let(:crawl_name) { 'CC-MAIN-2013-20' }
    subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }

    it 'should have segments' do
      expect(subject.has_segments?).to eq true
    end
  end

  describe '#create_segments' do
    let(:crawl_name) { 'CC-MAIN-2013-20' }
    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }

    before do
      subject.create_segments
    end

    it 'should set crawl name' do
      expect(subject.crawl_name).to eq crawl_name
    end

    it 'should create correct # of segments' do
      expect(subject.crawl_segments.count).to eq 3
    end

    it 'should create segment names' do
      expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
    end

    it 'should create segment s3 uris' do
      expect(subject.crawl_segments[0].segment_s3_uri).to eq \
        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
    end
  end

  describe '#next_segments' do
    let(:crawl_name) { 'CC-MAIN-2013-20' }
    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }

    before do
      subject.create_segments
    end

    it 'should return all segments' do
      crawl_segments = subject.next_segments

      expect(crawl_segments.count).to eq 3
      expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
      expect(crawl_segments[0].segment_name).to eq '1368696381249'
    end

    it 'should return first # segments' do
      crawl_segments = subject.next_segments(2)

      expect(crawl_segments.count).to eq 2
      expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
      expect(crawl_segments[0].segment_name).to eq '1368696381249'
    end
  end

  describe '#select_segments' do
    let(:crawl_name) { 'CC-MAIN-2013-20' }
    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }

    before do
      subject.create_segments
    end

    it 'should select no segments' do
      segments_list = ['test', 'segment']
      crawl_segments = subject.select_segments(segments_list)

      expect(crawl_segments.count).to eq 0
    end

    it 'should select only segments in list' do
      segments_list = ['1368696381249', '1368696382185']
      crawl_segments = subject.select_segments(segments_list)

      expect(crawl_segments.count).to eq 2
    end
  end

  describe '#reset' do
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:job) { Elasticrawl::ParseJob.new }
    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }

    before do
      crawl.create_segments
      job.set_segments(crawl.crawl_segments[0..1])

      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
      job.run

      crawl.reset
    end

    it 'should set parse time of all segments to null' do
      unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count
      expect(crawl.crawl_segments.count).to eq unparsed_segments
    end
  end

  describe '.status' do
    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:max_files) { 5 }
    let(:job) { Elasticrawl::ParseJob.new }
    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }

    before do
      crawl.create_segments
      job.set_segments(crawl.crawl_segments[0..1], max_files)

      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
      job.run
    end

    it 'should display status of crawl segments' do
      expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
        'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
    end

    it 'should display parse job desc' do
      crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
      expect(crawl_status.include?(job.job_name)).to eq true
      expect(crawl_status.include?(job.job_desc)).to eq true
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::Job do
  it { should have_many(:job_steps) }
  it { should have_db_column(:type).of_type(:string) }
  it { should have_db_column(:job_name).of_type(:string) }
  it { should have_db_column(:job_desc).of_type(:string) }
  it { should have_db_column(:max_files).of_type(:integer) }
  it { should have_db_column(:job_flow_id).of_type(:string) }
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'spec_helper'

describe Elasticrawl::JobStep do
  it { should belong_to(:job) }
  it { should belong_to(:crawl_segment) }
  it { should have_db_column(:input_paths).of_type(:text) }
  it { should have_db_column(:output_path).of_type(:text) }

  describe '#job_flow_step' do
    let(:job) {
      Elasticrawl::ParseJob.create(:job_name => '1389789645620',
                                   :max_files => 5)
    }
    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
    let(:crawl_segment) { crawl.crawl_segments[0] }
    let(:input_paths) {
      's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
    }
    let(:output_path) {
      's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
    }
    let(:config) {
      { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar',
        'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver'
      }
    }

    let(:job_step) {
      Elasticrawl::JobStep.create(:job => job,
                                  :crawl_segment => crawl_segment,
                                  :input_paths => input_paths,
                                  :output_path => output_path)
    }
    subject { job_step.job_flow_step(config) }

    it 'should be a CustomJarStep' do
      expect(subject).to be_a Elasticity::CustomJarStep
    end

    it 'should have a jar location' do
      expect(subject.jar).to eq config['jar']
    end

    it 'should have 4 jar args' do
      expect(subject.arguments.count).to eq 4
    end

    it 'should have a class argument' do
      expect(subject.arguments[0]).to eq config['class']
    end

    it 'should have an input path arg' do
      expect(subject.arguments[1]).to eq input_paths
    end

    it 'should have an output path arg' do
      expect(subject.arguments[2]).to eq output_path
    end

    it 'should have a max files arg' do
      expect(subject.arguments[3]).to eq '5'
    end
  end
end
|