elasticrawl 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
data/lib/elasticrawl/job_step.rb
ADDED
@@ -0,0 +1,46 @@
+module Elasticrawl
+  # Represents an Elastic MapReduce job flow step. For a parse job this will
+  # process a single Common Crawl segment. For a combine job a single step
+  # will aggregate the results of multiple parse jobs.
+  class JobStep < ActiveRecord::Base
+    belongs_to :job
+    belongs_to :crawl_segment
+
+    # Returns a custom jar step that is configured with the jar location,
+    # class name and input and output paths.
+    #
+    # For parse jobs optionally specifies the maximum # of Common Crawl
+    # data files to process before the job exits.
+    def job_flow_step(job_config)
+      jar = job_config['jar']
+      max_files = self.job.max_files
+
+      step_args = []
+      step_args[0] = job_config['class']
+      step_args[1] = self.input_paths
+      step_args[2] = self.output_path
+      # All arguments must be strings.
+      step_args[3] = max_files.to_s if max_files.present?
+
+      step = Elasticity::CustomJarStep.new(jar)
+      step.name = set_step_name
+      step.arguments = step_args
+
+      step
+    end
+
+    private
+    # Sets the Elastic MapReduce job flow step name based on the type of job it
+    # belongs to.
+    def set_step_name
+      case self.job.type
+      when 'Elasticrawl::ParseJob'
+        segment = self.crawl_segment.segment_name if self.crawl_segment.present?
+        "Segment: #{segment}"
+      when 'Elasticrawl::CombineJob'
+        paths = self.input_paths.split(',')
+        "Combining #{paths.count} jobs"
+      end
+    end
+  end
+end
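
The Elasticity calls above are the only integration points with the EMR API. As a minimal sketch, assuming illustrative values drawn from the fixture files later in this diff, the returned step looks like this when built by hand:

require 'elasticity'

# Sketch of the step that JobStep#job_flow_step returns. Only the Elasticity
# calls used above are assumed; jar, class and paths are illustrative values.
step = Elasticity::CustomJarStep.new('s3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar')
step.name = 'Segment: 1368696381249'
step.arguments = [
  'com.rossfairbanks.elasticrawl.examples.WordCount',   # class to run
  's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz',
  's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/',
  '5'                                                    # max_files as a string
]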
data/lib/elasticrawl/parse_job.rb
ADDED
@@ -0,0 +1,84 @@
+module Elasticrawl
+  # Represents an Elastic MapReduce job flow that parses segments of
+  # Common Crawl data. A job step is created per segment.
+  #
+  # Inherits from Job which is the ActiveRecord model class.
+  class ParseJob < Job
+    # Populates the job from the list of segments to be parsed.
+    def set_segments(crawl_segments, max_files = nil)
+      self.job_name = set_job_name
+      self.job_desc = set_job_desc(crawl_segments, max_files)
+      self.max_files = max_files
+
+      crawl_segments.each do |segment|
+        self.job_steps.push(create_job_step(segment))
+      end
+    end
+
+    # Runs the job by calling the Elastic MapReduce API. If successful the
+    # parse time is set for each segment.
+    def run
+      emr_config = job_config['emr_config']
+      job_flow_id = run_job_flow(emr_config)
+
+      if job_flow_id.present?
+        self.job_flow_id = job_flow_id
+
+        self.job_steps.each do |step|
+          segment = step.crawl_segment
+          segment.parse_time = DateTime.now
+          segment.save
+        end
+
+        self.save
+        self.result_message
+      end
+    end
+
+    # Returns the S3 location for storing Elastic MapReduce job logs.
+    def log_uri
+      s3_path = "/logs/1-parse/#{self.job_name}/"
+      build_s3_uri(s3_path)
+    end
+
+    private
+    # Creates a job step for the crawl segment.
+    def create_job_step(segment)
+      JobStep.create(:job => self,
+                     :crawl_segment => segment,
+                     :input_paths => segment_input(segment),
+                     :output_path => segment_output(segment))
+    end
+
+    # Returns the S3 location for reading a crawl segment. The input filter
+    # determines which type of Common Crawl data files are parsed.
+    def segment_input(segment)
+      segment.segment_s3_uri + job_config['input_filter']
+    end
+
+    # Returns the S3 location for storing the step results. This includes
+    # the segment name.
+    def segment_output(segment)
+      job_path = "/data/1-parse/#{self.job_name}"
+      s3_path = "#{job_path}/segments/#{segment.segment_name}/"
+      build_s3_uri(s3_path)
+    end
+
+    # Sets the job description which forms part of the Elastic MapReduce
+    # job flow name.
+    def set_job_desc(segments, max_files)
+      if segments.count > 0
+        crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present?
+        file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment"
+      end
+
+      "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}"
+    end
+
+    # Returns the parse job configuration from ~/.elasticrawl/jobs.yml.
+    def job_config
+      config = Config.new
+      config.load_config('jobs')['steps']['parse']
+    end
+  end
+end
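
Putting ParseJob together with Crawl: a rough sketch of a parse run, using only calls exercised in the specs later in this diff. In practice the bin/elasticrawl CLI drives these classes, and the values here are illustrative.

# Parse the first 2 unparsed segments, max 5 data files each.
crawl = Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20')
crawl.create_segments

job = Elasticrawl::ParseJob.new
job.set_segments(crawl.next_segments(2), 5)
puts job.run   # launches the EMR job flow and returns a result message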
data/lib/elasticrawl.rb
ADDED
@@ -0,0 +1,21 @@
+require 'aws-sdk'
+require 'active_record'
+require 'active_support'
+require 'elasticity'
+require 'highline/import'
+require 'thor'
+
+module Elasticrawl
+  require 'elasticrawl/version'
+
+  require 'elasticrawl/config'
+  require 'elasticrawl/error'
+
+  require 'elasticrawl/cluster'
+  require 'elasticrawl/crawl'
+  require 'elasticrawl/crawl_segment'
+  require 'elasticrawl/job'
+  require 'elasticrawl/combine_job'
+  require 'elasticrawl/parse_job'
+  require 'elasticrawl/job_step'
+end
data/spec/fixtures/cluster.yml
ADDED
@@ -0,0 +1,44 @@
+# Configures the Elastic MapReduce cluster that is launched to run parse and
+# combine jobs. The list of EC2 instance types can be found at
+# http://aws.amazon.com/ec2/instance-types/#instance-details
+
+# Using spot instances is recommended to reduce costs. However if the spot
+# price rises above your bid price the cluster may be terminated. Elasticrawl
+# tries to reduce the effect of this by parsing each Common Crawl segment
+# in a separate job flow step.
+
+# The master node manages the cluster.
+master_instance_group:
+  instance_type: m1.medium
+  use_spot_instances: true
+  bid_price: 0.120
+
+# Core nodes run map and reduce tasks and store data using HDFS.
+core_instance_group:
+  instance_type: m1.medium
+  instance_count: 2
+  use_spot_instances: true
+  bid_price: 0.120
+
+# Task nodes are optional and only run map and reduce tasks.
+task_instance_group:
+  instance_type: m1.small
+  instance_count: 0
+  use_spot_instances: true
+  bid_price: 0.080
+
+# Array of bootstrap scripts that will be applied when the cluster nodes are
+# initialized. The example installs the Ganglia distributed monitoring system.
+bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
+
+# Specifying an EC2 key pair allows SSH access to the master node. This also
+# allows accessing the Hadoop Web UI over an SSH tunnel.
+ec2_key_name: 'elasticrawl'
+
+# Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
+# recommended since the Common Crawl corpus is stored there. Otherwise inter
+# region data transfer charges will apply.
+placement: 'us-east-1c'
+
+# The AMI version to use when launching instances.
+emr_ami_version: 'latest'
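
The Cluster class consumes these settings through Config#load_config. A minimal sketch of reading them directly with Ruby's YAML standard library, assuming the default ~/.elasticrawl/ config directory:

require 'yaml'

# Read the core instance group settings straight from the config file.
cluster = YAML.load_file(File.expand_path('~/.elasticrawl/cluster.yml'))
core = cluster['core_instance_group']
puts "Core: #{core['instance_count']} x #{core['instance_type']}"        # Core: 2 x m1.medium
puts "Spot bid: $#{core['bid_price']}/hr" if core['use_spot_instances']  # Spot bid: $0.12/hr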
data/spec/fixtures/jobs.yml
ADDED
@@ -0,0 +1,31 @@
+# Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
+# corpus.
+
+# An S3 bucket is created by the init command and is used to store data and logs.
+s3_bucket_name: 'elasticrawl'
+
+# A parse step is created per Common Crawl segment. A combine step takes the
+# results from multiple segments to create a single set of output files.
+
+# The parse input filter is used to specify the Common Crawl file type.
+
+# WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
+# WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
+# WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
+
+# The EMR config is an XML file that sets Hadoop properties. If a config file
+# is specified then a bootstrap action is run on each node to apply it.
+steps:
+  # Parse step for the Example Elasticrawl JAR. This does a word count
+  # against the text extractions of the corpus.
+  parse:
+    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+    class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
+    input_filter: 'wet/*.warc.wet.gz'
+    emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
+  # Combine step for the Example Elasticrawl JAR.
+  combine:
+    jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+    class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
+    input_filter: 'part-*'
+    emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
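
The input_filter value is simply appended to a segment's S3 URI by ParseJob#segment_input, so the WET filter above selects the text extractions of each segment:

# How a parse step's input path is composed (see ParseJob#segment_input).
segment_uri  = 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
input_filter = 'wet/*.warc.wet.gz'
input_paths  = segment_uri + input_filter
# => 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'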
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,35 @@
+require 'elasticrawl'
+require 'rspec'
+require 'mocha'
+require 'database_cleaner'
+require 'shoulda-matchers'
+
+RSpec.configure do |config|
+  config.before(:suite) do
+    # Return S3 paths that are used to create a crawl object with 3 crawl segments.
+    segment_paths = []
+    segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+    segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
+    segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
+    Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)
+
+    # Load config from spec/fixtures/ rather than ~/.elasticrawl/
+    config_dir = File.join(File.dirname(__FILE__), 'fixtures')
+    Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)
+
+    # Load sqlite database. For testing this is stored at db/elasticrawl.sqlite3
+    config = Elasticrawl::Config.new
+    config.load_database
+  end
+
+  # Run each test in a transaction and rollback data on completion.
+  DatabaseCleaner.strategy = :transaction
+
+  config.before(:each) do
+    DatabaseCleaner.start
+  end
+
+  config.after(:each) do
+    DatabaseCleaner.clean
+  end
+end
data/spec/unit/cluster_spec.rb
ADDED
@@ -0,0 +1,54 @@
+require 'spec_helper'
+
+describe Elasticrawl::Cluster do
+  describe '#create_job_flow' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:cluster) { Elasticrawl::Cluster.new }
+    subject { cluster.create_job_flow(job) }
+
+    before do
+      job.set_segments(crawl.crawl_segments)
+    end
+
+    it 'should be an Elasticity::JobFlow' do
+      expect(subject).to be_a Elasticity::JobFlow
+    end
+
+    it 'should have a job flow name' do
+      expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
+    end
+
+    it 'should have a log uri' do
+      expect(subject.log_uri).to eq job.log_uri
+    end
+
+    it 'should have an ec2 key name' do
+      expect(subject.ec2_key_name).to eq 'elasticrawl'
+    end
+
+    it 'should have a placement az name' do
+      expect(subject.placement).to eq 'us-east-1c'
+    end
+
+    it 'should have an ami version' do
+      expect(subject.ami_version).to eq 'latest'
+    end
+  end
+
+  describe '#cluster_desc' do
+    let(:cluster_desc) {
+      cluster_desc = <<-HERE
+Cluster configuration
+Master: 1 m1.medium (Spot: 0.12)
+Core: 2 m1.medium (Spot: 0.12)
+Task: --
+HERE
+    }
+    subject { Elasticrawl::Cluster.new }
+
+    it 'should describe configured instance groups' do
+      expect(subject.cluster_desc).to eq cluster_desc
+    end
+  end
+end
data/spec/unit/combine_job_spec.rb
ADDED
@@ -0,0 +1,97 @@
+require 'spec_helper'
+
+describe Elasticrawl::CombineJob do
+  describe '#set_input_jobs' do
+    let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:segment_list_1) { crawl.crawl_segments[0..1] }
+    let(:segment_list_2) { [crawl.crawl_segments[2]] }
+
+    let(:parse_job_1) { Elasticrawl::ParseJob.new }
+    let(:parse_job_2) { Elasticrawl::ParseJob.new }
+    let(:combine_job) { Elasticrawl::CombineJob.new }
+
+    before do
+      crawl.create_segments
+      parse_job_1.set_segments(segment_list_1)
+      parse_job_2.set_segments(segment_list_2)
+
+      input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+      combine_job.set_input_jobs(input_jobs)
+    end
+
+    it 'should have a job name based on current time' do
+      expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
+    end
+
+    it 'should have a job desc' do
+      expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true
+    end
+
+    it 'should create 1 job step' do
+      expect(combine_job.job_steps.count).to eq 1
+    end
+
+    it 'should set 1 input path per parse job' do
+      input_paths = combine_job.job_steps[0].input_paths
+      expect(input_paths.split(',').count).to eq 2
+    end
+
+    it 'should set input path including parse job name' do
+      input_paths = combine_job.job_steps[0].input_paths
+      expect(input_paths.include?(parse_job_1.job_name)).to eq true
+    end
+
+    it 'should set input path without segment names' do
+      input_paths = combine_job.job_steps[0].input_paths
+      segment_name = segment_list_1[0].segment_name
+      expect(input_paths.include?(segment_name)).to eq false
+    end
+
+    it 'should set output path including job name' do
+      output_path = combine_job.job_steps[0].output_path
+      expect(output_path.include?(combine_job.job_name)).to eq true
+    end
+  end
+
+  describe '#run' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:parse_job_1) { Elasticrawl::ParseJob.new }
+    let(:parse_job_2) { Elasticrawl::ParseJob.new }
+    let(:combine_job) { Elasticrawl::CombineJob.new }
+    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+    before do
+      crawl.create_segments
+      parse_job_1.set_segments(crawl.crawl_segments[0..1])
+      parse_job_2.set_segments([crawl.crawl_segments[2]])
+
+      input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+      combine_job.set_input_jobs(input_jobs)
+    end
+
+    it 'should set a job flow id' do
+      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      combine_job.run
+
+      expect(combine_job.job_flow_id).to eq job_flow_id
+    end
+  end
+
+  describe '#log_uri' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:parse_job) { Elasticrawl::ParseJob.new }
+    let(:job) { Elasticrawl::CombineJob.new }
+
+    before do
+      crawl.create_segments
+      parse_job.set_segments(crawl.crawl_segments)
+
+      job.set_input_jobs([parse_job.job_name])
+    end
+
+    it 'should set a log uri including the job name' do
+      expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/"
+    end
+  end
+end
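
A combine run mirrors the setup in this spec. A sketch, where the input job names are illustrative timestamps of earlier parse jobs:

# Aggregate the output of two earlier parse jobs in a single job flow step.
combine = Elasticrawl::CombineJob.new
combine.set_input_jobs(['1389789645620', '1389789645621'])
combine.run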
data/spec/unit/config_spec.rb
ADDED
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe Elasticrawl::Config do
+  describe '#load_config' do
+    subject { Elasticrawl::Config.new }
+
+    it 'should return a hash of config data' do
+      config_data = subject.load_config('jobs')
+      expect(config_data).to be_a Hash
+    end
+
+    it 'should load yaml config file' do
+      config_data = subject.load_config('jobs')
+      expect(config_data['s3_bucket_name']).to eq 'elasticrawl'
+    end
+  end
+end
data/spec/unit/crawl_segment_spec.rb
ADDED
@@ -0,0 +1,27 @@
+require 'spec_helper'
+
+describe Elasticrawl::CrawlSegment do
+  it { should belong_to(:crawl) }
+  it { should have_many(:job_steps) }
+  it { should have_db_column(:segment_name).of_type(:string) }
+  it { should have_db_column(:segment_s3_uri).of_type(:string) }
+  it { should have_db_column(:parse_time).of_type(:datetime) }
+
+  describe '#initialize' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    subject { crawl.crawl_segments[0] }
+
+    before do
+      crawl.create_segments
+    end
+
+    it 'should have a segment name' do
+      expect(subject.segment_name).to eq '1368696381249'
+    end
+
+    it 'should have an s3 uri' do
+      expect(subject.segment_s3_uri).to eq \
+        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+    end
+  end
+end
data/spec/unit/crawl_spec.rb
ADDED
@@ -0,0 +1,137 @@
+require 'spec_helper'
+
+describe Elasticrawl::Crawl do
+  it { should have_many(:crawl_segments) }
+  it { should have_db_column(:crawl_name).of_type(:string) }
+
+  describe '#has_segments?' do
+    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
+
+    it 'should have segments' do
+      expect(subject.has_segments?).to eq true
+    end
+  end
+
+  describe '#create_segments' do
+    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+    before do
+      subject.create_segments
+    end
+
+    it 'should set crawl name' do
+      expect(subject.crawl_name).to eq crawl_name
+    end
+
+    it 'should create correct # of segments' do
+      expect(subject.crawl_segments.count).to eq 3
+    end
+
+    it 'should create segment names' do
+      expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
+    end
+
+    it 'should create segment s3 uris' do
+      expect(subject.crawl_segments[0].segment_s3_uri).to eq \
+        's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+    end
+  end
+
+  describe '#next_segments' do
+    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+    before do
+      subject.create_segments
+    end
+
+    it 'should return all segments' do
+      crawl_segments = subject.next_segments
+
+      expect(crawl_segments.count).to eq 3
+      expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+      expect(crawl_segments[0].segment_name).to eq '1368696381249'
+    end
+
+    it 'should return first # segments' do
+      crawl_segments = subject.next_segments(2)
+
+      expect(crawl_segments.count).to eq 2
+      expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+      expect(crawl_segments[0].segment_name).to eq '1368696381249'
+    end
+  end
+
+  describe '#select_segments' do
+    let(:crawl_name) { 'CC-MAIN-2013-20' }
+    subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+    before do
+      subject.create_segments
+    end
+
+    it 'should select no segments' do
+      segments_list = ['test', 'segment']
+      crawl_segments = subject.select_segments(segments_list)
+
+      expect(crawl_segments.count).to eq 0
+    end
+
+    it 'should select only segments in list' do
+      segments_list = ['1368696381249', '1368696382185']
+      crawl_segments = subject.select_segments(segments_list)
+
+      expect(crawl_segments.count).to eq 2
+    end
+  end
+
+  describe '#reset' do
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments[0..1])
+
+      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      job.run
+
+      crawl.reset
+    end
+
+    it 'should set parse time of all segments to null' do
+      unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count
+      expect(crawl.crawl_segments.count).to eq unparsed_segments
+    end
+  end
+
+  describe '.status' do
+    let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:max_files) { 5 }
+    let(:job) { Elasticrawl::ParseJob.new }
+    let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+    before do
+      crawl.create_segments
+      job.set_segments(crawl.crawl_segments[0..1], max_files)
+
+      Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+      job.run
+    end
+
+    it 'should display status of crawl segments' do
+      expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
+        'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
+    end
+
+    it 'should display parse job desc' do
+      crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
+      expect(crawl_status.include?(job.job_name)).to eq true
+      expect(crawl_status.include?(job.job_desc)).to eq true
+    end
+  end
+end
data/spec/unit/job_spec.rb
ADDED
@@ -0,0 +1,10 @@
+require 'spec_helper'
+
+describe Elasticrawl::Job do
+  it { should have_many(:job_steps) }
+  it { should have_db_column(:type).of_type(:string) }
+  it { should have_db_column(:job_name).of_type(:string) }
+  it { should have_db_column(:job_desc).of_type(:string) }
+  it { should have_db_column(:max_files).of_type(:integer) }
+  it { should have_db_column(:job_flow_id).of_type(:string) }
+end
data/spec/unit/job_step_spec.rb
ADDED
@@ -0,0 +1,60 @@
+require 'spec_helper'
+
+describe Elasticrawl::JobStep do
+  it { should belong_to(:job) }
+  it { should belong_to(:crawl_segment) }
+  it { should have_db_column(:input_paths).of_type(:text) }
+  it { should have_db_column(:output_path).of_type(:text) }
+
+  describe '#job_flow_step' do
+    let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
+                                             :max_files => 5) }
+    let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+    let(:crawl_segment) { crawl.crawl_segments[0] }
+    let(:input_paths) {
+      's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
+    }
+    let(:output_path) {
+      's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
+    }
+    let(:config) {
+      { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar',
+        'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver'
+      }
+    }
+
+    let(:job_step) { Elasticrawl::JobStep.create(:job => job,
+                                                 :crawl_segment => crawl_segment,
+                                                 :input_paths => input_paths,
+                                                 :output_path => output_path) }
+    subject { job_step.job_flow_step(config) }
+
+    it 'should be a CustomJarStep' do
+      expect(subject).to be_a Elasticity::CustomJarStep
+    end
+
+    it 'should have a jar location' do
+      expect(subject.jar).to eq config['jar']
+    end
+
+    it 'should have 4 jar args' do
+      expect(subject.arguments.count).to eq 4
+    end
+
+    it 'should have a class argument' do
+      expect(subject.arguments[0]).to eq config['class']
+    end
+
+    it 'should have an input path arg' do
+      expect(subject.arguments[1]).to eq input_paths
+    end
+
+    it 'should have an output path arg' do
+      expect(subject.arguments[2]).to eq output_path
+    end
+
+    it 'should have a max files arg' do
+      expect(subject.arguments[3]).to eq '5'
+    end
+  end
+end