elasticrawl 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,46 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow step. For a parse job this will
+   # process a single Common Crawl segment. For a combine job a single step
+   # will aggregate the results of multiple parse jobs.
+   class JobStep < ActiveRecord::Base
+     belongs_to :job
+     belongs_to :crawl_segment
+
+     # Returns a custom jar step that is configured with the jar location,
+     # class name and input and output paths.
+     #
+     # For parse jobs optionally specifies the maximum # of Common Crawl
+     # data files to process before the job exits.
+     def job_flow_step(job_config)
+       jar = job_config['jar']
+       max_files = self.job.max_files
+
+       step_args = []
+       step_args[0] = job_config['class']
+       step_args[1] = self.input_paths
+       step_args[2] = self.output_path
+       # All arguments must be strings.
+       step_args[3] = max_files.to_s if max_files.present?
+
+       step = Elasticity::CustomJarStep.new(jar)
+       step.name = set_step_name
+       step.arguments = step_args
+
+       step
+     end
+
+     private
+     # Sets the Elastic MapReduce job flow step name based on the type of job it
+     # belongs to.
+     def set_step_name
+       case self.job.type
+       when 'Elasticrawl::ParseJob'
+         segment = self.crawl_segment.segment_name if self.crawl_segment.present?
+         "Segment: #{segment}"
+       when 'Elasticrawl::CombineJob'
+         paths = self.input_paths.split(',')
+         "Combining #{paths.count} jobs"
+       end
+     end
+   end
+ end
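
For context, a minimal sketch of how a parse step's arguments are assembled. The jar and class names are taken from the jobs.yml config later in this diff and the S3 paths from the gem's specs; building the records in memory without saving them is illustrative only.

    # Illustrative sketch: build a parse step without persisting records.
    step = Elasticrawl::JobStep.new(
      :input_paths => 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz',
      :output_path => 's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/')
    step.job = Elasticrawl::ParseJob.new(:max_files => 5)

    jar_step = step.job_flow_step(
      'jar'   => 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar',
      'class' => 'com.rossfairbanks.elasticrawl.examples.WordCount')
    jar_step.arguments  # => [class name, input paths, output path, '5']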
@@ -0,0 +1,84 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow that parses segments of
+   # Common Crawl data. A job step is created per segment.
+   #
+   # Inherits from Job which is the ActiveRecord model class.
+   class ParseJob < Job
+     # Populates the job from the list of segments to be parsed.
+     def set_segments(crawl_segments, max_files = nil)
+       self.job_name = set_job_name
+       self.job_desc = set_job_desc(crawl_segments, max_files)
+       self.max_files = max_files
+
+       crawl_segments.each do |segment|
+         self.job_steps.push(create_job_step(segment))
+       end
+     end
+
+     # Runs the job by calling the Elastic MapReduce API. If successful the
+     # parse time is set for each segment.
+     def run
+       emr_config = job_config['emr_config']
+       job_flow_id = run_job_flow(emr_config)
+
+       if job_flow_id.present?
+         self.job_flow_id = job_flow_id
+
+         self.job_steps.each do |step|
+           segment = step.crawl_segment
+           segment.parse_time = DateTime.now
+           segment.save
+         end
+
+         self.save
+         self.result_message
+       end
+     end
+
+     # Returns the S3 location for storing Elastic MapReduce job logs.
+     def log_uri
+       s3_path = "/logs/1-parse/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     private
+     # Creates a job step for the crawl segment.
+     def create_job_step(segment)
+       JobStep.create(:job => self,
+                      :crawl_segment => segment,
+                      :input_paths => segment_input(segment),
+                      :output_path => segment_output(segment))
+     end
+
+     # Returns the S3 location for reading a crawl segment. The input filter
+     # determines which type of Common Crawl data files are parsed.
+     def segment_input(segment)
+       segment.segment_s3_uri + job_config['input_filter']
+     end
+
+     # Returns the S3 location for storing the step results. This includes
+     # the segment name.
+     def segment_output(segment)
+       job_path = "/data/1-parse/#{self.job_name}"
+       s3_path = "#{job_path}/segments/#{segment.segment_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     # Sets the job description which forms part of the Elastic MapReduce
+     # job flow name.
+     def set_job_desc(segments, max_files)
+       if segments.count > 0
+         crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present?
+         file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment"
+       end
+
+       "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}"
+     end
+
+     # Returns the parse job configuration from ~/.elasticrawl/jobs.yml.
+     def job_config
+       config = Config.new
+       config.load_config('jobs')['steps']['parse']
+     end
+   end
+ end
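
Pieced together from the specs later in this diff, a typical ParseJob lifecycle looks like the sketch below; it assumes `elasticrawl init` has already created the config files and SQLite database.

    crawl = Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20')
    crawl.create_segments

    job = Elasticrawl::ParseJob.new
    job.set_segments(crawl.next_segments(2), 5)  # parse 5 files per segment
    puts job.run  # launches the EMR job flow and returns a result message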
@@ -0,0 +1,3 @@
+ module Elasticrawl
+   VERSION = "1.0.0"
+ end
@@ -0,0 +1,21 @@
+ require 'aws-sdk'
+ require 'active_record'
+ require 'active_support'
+ require 'elasticity'
+ require 'highline/import'
+ require 'thor'
+
+ module Elasticrawl
+   require 'elasticrawl/version'
+
+   require 'elasticrawl/config'
+   require 'elasticrawl/error'
+
+   require 'elasticrawl/cluster'
+   require 'elasticrawl/crawl'
+   require 'elasticrawl/crawl_segment'
+   require 'elasticrawl/job'
+   require 'elasticrawl/combine_job'
+   require 'elasticrawl/parse_job'
+   require 'elasticrawl/job_step'
+ end
@@ -0,0 +1,4 @@
+ # Configures the AWS credentials used when accessing the AWS EMR and S3 APIs.
+ # This file is populated by the elasticrawl init command.
+ access_key_id: 'ACCESS_KEY_ID'
+ secret_access_key: 'SECRET_ACCESS_KEY'
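
One plausible way these values are consumed (a hypothetical loader; in the gem the Config class reads this file internally):

    require 'yaml'
    require 'aws-sdk'

    # Hypothetical: read ~/.elasticrawl/aws.yml and configure the v1 AWS SDK.
    creds = YAML.load_file(File.join(Dir.home, '.elasticrawl', 'aws.yml'))
    AWS.config(:access_key_id     => creds['access_key_id'],
               :secret_access_key => creds['secret_access_key'])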
@@ -0,0 +1,44 @@
+ # Configures the Elastic MapReduce cluster that is launched to run parse and
+ # combine jobs. The list of EC2 instance types can be found at
+ # http://aws.amazon.com/ec2/instance-types/#instance-details
+
+ # Using spot instances is recommended to reduce costs. However if the spot
+ # price rises above your bid price the cluster may be terminated. Elasticrawl
+ # tries to reduce the effect of this by parsing each Common Crawl segment
+ # in a separate job flow step.
+
+ # The master node manages the cluster.
+ master_instance_group:
+   instance_type: m1.medium
+   use_spot_instances: true
+   bid_price: 0.120
+
+ # Core nodes run map and reduce tasks and store data using HDFS.
+ core_instance_group:
+   instance_type: m1.medium
+   instance_count: 2
+   use_spot_instances: true
+   bid_price: 0.120
+
+ # Task nodes are optional and only run map and reduce tasks.
+ task_instance_group:
+   instance_type: m1.small
+   instance_count: 0
+   use_spot_instances: true
+   bid_price: 0.080
+
+ # Array of bootstrap scripts that will be applied when the cluster nodes are
+ # initialized. The example installs the Ganglia distributed monitoring system.
+ bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
+
+ # Specifying an EC2 key pair allows SSH access to the master node. This also
+ # allows accessing the Hadoop Web UI over an SSH tunnel.
+ ec2_key_name: 'elasticrawl'
+
+ # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
+ # recommended since the Common Crawl corpus is stored there. Otherwise
+ # inter-region data transfer charges will apply.
+ placement: 'us-east-1c'
+
+ # The AMI version to use when launching instances.
+ emr_ami_version: 'latest'
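
As a rough sketch of how a group in this file maps onto the elasticity gem (method names from Elasticity::InstanceGroup; an approximation of what the Cluster class does, not its actual code):

    # Approximate translation of core_instance_group above.
    core = Elasticity::InstanceGroup.new
    core.role  = 'CORE'             # 'MASTER' for the master group
    core.type  = 'm1.medium'        # instance_type
    core.count = 2                  # instance_count
    core.set_spot_instances(0.120)  # use_spot_instances with bid_price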
@@ -0,0 +1,31 @@
+ # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
+ # corpus.
+
+ # An S3 bucket is created by the init command and is used to store data and logs.
+ s3_bucket_name: 'elasticrawl'
+
+ # A parse step is created per Common Crawl segment. A combine step takes the
+ # results from multiple segments to create a single set of output files.
+
+ # The parse input filter is used to specify the Common Crawl file type.
+
+ # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
+ # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
+ # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
+
+ # The EMR config is an XML file that sets Hadoop properties. If a config file
+ # is specified then a bootstrap action is run on each node to apply it.
+ steps:
+   # Parse step for the Example Elasticrawl JAR. This does a word count
+   # against the text extractions of the corpus.
+   parse:
+     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+     class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
+     input_filter: 'wet/*.warc.wet.gz'
+     emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
+   # Combine step for the Example Elasticrawl JAR.
+   combine:
+     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+     class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
+     input_filter: 'part-*'
+     emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
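
The parse settings above are exactly what ParseJob#job_config returns, e.g.:

    config = Elasticrawl::Config.new
    parse = config.load_config('jobs')['steps']['parse']
    parse['input_filter']  # => 'wet/*.warc.wet.gz'
    parse['class']         # => 'com.rossfairbanks.elasticrawl.examples.WordCount'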
@@ -0,0 +1,35 @@
+ require 'elasticrawl'
+ require 'rspec'
+ require 'mocha'
+ require 'database_cleaner'
+ require 'shoulda-matchers'
+
+ RSpec.configure do |config|
+   config.before(:suite) do
+     # Return S3 paths that are used to create a crawl object with 3 crawl segments.
+     segment_paths = []
+     segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
+     segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
+     Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)
+
+     # Load config from spec/fixtures/ rather than ~/.elasticrawl/
+     config_dir = File.join(File.dirname(__FILE__), 'fixtures')
+     Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)
+
+     # Load the sqlite database. For testing this is stored at db/elasticrawl.sqlite3
+     config = Elasticrawl::Config.new
+     config.load_database
+   end
+
+   # Run each test in a transaction and roll back data on completion.
+   DatabaseCleaner.strategy = :transaction
+
+   config.before(:each) do
+     DatabaseCleaner.start
+   end
+
+   config.after(:each) do
+     DatabaseCleaner.clean
+   end
+ end
@@ -0,0 +1,54 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Cluster do
+   describe '#create_job_flow' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:cluster) { Elasticrawl::Cluster.new }
+     subject { cluster.create_job_flow(job) }
+
+     before do
+       job.set_segments(crawl.crawl_segments)
+     end
+
+     it 'should be an Elasticity::JobFlow' do
+       expect(subject).to be_a Elasticity::JobFlow
+     end
+
+     it 'should have a job flow name' do
+       expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
+     end
+
+     it 'should have a log uri' do
+       expect(subject.log_uri).to eq job.log_uri
+     end
+
+     it 'should have an ec2 key name' do
+       expect(subject.ec2_key_name).to eq 'elasticrawl'
+     end
+
+     it 'should have a placement az name' do
+       expect(subject.placement).to eq 'us-east-1c'
+     end
+
+     it 'should have an ami version' do
+       expect(subject.ami_version).to eq 'latest'
+     end
+   end
+
+   describe '#cluster_desc' do
+     let(:cluster_desc) {
+       cluster_desc = <<-HERE
+ Cluster configuration
+ Master: 1 m1.medium (Spot: 0.12)
+ Core: 2 m1.medium (Spot: 0.12)
+ Task: --
+       HERE
+     }
+     subject { Elasticrawl::Cluster.new }
+
+     it 'should describe configured instance groups' do
+       expect(subject.cluster_desc).to eq cluster_desc
+     end
+   end
+ end
@@ -0,0 +1,97 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::CombineJob do
+   describe '#set_input_jobs' do
+     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:segment_list_1) { crawl.crawl_segments[0..1] }
+     let(:segment_list_2) { [crawl.crawl_segments[2]] }
+
+     let(:parse_job_1) { Elasticrawl::ParseJob.new }
+     let(:parse_job_2) { Elasticrawl::ParseJob.new }
+     let(:combine_job) { Elasticrawl::CombineJob.new }
+
+     before do
+       crawl.create_segments
+       parse_job_1.set_segments(segment_list_1)
+       parse_job_2.set_segments(segment_list_2)
+
+       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+       combine_job.set_input_jobs(input_jobs)
+     end
+
+     it 'should have a job name based on current time' do
+       expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
+     end
+
+     it 'should have a job desc' do
+       expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true
+     end
+
+     it 'should create 1 job step' do
+       expect(combine_job.job_steps.count).to eq 1
+     end
+
+     it 'should set 1 input path per parse job' do
+       input_paths = combine_job.job_steps[0].input_paths
+       expect(input_paths.split(',').count).to eq 2
+     end
+
+     it 'should set input path including parse job name' do
+       input_paths = combine_job.job_steps[0].input_paths
+       expect(input_paths.include?(parse_job_1.job_name)).to eq true
+     end
+
+     it 'should set input path without segment names' do
+       input_paths = combine_job.job_steps[0].input_paths
+       segment_name = segment_list_1[0].segment_name
+       expect(input_paths.include?(segment_name)).to eq false
+     end
+
+     it 'should set output path including job name' do
+       output_path = combine_job.job_steps[0].output_path
+       expect(output_path.include?(combine_job.job_name)).to eq true
+     end
+   end
+
+   describe '#run' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:parse_job_1) { Elasticrawl::ParseJob.new }
+     let(:parse_job_2) { Elasticrawl::ParseJob.new }
+     let(:combine_job) { Elasticrawl::CombineJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       parse_job_1.set_segments(crawl.crawl_segments[0..1])
+       parse_job_2.set_segments([crawl.crawl_segments[2]])
+
+       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+       combine_job.set_input_jobs(input_jobs)
+     end
+
+     it 'should set a job flow id' do
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       combine_job.run
+
+       expect(combine_job.job_flow_id).to eq job_flow_id
+     end
+   end
+
+   describe '#log_uri' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:parse_job) { Elasticrawl::ParseJob.new }
+     let(:job) { Elasticrawl::CombineJob.new }
+
+     before do
+       crawl.create_segments
+       parse_job.set_segments(crawl.crawl_segments)
+
+       job.set_input_jobs([parse_job.job_name])
+     end
+
+     it 'should set a log uri including the job name' do
+       expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/"
+     end
+   end
+ end
@@ -0,0 +1,17 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Config do
+   describe '#load_config' do
+     subject { Elasticrawl::Config.new }
+
+     it 'should return a hash of config data' do
+       config_data = subject.load_config('jobs')
+       expect(config_data).to be_a Hash
+     end
+
+     it 'should load yaml config file' do
+       config_data = subject.load_config('jobs')
+       expect(config_data['s3_bucket_name']).to eq 'elasticrawl'
+     end
+   end
+ end
@@ -0,0 +1,27 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::CrawlSegment do
+   it { should belong_to(:crawl) }
+   it { should have_many(:job_steps) }
+   it { should have_db_column(:segment_name).of_type(:string) }
+   it { should have_db_column(:segment_s3_uri).of_type(:string) }
+   it { should have_db_column(:parse_time).of_type(:datetime) }
+
+   describe '#initialize' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     subject { crawl.crawl_segments[0] }
+
+     before do
+       crawl.create_segments
+     end
+
+     it 'should have a segment name' do
+       expect(subject.segment_name).to eq '1368696381249'
+     end
+
+     it 'should have an s3 uri' do
+       expect(subject.segment_s3_uri).to eq \
+         's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     end
+   end
+ end
@@ -0,0 +1,137 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Crawl do
+   it { should have_many(:crawl_segments) }
+   it { should have_db_column(:crawl_name).of_type(:string) }
+
+   describe '#has_segments?' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
+
+     it 'should have segments' do
+       expect(subject.has_segments?).to eq true
+     end
+   end
+
+   describe '#create_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should set crawl name' do
+       expect(subject.crawl_name).to eq crawl_name
+     end
+
+     it 'should create correct # of segments' do
+       expect(subject.crawl_segments.count).to eq 3
+     end
+
+     it 'should create segment names' do
+       expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+
+     it 'should create segment s3 uris' do
+       expect(subject.crawl_segments[0].segment_s3_uri).to eq \
+         's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     end
+   end
+
+   describe '#next_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should return all segments' do
+       crawl_segments = subject.next_segments
+
+       expect(crawl_segments.count).to eq 3
+       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+       expect(crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+
+     it 'should return first # segments' do
+       crawl_segments = subject.next_segments(2)
+
+       expect(crawl_segments.count).to eq 2
+       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+       expect(crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+   end
+
+   describe '#select_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should select no segments' do
+       segments_list = ['test', 'segment']
+       crawl_segments = subject.select_segments(segments_list)
+
+       expect(crawl_segments.count).to eq 0
+     end
+
+     it 'should select only segments in list' do
+       segments_list = ['1368696381249', '1368696382185']
+       crawl_segments = subject.select_segments(segments_list)
+
+       expect(crawl_segments.count).to eq 2
+     end
+   end
+
+   describe '#reset' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       job.set_segments(crawl.crawl_segments[0..1])
+
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       job.run
+
+       crawl.reset
+     end
+
+     it 'should set parse time of all segments to null' do
+       unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count
+       expect(crawl.crawl_segments.count).to eq unparsed_segments
+     end
+   end
+
+   describe '.status' do
+     let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:max_files) { 5 }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       job.set_segments(crawl.crawl_segments[0..1], max_files)
+
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       job.run
+     end
+
+     it 'should display status of crawl segments' do
+       expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
+         'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
+     end
+
+     it 'should display parse job desc' do
+       crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
+       expect(crawl_status.include?(job.job_name)).to eq true
+       expect(crawl_status.include?(job.job_desc)).to eq true
+     end
+   end
+ end
@@ -0,0 +1,10 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Job do
+   it { should have_many(:job_steps) }
+   it { should have_db_column(:type).of_type(:string) }
+   it { should have_db_column(:job_name).of_type(:string) }
+   it { should have_db_column(:job_desc).of_type(:string) }
+   it { should have_db_column(:max_files).of_type(:integer) }
+   it { should have_db_column(:job_flow_id).of_type(:string) }
+ end
@@ -0,0 +1,60 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::JobStep do
+   it { should belong_to(:job) }
+   it { should belong_to(:crawl_segment) }
+   it { should have_db_column(:input_paths).of_type(:text) }
+   it { should have_db_column(:output_path).of_type(:text) }
+
+   describe '#job_flow_step' do
+     let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
+                                              :max_files => 5) }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:crawl_segment) { crawl.crawl_segments[0] }
+     let(:input_paths) {
+       's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
+     }
+     let(:output_path) {
+       's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
+     }
+     let(:config) {
+       { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar',
+         'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver'
+       }
+     }
+
+     let(:job_step) { Elasticrawl::JobStep.create(:job => job,
+                                                  :crawl_segment => crawl_segment,
+                                                  :input_paths => input_paths,
+                                                  :output_path => output_path) }
+     subject { job_step.job_flow_step(config) }
+
+     it 'should be a CustomJarStep' do
+       expect(subject).to be_a Elasticity::CustomJarStep
+     end
+
+     it 'should have a jar location' do
+       expect(subject.jar).to eq config['jar']
+     end
+
+     it 'should have 4 jar args' do
+       expect(subject.arguments.count).to eq 4
+     end
+
+     it 'should have a class argument' do
+       expect(subject.arguments[0]).to eq config['class']
+     end
+
+     it 'should have an input path arg' do
+       expect(subject.arguments[1]).to eq input_paths
+     end
+
+     it 'should have an output path arg' do
+       expect(subject.arguments[2]).to eq output_path
+     end
+
+     it 'should have a max files arg' do
+       expect(subject.arguments[3]).to eq '5'
+     end
+   end
+ end