elasticrawl 1.0.0

@@ -0,0 +1,46 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow step. For a parse job this will
+   # process a single Common Crawl segment. For a combine job a single step
+   # will aggregate the results of multiple parse jobs.
+   class JobStep < ActiveRecord::Base
+     belongs_to :job
+     belongs_to :crawl_segment
+
+     # Returns a custom jar step that is configured with the jar location,
+     # class name and input and output paths.
+     #
+     # For parse jobs optionally specifies the maximum # of Common Crawl
+     # data files to process before the job exits.
+     def job_flow_step(job_config)
+       jar = job_config['jar']
+       max_files = self.job.max_files
+
+       step_args = []
+       step_args[0] = job_config['class']
+       step_args[1] = self.input_paths
+       step_args[2] = self.output_path
+       # All arguments must be strings.
+       step_args[3] = max_files.to_s if max_files.present?
+
+       step = Elasticity::CustomJarStep.new(jar)
+       step.name = set_step_name
+       step.arguments = step_args
+
+       step
+     end
+
+     private
+     # Sets the Elastic MapReduce job flow step name based on the type of job it
+     # belongs to.
+     def set_step_name
+       case self.job.type
+       when 'Elasticrawl::ParseJob'
+         segment = self.crawl_segment.segment_name if self.crawl_segment.present?
+         "Segment: #{segment}"
+       when 'Elasticrawl::CombineJob'
+         paths = self.input_paths.split(',')
+         "Combining #{paths.count} jobs"
+       end
+     end
+   end
+ end
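
Not part of the diff: a minimal sketch of how job_flow_step could be exercised on its own, using the jar, class, and path values that appear in jobs.yml and job_step_spec.rb below. It assumes the elasticrawl sqlite database has already been initialised so that the ActiveRecord models can be created.

require 'elasticrawl'

# Sketch only: values mirror job_step_spec.rb; the database must already exist.
job = Elasticrawl::ParseJob.create(:job_name => '1389789645620', :max_files => 5)
step_record = Elasticrawl::JobStep.create(
  :job => job,
  :input_paths => 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz',
  :output_path => 's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/')

job_config = {
  'jar'   => 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar',
  'class' => 'com.rossfairbanks.elasticrawl.examples.WordCount'
}

emr_step = step_record.job_flow_step(job_config)
# emr_step is an Elasticity::CustomJarStep whose arguments are
# [class, input_paths, output_path, max_files.to_s].
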
@@ -0,0 +1,84 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow that parses segments of
+   # Common Crawl data. A job step is created per segment.
+   #
+   # Inherits from Job which is the ActiveRecord model class.
+   class ParseJob < Job
+     # Populates the job from the list of segments to be parsed.
+     def set_segments(crawl_segments, max_files = nil)
+       self.job_name = set_job_name
+       self.job_desc = set_job_desc(crawl_segments, max_files)
+       self.max_files = max_files
+
+       crawl_segments.each do |segment|
+         self.job_steps.push(create_job_step(segment))
+       end
+     end
+
+     # Runs the job by calling the Elastic MapReduce API. If successful the
+     # parse time is set for each segment.
+     def run
+       emr_config = job_config['emr_config']
+       job_flow_id = run_job_flow(emr_config)
+
+       if job_flow_id.present?
+         self.job_flow_id = job_flow_id
+
+         self.job_steps.each do |step|
+           segment = step.crawl_segment
+           segment.parse_time = DateTime.now
+           segment.save
+         end
+
+         self.save
+         self.result_message
+       end
+     end
+
+     # Returns the S3 location for storing Elastic MapReduce job logs.
+     def log_uri
+       s3_path = "/logs/1-parse/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     private
+     # Creates a job step for the crawl segment.
+     def create_job_step(segment)
+       JobStep.create(:job => self,
+                      :crawl_segment => segment,
+                      :input_paths => segment_input(segment),
+                      :output_path => segment_output(segment))
+     end
+
+     # Returns the S3 location for reading a crawl segment. The input filter
+     # determines which type of Common Crawl data files are parsed.
+     def segment_input(segment)
+       segment.segment_s3_uri + job_config['input_filter']
+     end
+
+     # Returns the S3 location for storing the step results. This includes
+     # the segment name.
+     def segment_output(segment)
+       job_path = "/data/1-parse/#{self.job_name}"
+       s3_path = "#{job_path}/segments/#{segment.segment_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     # Sets the job description which forms part of the Elastic MapReduce
+     # job flow name.
+     def set_job_desc(segments, max_files)
+       if segments.count > 0
+         crawl_name = segments[0].crawl.crawl_name if segments[0].crawl.present?
+         file_desc = max_files.nil? ? 'all files' : "#{max_files} files per segment"
+       end
+
+       "Crawl: #{crawl_name} Segments: #{segments.count} Parsing: #{file_desc}"
+     end
+
+     # Returns the parse job configuration from ~/.elasticrawl/jobs.yml.
+     def job_config
+       config = Config.new
+       config.load_config('jobs')['steps']['parse']
+     end
+   end
+ end
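
Not part of the diff: a short sketch of how ParseJob is driven, based on the methods exercised in crawl_spec.rb and combine_job_spec.rb below. It assumes the config files and database created by elasticrawl init are in place; run launches a live EMR job flow.

require 'elasticrawl'

# Sketch only: mirrors the flow used in the specs.
crawl = Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20')
crawl.create_segments

job = Elasticrawl::ParseJob.new
job.set_segments(crawl.next_segments(2), 5)  # parse 2 segments, up to 5 files each
puts job.run                                 # launches the EMR job flow, prints the result message
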
@@ -0,0 +1,3 @@
+ module Elasticrawl
+   VERSION = "1.0.0"
+ end
@@ -0,0 +1,21 @@
+ require 'aws-sdk'
+ require 'active_record'
+ require 'active_support'
+ require 'elasticity'
+ require 'highline/import'
+ require 'thor'
+
+ module Elasticrawl
+   require 'elasticrawl/version'
+
+   require 'elasticrawl/config'
+   require 'elasticrawl/error'
+
+   require 'elasticrawl/cluster'
+   require 'elasticrawl/crawl'
+   require 'elasticrawl/crawl_segment'
+   require 'elasticrawl/job'
+   require 'elasticrawl/combine_job'
+   require 'elasticrawl/parse_job'
+   require 'elasticrawl/job_step'
+ end
@@ -0,0 +1,4 @@
+ # Configures the AWS credentials used when accessing the AWS EMR and S3 APIs.
+ # This file is populated by the elasticrawl init command.
+ access_key_id: 'ACCESS_KEY_ID'
+ secret_access_key: 'SECRET_ACCESS_KEY'
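
Not part of the diff: a hedged sketch of how these credentials could be applied with the aws-sdk v1 API that the gem already requires. The ~/.elasticrawl directory and the aws.yml filename are assumptions; the gem's Config class handles this internally.

require 'yaml'
require 'aws-sdk'

# Sketch only: the file path is an assumption based on the init command.
creds = YAML.load_file(File.join(Dir.home, '.elasticrawl', 'aws.yml'))

AWS.config(:access_key_id     => creds['access_key_id'],
           :secret_access_key => creds['secret_access_key'])
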
@@ -0,0 +1,44 @@
+ # Configures the Elastic MapReduce cluster that is launched to run parse and
+ # combine jobs. The list of EC2 instance types can be found at
+ # http://aws.amazon.com/ec2/instance-types/#instance-details
+
+ # Using spot instances is recommended to reduce costs. However if the spot
+ # price rises above your bid price the cluster may be terminated. Elasticrawl
+ # tries to reduce the effect of this by parsing each Common Crawl segment
+ # in a separate job flow step.
+
+ # The master node manages the cluster.
+ master_instance_group:
+   instance_type: m1.medium
+   use_spot_instances: true
+   bid_price: 0.120
+
+ # Core nodes run map and reduce tasks and store data using HDFS.
+ core_instance_group:
+   instance_type: m1.medium
+   instance_count: 2
+   use_spot_instances: true
+   bid_price: 0.120
+
+ # Task nodes are optional and only run map and reduce tasks.
+ task_instance_group:
+   instance_type: m1.small
+   instance_count: 0
+   use_spot_instances: true
+   bid_price: 0.080
+
+ # Array of bootstrap scripts that will be applied when the cluster nodes are
+ # initialized. The example installs the Ganglia distributed monitoring system.
+ bootstrap_scripts: #['s3://elasticmapreduce/bootstrap-actions/install-ganglia']
+
+ # Specifying an EC2 key pair allows SSH access to the master node. This also
+ # allows accessing the Hadoop Web UI over an SSH tunnel.
+ ec2_key_name: 'elasticrawl'
+
+ # Availability Zone (AZ) to launch instances in. An AZ in the US-East region is
+ # recommended since the Common Crawl corpus is stored there. Otherwise inter-region
+ # data transfer charges will apply.
+ placement: 'us-east-1c'
+
+ # The AMI version to use when launching instances.
+ emr_ami_version: 'latest'
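
Not part of the diff: roughly how an instance group definition above could map onto the Elasticity API used by the Cluster class. This is a sketch under the assumption that core_instance_group is translated more or less directly, not the gem's actual implementation.

require 'elasticity'

# Sketch only: builds an instance group equivalent to core_instance_group above.
core = Elasticity::InstanceGroup.new
core.type  = 'm1.medium'
core.count = 2
core.set_spot_instances(0.120)   # bid price in USD; on-demand is the default
# The Cluster class attaches the group to the job flow, e.g. via
# job_flow.set_core_instance_group(core).
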
@@ -0,0 +1,31 @@
+ # Configures the AWS Elastic MapReduce jobs launched against the Common Crawl
+ # corpus.
+
+ # An S3 bucket is created by the init command and is used to store data and logs.
+ s3_bucket_name: 'elasticrawl'
+
+ # A parse step is created per Common Crawl segment. A combine step takes the
+ # results from multiple segments to create a single set of output files.
+
+ # The parse input filter is used to specify the Common Crawl file type.
+
+ # WARC: 'warc/*.warc.gz' - Full HTTP requests and responses.
+ # WAT: 'wat/*.warc.wat.gz' - Metadata extractions from WARC files.
+ # WET: 'wet/*.warc.wet.gz' - Text extractions from WARC files.
+
+ # The EMR config is an XML file that sets Hadoop properties. If a config file
+ # is specified then a bootstrap action is run on each node to apply it.
+ steps:
+   # Parse step for the Elasticrawl Examples JAR. This does a word count
+   # against the text extractions of the corpus.
+   parse:
+     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+     class: 'com.rossfairbanks.elasticrawl.examples.WordCount'
+     input_filter: 'wet/*.warc.wet.gz'
+     emr_config: #'s3://elasticrawl/jar/parse-mapred-site.xml'
+   # Combine step for the Elasticrawl Examples JAR.
+   combine:
+     jar: 's3://elasticrawl/jar/elasticrawl-examples-1.0.0.jar'
+     class: 'com.rossfairbanks.elasticrawl.examples.SegmentCombiner'
+     input_filter: 'part-*'
+     emr_config: #'s3://elasticrawl/jar/combine-mapred-site.xml'
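
Not part of the diff: a small sketch showing how the parse step settings combine with a segment URI to form a step's input path, mirroring ParseJob#job_config and #segment_input above. The local jobs.yml path is hypothetical and the segment URI comes from the spec fixtures.

require 'yaml'

# Sketch only: stands in for Config#load_config('jobs').
jobs_config  = YAML.load_file('jobs.yml')
parse_config = jobs_config['steps']['parse']

segment_s3_uri = 's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
input_path = segment_s3_uri + parse_config['input_filter']
# input_path => "s3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz"
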
@@ -0,0 +1,35 @@
+ require 'elasticrawl'
+ require 'rspec'
+ require 'mocha'
+ require 'database_cleaner'
+ require 'shoulda-matchers'
+
+ RSpec.configure do |config|
+   config.before(:suite) do
+     # Stub the S3 paths that are used to create a crawl object with 3 crawl segments.
+     segment_paths = []
+     segment_paths[0] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     segment_paths[1] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381630/'
+     segment_paths[2] = 'common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696382185/'
+     Elasticrawl::Crawl.any_instance.stubs(:s3_segment_paths).returns(segment_paths)
+
+     # Load config from spec/fixtures/ rather than ~/.elasticrawl/
+     config_dir = File.join(File.dirname(__FILE__), 'fixtures')
+     Elasticrawl::Config.any_instance.stubs(:config_dir).returns(config_dir)
+
+     # Load the sqlite database. For testing this is stored at db/elasticrawl.sqlite3
+     config = Elasticrawl::Config.new
+     config.load_database
+   end
+
+   # Run each test in a transaction and roll back the data on completion.
+   DatabaseCleaner.strategy = :transaction
+
+   config.before(:each) do
+     DatabaseCleaner.start
+   end
+
+   config.after(:each) do
+     DatabaseCleaner.clean
+   end
+ end
@@ -0,0 +1,54 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Cluster do
+   describe '#create_job_flow' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:cluster) { Elasticrawl::Cluster.new }
+     subject { cluster.create_job_flow(job) }
+
+     before do
+       job.set_segments(crawl.crawl_segments)
+     end
+
+     it 'should be an Elasticity::JobFlow' do
+       expect(subject).to be_a Elasticity::JobFlow
+     end
+
+     it 'should have a job flow name' do
+       expect(subject.name).to eq "Job Name: #{job.job_name} #{job.job_desc}"
+     end
+
+     it 'should have a log uri' do
+       expect(subject.log_uri).to eq job.log_uri
+     end
+
+     it 'should have an ec2 key name' do
+       expect(subject.ec2_key_name).to eq 'elasticrawl'
+     end
+
+     it 'should have a placement az name' do
+       expect(subject.placement).to eq 'us-east-1c'
+     end
+
+     it 'should have an ami version' do
+       expect(subject.ami_version).to eq 'latest'
+     end
+   end
+
+   describe '#cluster_desc' do
+     let(:cluster_desc) {
+       cluster_desc = <<-HERE
+ Cluster configuration
+ Master: 1 m1.medium (Spot: 0.12)
+ Core: 2 m1.medium (Spot: 0.12)
+ Task: --
+       HERE
+     }
+     subject { Elasticrawl::Cluster.new }
+
+     it 'should describe configured instance groups' do
+       expect(subject.cluster_desc).to eq cluster_desc
+     end
+   end
+ end
@@ -0,0 +1,97 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::CombineJob do
+   describe '#set_input_jobs' do
+     let(:job_name) { (Time.now.to_f * 1000).to_i.to_s }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:segment_list_1) { crawl.crawl_segments[0..1] }
+     let(:segment_list_2) { [crawl.crawl_segments[2]] }
+
+     let(:parse_job_1) { Elasticrawl::ParseJob.new }
+     let(:parse_job_2) { Elasticrawl::ParseJob.new }
+     let(:combine_job) { Elasticrawl::CombineJob.new }
+
+     before do
+       crawl.create_segments
+       parse_job_1.set_segments(segment_list_1)
+       parse_job_2.set_segments(segment_list_2)
+
+       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+       combine_job.set_input_jobs(input_jobs)
+     end
+
+     it 'should have a job name based on current time' do
+       expect(combine_job.job_name.slice(0, 8)).to eq job_name.slice(0, 8)
+     end
+
+     it 'should have a job desc' do
+       expect(combine_job.job_desc.end_with?('Combining: 3 segments')).to eq true
+     end
+
+     it 'should create 1 job step' do
+       expect(combine_job.job_steps.count).to eq 1
+     end
+
+     it 'should set 1 input path per parse job' do
+       input_paths = combine_job.job_steps[0].input_paths
+       expect(input_paths.split(',').count).to eq 2
+     end
+
+     it 'should set input path including parse job name' do
+       input_paths = combine_job.job_steps[0].input_paths
+       expect(input_paths.include?(parse_job_1.job_name)).to eq true
+     end
+
+     it 'should set input path without segment names' do
+       input_paths = combine_job.job_steps[0].input_paths
+       segment_name = segment_list_1[0].segment_name
+       expect(input_paths.include?(segment_name)).to eq false
+     end
+
+     it 'should set output path including job name' do
+       output_path = combine_job.job_steps[0].output_path
+       expect(output_path.include?(combine_job.job_name)).to eq true
+     end
+   end
+
+   describe '#run' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:parse_job_1) { Elasticrawl::ParseJob.new }
+     let(:parse_job_2) { Elasticrawl::ParseJob.new }
+     let(:combine_job) { Elasticrawl::CombineJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       parse_job_1.set_segments(crawl.crawl_segments[0..1])
+       parse_job_2.set_segments([crawl.crawl_segments[2]])
+
+       input_jobs = [parse_job_1.job_name, parse_job_2.job_name]
+       combine_job.set_input_jobs(input_jobs)
+     end
+
+     it 'should set a job flow id' do
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       combine_job.run
+
+       expect(combine_job.job_flow_id).to eq job_flow_id
+     end
+   end
+
+   describe '#log_uri' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:parse_job) { Elasticrawl::ParseJob.new }
+     let(:job) { Elasticrawl::CombineJob.new }
+
+     before do
+       crawl.create_segments
+       parse_job.set_segments(crawl.crawl_segments)
+
+       job.set_input_jobs([parse_job.job_name])
+     end
+
+     it 'should set a log uri including the job name' do
+       expect(job.log_uri).to eq "s3://elasticrawl/logs/2-combine/#{job.job_name}/"
+     end
+   end
+ end
@@ -0,0 +1,17 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Config do
+   describe '#load_config' do
+     subject { Elasticrawl::Config.new }
+
+     it 'should return a hash of config data' do
+       config_data = subject.load_config('jobs')
+       expect(config_data).to be_a Hash
+     end
+
+     it 'should load yaml config file' do
+       config_data = subject.load_config('jobs')
+       expect(config_data['s3_bucket_name']).to eq 'elasticrawl'
+     end
+   end
+ end
@@ -0,0 +1,27 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::CrawlSegment do
+   it { should belong_to(:crawl) }
+   it { should have_many(:job_steps) }
+   it { should have_db_column(:segment_name).of_type(:string) }
+   it { should have_db_column(:segment_s3_uri).of_type(:string) }
+   it { should have_db_column(:parse_time).of_type(:datetime) }
+
+   describe '#initialize' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     subject { crawl.crawl_segments[0] }
+
+     before do
+       crawl.create_segments
+     end
+
+     it 'should have a segment name' do
+       expect(subject.segment_name).to eq '1368696381249'
+     end
+
+     it 'should have an s3 uri' do
+       expect(subject.segment_s3_uri).to eq \
+         's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     end
+   end
+ end
@@ -0,0 +1,137 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Crawl do
+   it { should have_many(:crawl_segments) }
+   it { should have_db_column(:crawl_name).of_type(:string) }
+
+   describe '#has_segments?' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.new(:crawl_name => crawl_name) }
+
+     it 'should have segments' do
+       expect(subject.has_segments?).to eq true
+     end
+   end
+
+   describe '#create_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should set crawl name' do
+       expect(subject.crawl_name).to eq crawl_name
+     end
+
+     it 'should create correct # of segments' do
+       expect(subject.crawl_segments.count).to eq 3
+     end
+
+     it 'should create segment names' do
+       expect(subject.crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+
+     it 'should create segment s3 uris' do
+       expect(subject.crawl_segments[0].segment_s3_uri).to eq \
+         's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/'
+     end
+   end
+
+   describe '#next_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should return all segments' do
+       crawl_segments = subject.next_segments
+
+       expect(crawl_segments.count).to eq 3
+       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+       expect(crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+
+     it 'should return first # segments' do
+       crawl_segments = subject.next_segments(2)
+
+       expect(crawl_segments.count).to eq 2
+       expect(crawl_segments[0].crawl.crawl_name).to eq crawl_name
+       expect(crawl_segments[0].segment_name).to eq '1368696381249'
+     end
+   end
+
+   describe '#select_segments' do
+     let(:crawl_name) { 'CC-MAIN-2013-20' }
+     subject { Elasticrawl::Crawl.create(:crawl_name => crawl_name) }
+
+     before do
+       subject.create_segments
+     end
+
+     it 'should select no segments' do
+       segments_list = ['test', 'segment']
+       crawl_segments = subject.select_segments(segments_list)
+
+       expect(crawl_segments.count).to eq 0
+     end
+
+     it 'should select only segments in list' do
+       segments_list = ['1368696381249', '1368696382185']
+       crawl_segments = subject.select_segments(segments_list)
+
+       expect(crawl_segments.count).to eq 2
+     end
+   end
+
+   describe '#reset' do
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       job.set_segments(crawl.crawl_segments[0..1])
+
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       job.run
+
+       crawl.reset
+     end
+
+     it 'should set parse time of all segments to null' do
+       unparsed_segments = Elasticrawl::CrawlSegment.where(:parse_time => nil).count
+       expect(crawl.crawl_segments.count).to eq unparsed_segments
+     end
+   end
+
+   describe '.status' do
+     let(:job_desc) { 'Crawl: CC-MAIN-2013-20 Segments: 2 Parsing: 5 files per segment' }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:max_files) { 5 }
+     let(:job) { Elasticrawl::ParseJob.new }
+     let(:job_flow_id) { 'j-3QHDKKBT6VAIS' }
+
+     before do
+       crawl.create_segments
+       job.set_segments(crawl.crawl_segments[0..1], max_files)
+
+       Elasticity::JobFlow.any_instance.stubs(:run).returns(job_flow_id)
+       job.run
+     end
+
+     it 'should display status of crawl segments' do
+       expect(Elasticrawl::Crawl.status.split("\n")[1]).to eq \
+         'CC-MAIN-2013-20 Segments: to parse 1, parsed 2, total 3'
+     end
+
+     it 'should display parse job desc' do
+       crawl_status = Elasticrawl::Crawl.status.split("\n")[4]
+       expect(crawl_status.include?(job.job_name)).to eq true
+       expect(crawl_status.include?(job.job_desc)).to eq true
+     end
+   end
+ end
@@ -0,0 +1,10 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::Job do
+   it { should have_many(:job_steps) }
+   it { should have_db_column(:type).of_type(:string) }
+   it { should have_db_column(:job_name).of_type(:string) }
+   it { should have_db_column(:job_desc).of_type(:string) }
+   it { should have_db_column(:max_files).of_type(:integer) }
+   it { should have_db_column(:job_flow_id).of_type(:string) }
+ end
@@ -0,0 +1,60 @@
+ require 'spec_helper'
+
+ describe Elasticrawl::JobStep do
+   it { should belong_to(:job) }
+   it { should belong_to(:crawl_segment) }
+   it { should have_db_column(:input_paths).of_type(:text) }
+   it { should have_db_column(:output_path).of_type(:text) }
+
+   describe '#job_flow_step' do
+     let(:job) { Elasticrawl::ParseJob.create(:job_name => '1389789645620',
+                                              :max_files => 5) }
+     let(:crawl) { Elasticrawl::Crawl.create(:crawl_name => 'CC-MAIN-2013-20') }
+     let(:crawl_segment) { crawl.crawl_segments[0] }
+     let(:input_paths) {
+       's3://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-20/segments/1368696381249/wet/*.warc.wet.gz'
+     }
+     let(:output_path) {
+       's3://elasticrawl/data/1-parse/1389789645620/segments/1368696381249/'
+     }
+     let(:config) {
+       { 'jar' => 's3://elasticrawl/jar/elasticrawl-example-0.0.1.jar',
+         'class' => 'com.rossfairbanks.commoncrawl.elasticrawl.ParserDriver'
+       }
+     }
+
+     let(:job_step) { Elasticrawl::JobStep.create(:job => job,
+                                                  :crawl_segment => crawl_segment,
+                                                  :input_paths => input_paths,
+                                                  :output_path => output_path) }
+     subject { job_step.job_flow_step(config) }
+
+     it 'should be a CustomJarStep' do
+       expect(subject).to be_a Elasticity::CustomJarStep
+     end
+
+     it 'should have a jar location' do
+       expect(subject.jar).to eq config['jar']
+     end
+
+     it 'should have 4 jar args' do
+       expect(subject.arguments.count).to eq 4
+     end
+
+     it 'should have a class argument' do
+       expect(subject.arguments[0]).to eq config['class']
+     end
+
+     it 'should have an input path arg' do
+       expect(subject.arguments[1]).to eq input_paths
+     end
+
+     it 'should have an output path arg' do
+       expect(subject.arguments[2]).to eq output_path
+     end
+
+     it 'should have a max files arg' do
+       expect(subject.arguments[3]).to eq '5'
+     end
+   end
+ end
+ end