elasticrawl 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
@@ -0,0 +1,128 @@
|
|
1
|
+
module Elasticrawl
  # Configures the cluster settings for the job flow that will be launched.
  # These settings are loaded from ~/.elasticrawl/cluster.yml.
  class Cluster
    def initialize
      @master_group = instance_group('master')
      @core_group = instance_group('core')
      @task_group = instance_group('task') if has_task_group?
    end

    # Returns a configured job flow to the calling job. An optional EMR
    # config file URI can be supplied; it is applied as a bootstrap action.
    def create_job_flow(job, emr_config = nil)
      config = Config.new
      job_flow = Elasticity::JobFlow.new(config.access_key_id,
                                         config.secret_access_key)
      job_flow.name = "Job Name: #{job.job_name} #{job.job_desc}"
      job_flow.log_uri = job.log_uri

      configure_job_flow(job_flow)
      configure_instances(job_flow)
      configure_bootstrap_actions(job_flow, emr_config)

      job_flow
    end

    # Describes the instances that will be launched. This is used by the
    # job confirmation messages.
    def cluster_desc
      <<-HERE
Cluster configuration
Master: #{instance_group_desc(@master_group)}
Core: #{instance_group_desc(@core_group)}
Task: #{instance_group_desc(@task_group)}
      HERE
    end

    private
    # Set job flow properties from settings in cluster.yml. Each setting is
    # optional and only applied when present.
    def configure_job_flow(job_flow)
      key_name = config_setting('ec2_key_name')
      zone = config_setting('placement')
      ami_version = config_setting('emr_ami_version')

      job_flow.ec2_key_name = key_name if key_name.present?
      job_flow.placement = zone if zone.present?
      job_flow.ami_version = ami_version if ami_version.present?
    end

    # Configures the instances that will be launched. The master group has
    # a single node. The task group is optional.
    def configure_instances(job_flow)
      job_flow.set_master_instance_group(@master_group)
      job_flow.set_core_instance_group(@core_group)
      job_flow.set_task_instance_group(@task_group) if @task_group.present?
    end

    # Configures bootstrap actions that will be run when each instance is
    # launched. EMR config is an XML file of Hadoop settings stored on S3.
    # These are applied to each node by a bootstrap action.
    def configure_bootstrap_actions(job_flow, emr_config = nil)
      scripts = config_setting('bootstrap_scripts')

      if scripts.present?
        scripts.each do |script_uri|
          job_flow.add_bootstrap_action(
            Elasticity::BootstrapAction.new(script_uri, '', ''))
        end
      end

      if emr_config.present?
        job_flow.add_bootstrap_action(
          Elasticity::HadoopFileBootstrapAction.new(emr_config))
      end
    end

    # Returns whether cluster.yml specifies a task group.
    def has_task_group?
      config_for_group('task').fetch('instance_count', 0) > 0
    end

    # Describes an instance group.
    def instance_group_desc(group)
      return '--' unless group.present?

      price = group.market == 'SPOT' ? "(Spot: #{group.bid_price})" : '(On Demand)'
      "#{group.count} #{group.type} #{price}"
    end

    # Configures an instance group with the instance type, # of instances and
    # the bid price if spot instances are to be used.
    def instance_group(group_name)
      group_config = config_for_group(group_name)

      group = Elasticity::InstanceGroup.new
      group.role = group_name.upcase
      group.type = group_config['instance_type']

      count = group_config['instance_count']
      group.count = count if group_config.has_key?('instance_count') && count > 0
      group.set_spot_instances(group_config['bid_price']) if group_config['use_spot_instances'] == true

      group
    end

    # Returns the config settings for an instance group.
    def config_for_group(group_name)
      config_setting("#{group_name}_instance_group")
    end

    # Returns a config setting from cluster.yml.
    def config_setting(key_name)
      Config.new.load_config('cluster')[key_name]
    end
  end
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
module Elasticrawl
  # Represents an Elastic MapReduce job flow that combines the results of
  # multiple Elasticrawl Parse jobs. Parse jobs write their results per
  # segment. Combine jobs aggregate parse results into a single set of files.
  #
  # Inherits from Job which is the ActiveRecord model class.
  class CombineJob < Job
    # Takes in an array of parse jobs that are to be combined. Creates a single
    # job step whose input paths are the outputs of the parse jobs.
    def set_input_jobs(input_jobs)
      total_segments = 0
      paths = []

      input_jobs.each do |name|
        parse_job = Job.where(:job_name => name,
                              :type => 'Elasticrawl::ParseJob').first_or_initialize
        steps = parse_job.job_steps.count
        next unless steps > 0

        total_segments += steps
        paths << set_input_path(parse_job)
      end

      self.job_name = set_job_name
      self.job_desc = set_job_desc(total_segments)
      job_steps.push(create_job_step(paths.join(',')))
    end

    # Runs the job by calling the Elastic MapReduce API.
    def run
      flow_id = run_job_flow(job_config['emr_config'])
      return unless flow_id.present?

      self.job_flow_id = flow_id
      self.save
      self.result_message
    end

    # Returns the S3 location for storing Elastic MapReduce job logs.
    def log_uri
      build_s3_uri("/logs/2-combine/#{self.job_name}/")
    end

    private
    # Returns a single job step. The input paths are a CSV list of parse
    # job outputs.
    def create_job_step(input_paths)
      JobStep.create(:job => self,
                     :input_paths => input_paths,
                     :output_path => set_output_path)
    end

    # Returns the S3 location for reading a parse job. A wildcard is
    # used for the segment names. The input filter depends on the output
    # file type of the parse job and what type of compression is used.
    def set_input_path(input_job)
      input_filter = job_config['input_filter']
      build_s3_uri("/data/1-parse/#{input_job.job_name}/segments/*/#{input_filter}")
    end

    # Returns the S3 location for storing the combine job results.
    def set_output_path
      build_s3_uri("/data/2-combine/#{self.job_name}/")
    end

    # Sets the job description which forms part of the Elastic MapReduce
    # job flow name.
    def set_job_desc(segment_count)
      "Combining: #{segment_count} segments"
    end

    # Returns the combine job configuration from ~/.elasticrawl.jobs.yml.
    def job_config
      Config.new.load_config('jobs')['steps']['combine']
    end
  end
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
module Elasticrawl
  # Represents the current configuration which is persisted to
  # ~/.elasticrawl/ and contains 3 configuration files.
  #
  # aws.yml - AWS access credentials unless stored in the environment
  # variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
  # cluster.yml - Elastic MapReduce cluster config including instance groups.
  # jobs.yml - Elastic MapReduce jobs config and the S3 bucket used for
  # storing data and logs.
  #
  # This directory also contains the Elasticrawl SQLite database.
  class Config
    CONFIG_DIR = '.elasticrawl'
    DATABASE_FILE = 'elasticrawl.sqlite3'
    TEMPLATES_DIR = '../../templates'
    TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml'].freeze

    attr_reader :access_key_id
    attr_reader :secret_access_key

    # Sets the AWS access credentials needed for the S3 and EMR API calls.
    # Credentials are resolved in priority order: values passed in (from the
    # init command), values stored in aws.yml, then the AWS_* env variables.
    def initialize(access_key_id = nil, secret_access_key = nil)
      # Credentials have been provided to the init command.
      @access_key_id = access_key_id
      @secret_access_key = secret_access_key

      # If credentials are not set then check if they are available in aws.yml.
      if dir_exists?
        config = load_config('aws')
        key = config['access_key_id']
        secret = config['secret_access_key']

        # Skip the placeholder values deployed from the config template.
        @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
        @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
      end

      # If credentials are still not set then check AWS environment variables.
      @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
      @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']

      # Set AWS credentials for use when accessing the S3 API.
      AWS.config(:access_key_id => @access_key_id,
                 :secret_access_key => @secret_access_key)
    end

    # Returns the location of the config directory.
    def config_dir
      File.join(Dir.home, CONFIG_DIR)
    end

    # Checks if the configuration directory exists.
    def dir_exists?
      # Dir.exist? is the non-deprecated form of Dir.exists?.
      Dir.exist?(config_dir)
    end

    # Loads a YAML configuration file from the config directory.
    #
    # Raises ConfigDirMissingError if the config directory does not exist
    # and FileAccessError if the file cannot be read or parsed.
    def load_config(config_file)
      raise ConfigDirMissingError, 'Config dir missing. Run init command' unless dir_exists?

      begin
        config_path = File.join(config_dir, "#{config_file}.yml")
        # File.read closes the file; File.open here would leak the handle.
        YAML::load(File.read(config_path))

      rescue StandardError => e
        raise FileAccessError, e.message
      end
    end

    # Loads the sqlite database. If no database exists it will be created
    # and the database migrations will be run.
    #
    # Raises ConfigDirMissingError if the config directory does not exist
    # and DatabaseAccessError if the connection or migrations fail.
    def load_database
      raise ConfigDirMissingError, 'Config dir missing. Run init command' unless dir_exists?

      config = {
        'adapter' => 'sqlite3',
        'database' => File.join(config_dir, DATABASE_FILE),
        'pool' => 5,
        'timeout' => 5000
      }

      begin
        ActiveRecord::Base.establish_connection(config)
        ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
          '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil )

      rescue StandardError => e
        raise DatabaseAccessError, e.message
      end
    end

    # Checks if a S3 bucket name is in use.
    def bucket_exists?(bucket_name)
      s3 = AWS::S3.new
      s3.buckets[bucket_name].exists?

    # A signature mismatch means the credentials themselves are bad, so
    # surface that as a distinct error before the generic S3 failure.
    rescue AWS::S3::Errors::SignatureDoesNotMatch
      raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
    rescue StandardError => e
      raise S3AccessError, e.message
    end

    # Creates the S3 bucket and config directory. Deploys the config templates
    # and creates the sqlite database.
    def create(bucket_name)
      create_bucket(bucket_name)
      deploy_templates(bucket_name)
      load_database

      status_message(bucket_name, 'created')
    end

    # Deletes the S3 bucket and config directory.
    def delete
      bucket_name = load_config('jobs')['s3_bucket_name']
      delete_bucket(bucket_name)
      delete_config_dir

      status_message(bucket_name, 'deleted')
    end

    # Displayed by destroy command to confirm deletion.
    def delete_warning
      bucket_name = load_config('jobs')['s3_bucket_name']

      message = ['WARNING:']
      message << "Bucket s3://#{bucket_name} and its data will be deleted"
      message << "Config dir #{config_dir} will be deleted"

      message.join("\n")
    end

    # Displayed by init command.
    def access_key_prompt
      prompt = "Enter AWS Access Key ID:"
      prompt += " [#{@access_key_id}]" if @access_key_id.present?

      prompt
    end

    # Displayed by init command.
    def secret_key_prompt
      prompt = "Enter AWS Secret Access Key:"
      prompt += " [#{@secret_access_key}]" if @secret_access_key.present?

      prompt
    end

    private
    # Creates a bucket using the S3 API.
    def create_bucket(bucket_name)
      s3 = AWS::S3.new
      s3.buckets.create(bucket_name)

    rescue StandardError => e
      raise S3AccessError, e.message
    end

    # Deletes a bucket and its contents using the S3 API.
    def delete_bucket(bucket_name)
      s3 = AWS::S3.new
      s3.buckets[bucket_name].delete!

    rescue StandardError => e
      raise S3AccessError, e.message
    end

    # Creates config directory and copies config templates into it.
    # Saves S3 bucket name to jobs.yml and AWS credentials to aws.yml.
    def deploy_templates(bucket_name)
      Dir.mkdir(config_dir, 0755) unless dir_exists?

      TEMPLATE_FILES.each do |template_file|
        FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
                     File.join(config_dir, template_file))
      end

      save_config('jobs', { 'BUCKET_NAME' => bucket_name })
      save_aws_config

    rescue StandardError => e
      raise FileAccessError, e.message
    end

    # Saves AWS access credentials to aws.yml unless they are configured as
    # environment variables.
    def save_aws_config
      env_key = ENV['AWS_ACCESS_KEY_ID']
      env_secret = ENV['AWS_SECRET_ACCESS_KEY']

      creds = {}
      creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
      creds['SECRET_ACCESS_KEY'] = @secret_access_key \
        unless @secret_access_key == env_secret

      save_config('aws', creds)
    end

    # Saves config values by overwriting placeholder values in template.
    def save_config(template, params)
      config_file = File.join(config_dir, "#{template}.yml")
      config = File.read(config_file)

      # each (not map): iterated purely for the side effect of substitution.
      params.each { |key, value| config = config.gsub(key, value) }

      File.open(config_file, 'w') { |file| file.write(config) }
    end

    # Deletes the config directory including its contents.
    def delete_config_dir
      FileUtils.rm_r(config_dir) if dir_exists?

    rescue StandardError => e
      raise FileAccessError, e.message
    end

    # Notifies user of results of init or destroy commands.
    def status_message(bucket_name, state)
      message = ['', "Bucket s3://#{bucket_name} #{state}"]
      message << "Config dir #{config_dir} #{state}"

      state = 'complete' if state == 'created'
      message << "Config #{state}"

      message.join("\n")
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Elasticrawl
  # Represents a web crawl released by the Common Crawl Foundation.
  # Each crawl is split into multiple crawl segments and is stored
  # in the S3 public datasets bucket.
  class Crawl < ActiveRecord::Base
    has_many :crawl_segments

    COMMON_CRAWL_BUCKET = 'aws-publicdatasets'
    COMMON_CRAWL_PATH = 'common-crawl/crawl-data/'
    SEGMENTS_PATH = '/segments/'
    MAX_SEGMENTS = 256

    # Returns the status of all saved crawls and the current job history.
    def self.status(show_all = false)
      lines = ['Crawl Status']
      Crawl.all.each { |crawl| lines << crawl.status }

      jobs = Job.where('job_flow_id is not null').order(:id => :desc)
      if show_all == true
        header = 'Job History'
      else
        header = 'Job History (last 10)'
        jobs = jobs.limit(10)
      end

      lines << ['', header]
      jobs.each { |job| lines << job.history }

      lines.join("\n")
    end

    # Returns the status of the current crawl.
    def status
      total = crawl_segments.count
      remaining = CrawlSegment.where(:crawl_id => self.id,
                                     :parse_time => nil).count
      parsed = total - remaining

      "#{crawl_name} Segments: to parse #{remaining}, parsed #{parsed}, total #{total}"
    end

    # Checks for crawl segments in the database. If none are found then checks
    # the S3 API and creates any segments that are found.
    def has_segments?
      return true if crawl_segments.count > 0

      create_segments > 0
    end

    # Creates crawl segments from their S3 paths and returns the segment count.
    def create_segments
      segment_paths = s3_segment_paths(self.crawl_name)
      save if segment_paths.count > 0
      segment_paths.each { |s3_path| create_segment(s3_path) }

      segment_paths.count
    end

    # Returns the list of segments from the database.
    def select_segments(segments_list)
      CrawlSegment.where(:segment_name => segments_list)
    end

    # Returns next # segments to be parsed. The maximum is 256
    # as this is the maximum # of steps for an Elastic MapReduce job flow.
    def next_segments(max_segments = nil)
      limit = [max_segments || MAX_SEGMENTS, MAX_SEGMENTS].min
      crawl_segments.where(:parse_time => nil).limit(limit)
    end

    # Resets parse time of all parsed segments to null so they will be parsed
    # again. Returns the updated crawl status.
    def reset
      parsed = CrawlSegment.where('crawl_id = ? and parse_time is not null',
                                  self.id)
      parsed.each { |segment| segment.update_attribute(:parse_time, nil) }

      status
    end

    private
    # Creates a crawl segment based on its S3 path if it does not exist.
    def create_segment(s3_path)
      segment_name = s3_path.split('/').last
      segment_s3_uri = URI::Generic.build(:scheme => 's3',
                                          :host => COMMON_CRAWL_BUCKET,
                                          :path => "/#{s3_path}").to_s

      CrawlSegment.where(:crawl_id => self.id,
                         :segment_name => segment_name,
                         :segment_s3_uri => segment_s3_uri).first_or_create
    end

    # Returns a list of S3 paths for the crawl name.
    def s3_segment_paths(crawl_name)
      s3_segment_tree(crawl_name).children.collect(&:prefix)
    end

    # Calls the S3 API and returns the tree structure for the crawl name.
    def s3_segment_tree(crawl_name)
      crawl_path = [COMMON_CRAWL_PATH, crawl_name, SEGMENTS_PATH].join

      AWS::S3.new.buckets[COMMON_CRAWL_BUCKET].as_tree(:prefix => crawl_path)
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Elasticrawl
  # Base error class extends standard error.
  class Error < StandardError; end

  # AWS access credentials are invalid.
  AWSCredentialsInvalidError = Class.new(Error)

  # Config directory does not exist.
  ConfigDirMissingError = Class.new(Error)

  # Database error accessing sqlite database.
  DatabaseAccessError = Class.new(Error)

  # Error accessing AWS Elastic MapReduce API.
  ElasticMapReduceAccessError = Class.new(Error)

  # Error accessing config directory.
  FileAccessError = Class.new(Error)

  # Error accessing AWS S3 API.
  S3AccessError = Class.new(Error)
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Elasticrawl
  # The base job class that is extended by ParseJob and CombineJob.
  class Job < ActiveRecord::Base
    has_many :job_steps

    # Displays a confirmation message showing the configuration of the
    # Elastic MapReduce job flow and cluster.
    def confirm_message
      ['Job configuration',
       self.job_desc,
       '',
       Cluster.new.cluster_desc].join("\n")
    end

    # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
    # launched successfully.
    def result_message
      "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
    end

    # Displays the history of the current job. Called by the status command.
    def history
      launched = self.created_at.strftime('%Y-%m-%d %H:%M:%S')
      "#{self.job_name} Launched: #{launched} #{self.job_desc}"
    end

    protected
    # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID.
    def run_job_flow(emr_config)
      job_flow = Cluster.new.create_job_flow(self, emr_config)
      job_steps.each { |step| job_flow.add_step(step.job_flow_step(job_config)) }

      begin
        job_flow.run

      rescue StandardError => e
        raise ElasticMapReduceAccessError, e.message
      end
    end

    # Returns an S3 location for storing either data or logs.
    def build_s3_uri(s3_path)
      URI::Generic.build(:scheme => 's3',
                         :host => bucket_name,
                         :path => s3_path).to_s
    end

    # Returns the S3 bucket name configured by the user using the init command.
    def bucket_name
      Config.new.load_config('jobs')['s3_bucket_name']
    end

    # Sets the job name which is the current Unix timestamp in milliseconds.
    # This is the same naming format used for Common Crawl segment names.
    def set_job_name
      (Time.now.to_f * 1000).to_i.to_s
    end
  end
end
|