elasticrawl 1.0.0

@@ -0,0 +1,128 @@
+ module Elasticrawl
+   # Configures the cluster settings for the job flow that will be launched.
+   # These settings are loaded from ~/.elasticrawl/cluster.yml.
+   class Cluster
+     def initialize
+       @master_group = instance_group('master')
+       @core_group = instance_group('core')
+       @task_group = instance_group('task') if has_task_group?
+     end
+
+     # Returns a configured job flow to the calling job.
+     def create_job_flow(job, emr_config = nil)
+       config = Config.new
+       job_flow = Elasticity::JobFlow.new(config.access_key_id,
+                                          config.secret_access_key)
+       job_flow.name = "Job Name: #{job.job_name} #{job.job_desc}"
+       job_flow.log_uri = job.log_uri
+
+       configure_job_flow(job_flow)
+       configure_instances(job_flow)
+       configure_bootstrap_actions(job_flow, emr_config)
+
+       job_flow
+     end
+
+     # Describes the instances that will be launched. This is used by the
+     # job confirmation messages.
+     def cluster_desc
+       cluster_desc = <<-HERE
+ Cluster configuration
+ Master: #{instance_group_desc(@master_group)}
+ Core:   #{instance_group_desc(@core_group)}
+ Task:   #{instance_group_desc(@task_group)}
+       HERE
+     end
+
+     private
+     # Set job flow properties from settings in cluster.yml.
+     def configure_job_flow(job_flow)
+       ec2_key_name = config_setting('ec2_key_name')
+       placement = config_setting('placement')
+       emr_ami_version = config_setting('emr_ami_version')
+
+       job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
+       job_flow.placement = placement if placement.present?
+       job_flow.ami_version = emr_ami_version if emr_ami_version.present?
+     end
+
+     # Configures the instances that will be launched. The master group has
+     # a single node. The task group is optional.
+     def configure_instances(job_flow)
+       job_flow.set_master_instance_group(@master_group)
+       job_flow.set_core_instance_group(@core_group)
+       job_flow.set_task_instance_group(@task_group) if @task_group.present?
+     end
+
+     # Configures bootstrap actions that will be run when each instance is
+     # launched. EMR config is an XML file of Hadoop settings stored on S3.
+     # These are applied to each node by a bootstrap action.
+     def configure_bootstrap_actions(job_flow, emr_config = nil)
+       bootstrap_scripts = config_setting('bootstrap_scripts')
+
+       if bootstrap_scripts.present?
+         bootstrap_scripts.each do |script_uri|
+           action = Elasticity::BootstrapAction.new(script_uri, '', '')
+           job_flow.add_bootstrap_action(action)
+         end
+       end
+
+       if emr_config.present?
+         action = Elasticity::HadoopFileBootstrapAction.new(emr_config)
+         job_flow.add_bootstrap_action(action)
+       end
+     end
+
+     # Returns whether cluster.yml specifies a task group.
+     def has_task_group?
+       task_config = config_for_group('task')
+       task_config.has_key?('instance_count') && task_config['instance_count'] > 0
+     end
+
+     # Describes an instance group.
+     def instance_group_desc(group)
+       if group.present?
+         if group.market == 'SPOT'
+           price = "(Spot: #{group.bid_price})"
+         else
+           price = '(On Demand)'
+         end
+
+         "#{group.count} #{group.type} #{price}"
+       else
+         '--'
+       end
+     end
+
+     # Configures an instance group with the instance type, number of instances
+     # and the bid price if spot instances are to be used.
+     def instance_group(group_name)
+       config = config_for_group(group_name)
+
+       instance_group = Elasticity::InstanceGroup.new
+       instance_group.role = group_name.upcase
+       instance_group.type = config['instance_type']
+
+       if config.has_key?('instance_count') && config['instance_count'] > 0
+         instance_group.count = config['instance_count']
+       end
+
+       if config['use_spot_instances'] == true
+         instance_group.set_spot_instances(config['bid_price'])
+       end
+
+       instance_group
+     end
+
+     # Returns the config settings for an instance group.
+     def config_for_group(group_name)
+       config_setting("#{group_name}_instance_group")
+     end
+
+     # Returns a config setting from cluster.yml.
+     def config_setting(key_name)
+       config = Config.new
+       config.load_config('cluster')[key_name]
+     end
+   end
+ end
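
As a usage sketch (assuming ~/.elasticrawl/cluster.yml and AWS credentials are already configured), the Cluster class above can be exercised on its own:

    # Sketch: inspect the configured instance groups before launching a job.
    cluster = Elasticrawl::Cluster.new
    puts cluster.cluster_desc
    # => Cluster configuration
    #    Master: 1 m1.medium (On Demand)   (values depend on cluster.yml)
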
@@ -0,0 +1,86 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow that combines the results of
+   # multiple Elasticrawl Parse jobs. Parse jobs write their results per
+   # segment. Combine jobs aggregate parse results into a single set of files.
+   #
+   # Inherits from Job which is the ActiveRecord model class.
+   class CombineJob < Job
+     # Takes in an array of parse jobs that are to be combined. Creates a single
+     # job step whose input paths are the outputs of the parse jobs.
+     def set_input_jobs(input_jobs)
+       segment_count = 0
+       input_paths = []
+
+       input_jobs.each do |job_name|
+         input_job = Job.where(:job_name => job_name,
+                               :type => 'Elasticrawl::ParseJob').first_or_initialize
+         step_count = input_job.job_steps.count
+
+         if step_count > 0
+           segment_count += step_count
+           input_paths << set_input_path(input_job)
+         end
+       end
+
+       self.job_name = set_job_name
+       self.job_desc = set_job_desc(segment_count)
+       job_steps.push(create_job_step(input_paths.join(',')))
+     end
+
+     # Runs the job by calling the Elastic MapReduce API.
+     def run
+       emr_config = job_config['emr_config']
+       job_flow_id = run_job_flow(emr_config)
+
+       if job_flow_id.present?
+         self.job_flow_id = job_flow_id
+         self.save
+         self.result_message
+       end
+     end
+
+     # Returns the S3 location for storing Elastic MapReduce job logs.
+     def log_uri
+       s3_path = "/logs/2-combine/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     private
+     # Returns a single job step. The input paths are a CSV list of parse
+     # job outputs.
+     def create_job_step(input_paths)
+       JobStep.create(:job => self,
+                      :input_paths => input_paths,
+                      :output_path => set_output_path)
+     end
+
+     # Returns the S3 location for reading a parse job. A wildcard is
+     # used for the segment names. The input filter depends on the output
+     # file type of the parse job and what type of compression is used.
+     def set_input_path(input_job)
+       job_name = input_job.job_name
+       input_filter = job_config['input_filter']
+
+       s3_path = "/data/1-parse/#{job_name}/segments/*/#{input_filter}"
+       build_s3_uri(s3_path)
+     end
+
+     # Returns the S3 location for storing the combine job results.
+     def set_output_path
+       s3_path = "/data/2-combine/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     # Sets the job description which forms part of the Elastic MapReduce
+     # job flow name.
+     def set_job_desc(segment_count)
+       "Combining: #{segment_count} segments"
+     end
+
+     # Returns the combine job configuration from ~/.elasticrawl/jobs.yml.
+     def job_config
+       config = Config.new
+       config.load_config('jobs')['steps']['combine']
+     end
+   end
+ end
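
A hedged sketch of CombineJob in use; the parse job names are hypothetical and must match ParseJob records already in the SQLite database (so Config#load_database must have been called first):

    # Sketch: combine the outputs of two earlier parse jobs into one job flow.
    job = Elasticrawl::CombineJob.new
    job.set_input_jobs(['1391458746123', '1391459851234'])  # hypothetical job names
    puts job.confirm_message   # shows job and cluster config before launch
    puts job.run               # calls the EMR API and prints the Job Flow ID
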
@@ -0,0 +1,242 @@
+ module Elasticrawl
+   # Represents the current configuration which is persisted to
+   # ~/.elasticrawl/ and contains 3 configuration files.
+   #
+   # aws.yml     - AWS access credentials unless stored in the environment
+   #               variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
+   # cluster.yml - Elastic MapReduce cluster config including instance groups.
+   # jobs.yml    - Elastic MapReduce jobs config and the S3 bucket used for
+   #               storing data and logs.
+   #
+   # This directory also contains the Elasticrawl SQLite database.
+   class Config
+     CONFIG_DIR = '.elasticrawl'
+     DATABASE_FILE = 'elasticrawl.sqlite3'
+     TEMPLATES_DIR = '../../templates'
+     TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml']
+
+     attr_reader :access_key_id
+     attr_reader :secret_access_key
+
+     # Sets the AWS access credentials needed for the S3 and EMR API calls.
+     def initialize(access_key_id = nil, secret_access_key = nil)
+       # Credentials have been provided to the init command.
+       @access_key_id = access_key_id
+       @secret_access_key = secret_access_key
+
+       # If credentials are not set then check if they are available in aws.yml.
+       if dir_exists?
+         config = load_config('aws')
+         key = config['access_key_id']
+         secret = config['secret_access_key']
+
+         @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
+         @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
+       end
+
+       # If credentials are still not set then check AWS environment variables.
+       @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
+       @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']
+
+       # Set AWS credentials for use when accessing the S3 API.
+       AWS.config(:access_key_id => @access_key_id,
+                  :secret_access_key => @secret_access_key)
+     end
+
+     # Returns the location of the config directory.
+     def config_dir
+       File.join(Dir.home, CONFIG_DIR)
+     end
+
+     # Checks if the configuration directory exists.
+     def dir_exists?
+       Dir.exists?(config_dir)
+     end
+
+     # Loads a YAML configuration file.
+     def load_config(config_file)
+       if dir_exists?
+         begin
+           config_file = File.join(config_dir, "#{config_file}.yml")
+           config = YAML::load(File.open(config_file))
+
+         rescue StandardError => e
+           raise FileAccessError, e.message
+         end
+       else
+         raise ConfigDirMissingError, 'Config dir missing. Run init command'
+       end
+     end
+
+     # Loads the sqlite database. If no database exists it will be created
+     # and the database migrations will be run.
+     def load_database
+       if dir_exists?
+         config = {
+           'adapter' => 'sqlite3',
+           'database' => File.join(config_dir, DATABASE_FILE),
+           'pool' => 5,
+           'timeout' => 5000
+         }
+
+         begin
+           ActiveRecord::Base.establish_connection(config)
+           ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
+             '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil)
+
+         rescue StandardError => e
+           raise DatabaseAccessError, e.message
+         end
+       else
+         raise ConfigDirMissingError, 'Config dir missing. Run init command'
+       end
+     end
+
+     # Checks if an S3 bucket name is in use.
+     def bucket_exists?(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         s3.buckets[bucket_name].exists?
+
+       rescue AWS::S3::Errors::SignatureDoesNotMatch => e
+         raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Creates the S3 bucket and config directory. Deploys the config templates
+     # and creates the sqlite database.
+     def create(bucket_name)
+       create_bucket(bucket_name)
+       deploy_templates(bucket_name)
+       load_database
+
+       status_message(bucket_name, 'created')
+     end
+
+     # Deletes the S3 bucket and config directory.
+     def delete
+       bucket_name = load_config('jobs')['s3_bucket_name']
+       delete_bucket(bucket_name)
+       delete_config_dir
+
+       status_message(bucket_name, 'deleted')
+     end
+
+     # Displayed by destroy command to confirm deletion.
+     def delete_warning
+       bucket_name = load_config('jobs')['s3_bucket_name']
+
+       message = ['WARNING:']
+       message << "Bucket s3://#{bucket_name} and its data will be deleted"
+       message << "Config dir #{config_dir} will be deleted"
+
+       message.join("\n")
+     end
+
+     # Displayed by init command.
+     def access_key_prompt
+       prompt = "Enter AWS Access Key ID:"
+       prompt += " [#{@access_key_id}]" if @access_key_id.present?
+
+       prompt
+     end
+
+     # Displayed by init command.
+     def secret_key_prompt
+       prompt = "Enter AWS Secret Access Key:"
+       prompt += " [#{@secret_access_key}]" if @secret_access_key.present?
+
+       prompt
+     end
+
+     private
+     # Creates a bucket using the S3 API.
+     def create_bucket(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         s3.buckets.create(bucket_name)
+
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Deletes a bucket and its contents using the S3 API.
+     def delete_bucket(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         bucket = s3.buckets[bucket_name]
+         bucket.delete!
+
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Creates config directory and copies config templates into it.
+     # Saves S3 bucket name to jobs.yml and AWS credentials to aws.yml.
+     def deploy_templates(bucket_name)
+       begin
+         Dir.mkdir(config_dir, 0755) if dir_exists? == false
+
+         TEMPLATE_FILES.each do |template_file|
+           FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
+                        File.join(config_dir, template_file))
+         end
+
+         save_config('jobs', { 'BUCKET_NAME' => bucket_name })
+         save_aws_config
+
+       rescue StandardError => e
+         raise FileAccessError, e.message
+       end
+     end
+
+     # Saves AWS access credentials to aws.yml unless they are configured as
+     # environment variables.
+     def save_aws_config
+       env_key = ENV['AWS_ACCESS_KEY_ID']
+       env_secret = ENV['AWS_SECRET_ACCESS_KEY']
+
+       creds = {}
+       creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
+       creds['SECRET_ACCESS_KEY'] = @secret_access_key \
+         unless @secret_access_key == env_secret
+
+       save_config('aws', creds)
+     end
+
+     # Saves config values by overwriting placeholder values in the template.
+     def save_config(template, params)
+       config_file = File.join(config_dir, "#{template}.yml")
+       config = File.read(config_file)
+
+       params.map { |key, value| config = config.gsub(key, value) }
+
+       File.open(config_file, 'w') { |file| file.write(config) }
+     end
+
+     # Deletes the config directory including its contents.
+     def delete_config_dir
+       begin
+         FileUtils.rm_r(config_dir) if dir_exists?
+
+       rescue StandardError => e
+         raise FileAccessError, e.message
+       end
+     end
+
+     # Notifies user of results of init or destroy commands.
+     def status_message(bucket_name, state)
+       message = ['', "Bucket s3://#{bucket_name} #{state}"]
+       message << "Config dir #{config_dir} #{state}"
+
+       state = 'complete' if state == 'created'
+       message << "Config #{state}"
+
+       message.join("\n")
+     end
+   end
+ end
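
A sketch of the init/destroy lifecycle this class backs. The bucket name below is a placeholder; create deploys the YAML templates and builds the SQLite database, delete reverses both:

    # Sketch: first-time setup against a placeholder bucket name.
    config = Elasticrawl::Config.new
    bucket = 'my-elasticrawl-data'   # placeholder; must be globally unique
    puts config.create(bucket) unless config.bucket_exists?(bucket)

    # Tear down: removes the bucket, its contents and ~/.elasticrawl.
    puts config.delete_warning
    puts config.delete
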
@@ -0,0 +1,114 @@
+ module Elasticrawl
+   # Represents a web crawl released by the Common Crawl Foundation.
+   # Each crawl is split into multiple crawl segments and is stored
+   # in the S3 public datasets bucket.
+   class Crawl < ActiveRecord::Base
+     has_many :crawl_segments
+
+     COMMON_CRAWL_BUCKET = 'aws-publicdatasets'
+     COMMON_CRAWL_PATH = 'common-crawl/crawl-data/'
+     SEGMENTS_PATH = '/segments/'
+     MAX_SEGMENTS = 256
+
+     # Returns the status of all saved crawls and the current job history.
+     def self.status(show_all = false)
+       status = ['Crawl Status']
+       Crawl.all.map { |crawl| status << crawl.status }
+
+       if show_all == true
+         header = 'Job History'
+         jobs = Job.where('job_flow_id is not null').order(:id => :desc)
+       else
+         header = 'Job History (last 10)'
+         jobs = Job.where('job_flow_id is not null').order(:id => :desc).limit(10)
+       end
+
+       status << ['', header]
+       jobs.map { |job| status << job.history }
+
+       status.join("\n")
+     end
+
+     # Returns the status of the current crawl.
+     def status
+       total = self.crawl_segments.count
+       remaining = CrawlSegment.where(:crawl_id => self.id,
+                                      :parse_time => nil).count
+       parsed = total - remaining
+       status = self.crawl_name
+       status += " Segments: to parse #{remaining}, "
+       status += "parsed #{parsed}, total #{total}"
+     end
+
+     # Checks for crawl segments in the database. If none are found then checks
+     # the S3 API and creates any segments that are found.
+     def has_segments?
+       if self.crawl_segments.count == 0
+         segment_count = create_segments
+         result = segment_count > 0
+       else
+         result = true
+       end
+     end
+
+     # Creates crawl segments from their S3 paths and returns the segment count.
+     def create_segments
+       segment_paths = s3_segment_paths(self.crawl_name)
+       save if segment_paths.count > 0
+       segment_paths.map { |s3_path| create_segment(s3_path) }
+
+       segment_paths.count
+     end
+
+     # Returns the list of segments from the database.
+     def select_segments(segments_list)
+       CrawlSegment.where(:segment_name => segments_list)
+     end
+
+     # Returns the next segments to be parsed. The maximum is 256 as this is
+     # the maximum number of steps for an Elastic MapReduce job flow.
+     def next_segments(max_segments = nil)
+       max_segments = MAX_SEGMENTS if max_segments.nil?
+       max_segments = MAX_SEGMENTS if max_segments > MAX_SEGMENTS
+
+       self.crawl_segments.where(:parse_time => nil).limit(max_segments)
+     end
+
+     # Resets parse time of all parsed segments to null so they will be parsed
+     # again. Returns the updated crawl status.
+     def reset
+       segments = CrawlSegment.where('crawl_id = ? and parse_time is not null',
+                                     self.id)
+       segments.map { |segment| segment.update_attribute(:parse_time, nil) }
+
+       status
+     end
+
+     private
+     # Creates a crawl segment based on its S3 path if it does not exist.
+     def create_segment(s3_path)
+       segment_name = s3_path.split('/').last
+       segment_s3_uri = URI::Generic.build(:scheme => 's3',
+                                           :host => COMMON_CRAWL_BUCKET,
+                                           :path => "/#{s3_path}").to_s
+
+       segment = CrawlSegment.where(:crawl_id => self.id,
+                                    :segment_name => segment_name,
+                                    :segment_s3_uri => segment_s3_uri).first_or_create
+     end
+
+     # Returns a list of S3 paths for the crawl name.
+     def s3_segment_paths(crawl_name)
+       s3_segment_tree(crawl_name).children.collect(&:prefix)
+     end
+
+     # Calls the S3 API and returns the tree structure for the crawl name.
+     def s3_segment_tree(crawl_name)
+       crawl_path = [COMMON_CRAWL_PATH, crawl_name, SEGMENTS_PATH].join
+
+       s3 = AWS::S3.new
+       bucket = s3.buckets[COMMON_CRAWL_BUCKET]
+       bucket.as_tree(:prefix => crawl_path)
+     end
+   end
+ end
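
A sketch of selecting work from a crawl, assuming the database has been loaded via Config#load_database; the crawl name is illustrative, and on first use has_segments? populates the database from the S3 API:

    # Sketch: load a crawl and pick the next unparsed segments.
    crawl = Elasticrawl::Crawl.where(:crawl_name => 'CC-MAIN-2014-10').first_or_initialize
    if crawl.has_segments?
      segments = crawl.next_segments(3)   # capped at MAX_SEGMENTS (256)
      puts crawl.status
    end
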
@@ -0,0 +1,8 @@
+ module Elasticrawl
+   # Represents a segment of a web crawl released by the Common Crawl Foundation.
+   # Each segment contains archive, metadata and text files.
+   class CrawlSegment < ActiveRecord::Base
+     belongs_to :crawl
+     has_many :job_steps
+   end
+ end
@@ -0,0 +1,22 @@
+ module Elasticrawl
+   # Base error class extends standard error.
+   class Error < StandardError; end
+
+   # AWS access credentials are invalid.
+   class AWSCredentialsInvalidError < Error; end
+
+   # Config directory does not exist.
+   class ConfigDirMissingError < Error; end
+
+   # Database error accessing sqlite database.
+   class DatabaseAccessError < Error; end
+
+   # Error accessing AWS Elastic MapReduce API.
+   class ElasticMapReduceAccessError < Error; end
+
+   # Error accessing config directory.
+   class FileAccessError < Error; end
+
+   # Error accessing AWS S3 API.
+   class S3AccessError < Error; end
+ end
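
Because every error subclasses Elasticrawl::Error, callers can rescue narrowly or catch the whole family; a sketch:

    begin
      Elasticrawl::Config.new.load_database
    rescue Elasticrawl::ConfigDirMissingError => e
      warn e.message                       # 'Config dir missing. Run init command'
    rescue Elasticrawl::Error => e
      warn "elasticrawl error: #{e.message}"
    end
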
@@ -0,0 +1,68 @@
+ module Elasticrawl
+   # The base job class that is extended by ParseJob and CombineJob.
+   class Job < ActiveRecord::Base
+     has_many :job_steps
+
+     # Displays a confirmation message showing the configuration of the
+     # Elastic MapReduce job flow and cluster.
+     def confirm_message
+       cluster = Cluster.new
+       message = []
+
+       message[0] = 'Job configuration'
+       message[1] = self.job_desc
+       message[2] = ''
+       message[3] = cluster.cluster_desc
+       message.join("\n")
+     end
+
+     # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
+     # launched successfully.
+     def result_message
+       "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
+     end
+
+     # Displays the history of the current job. Called by the status command.
+     def history
+       launch_time = "Launched: #{self.created_at.strftime('%Y-%m-%d %H:%M:%S')}"
+       "#{self.job_name} #{launch_time} #{self.job_desc}"
+     end
+
+     protected
+     # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID.
+     def run_job_flow(emr_config)
+       cluster = Cluster.new
+       job_flow = cluster.create_job_flow(self, emr_config)
+
+       job_steps.each do |step|
+         job_flow.add_step(step.job_flow_step(job_config))
+       end
+
+       begin
+         job_flow.run
+
+       rescue StandardError => e
+         raise ElasticMapReduceAccessError, e.message
+       end
+     end
+
+     # Returns an S3 location for storing either data or logs.
+     def build_s3_uri(s3_path)
+       URI::Generic.build(:scheme => 's3',
+                          :host => bucket_name,
+                          :path => s3_path).to_s
+     end
+
+     # Returns the S3 bucket name configured by the user using the init command.
+     def bucket_name
+       config = Config.new
+       config.load_config('jobs')['s3_bucket_name']
+     end
+
+     # Sets the job name which is the current Unix timestamp in milliseconds.
+     # This is the same naming format used for Common Crawl segment names.
+     def set_job_name
+       (Time.now.to_f * 1000).to_i.to_s
+     end
+   end
+ end
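
A sketch of how the status command consumes this class; querying only launched jobs mirrors Crawl.status above:

    # Sketch: print the history line of the most recently launched job.
    last_job = Elasticrawl::Job.where('job_flow_id is not null').order(:id => :desc).first
    puts last_job.history if last_job
    # => e.g. "1391458746123 Launched: 2014-02-03 12:45:46 Combining: 5 segments"
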