elasticrawl 1.0.0

@@ -0,0 +1,128 @@
+ module Elasticrawl
+   # Configures the cluster settings for the job flow that will be launched.
+   # These settings are loaded from ~/.elasticrawl/cluster.yml.
+   class Cluster
+     def initialize
+       @master_group = instance_group('master')
+       @core_group = instance_group('core')
+       @task_group = instance_group('task') if has_task_group?
+     end
+
+     # Returns a configured job flow to the calling job.
+     def create_job_flow(job, emr_config = nil)
+       config = Config.new
+       job_flow = Elasticity::JobFlow.new(config.access_key_id,
+                                          config.secret_access_key)
+       job_flow.name = "Job Name: #{job.job_name} #{job.job_desc}"
+       job_flow.log_uri = job.log_uri
+
+       configure_job_flow(job_flow)
+       configure_instances(job_flow)
+       configure_bootstrap_actions(job_flow, emr_config)
+
+       job_flow
+     end
+
+     # Describes the instances that will be launched. This is used by the
+     # job confirmation messages.
+     def cluster_desc
+       cluster_desc = <<-HERE
+ Cluster configuration
+ Master: #{instance_group_desc(@master_group)}
+ Core: #{instance_group_desc(@core_group)}
+ Task: #{instance_group_desc(@task_group)}
+       HERE
+     end
+
+     private
+     # Sets job flow properties from settings in cluster.yml.
+     def configure_job_flow(job_flow)
+       ec2_key_name = config_setting('ec2_key_name')
+       placement = config_setting('placement')
+       emr_ami_version = config_setting('emr_ami_version')
+
+       job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
+       job_flow.placement = placement if placement.present?
+       job_flow.ami_version = emr_ami_version if emr_ami_version.present?
+     end
+
+     # Configures the instances that will be launched. The master group has
+     # a single node. The task group is optional.
+     def configure_instances(job_flow)
+       job_flow.set_master_instance_group(@master_group)
+       job_flow.set_core_instance_group(@core_group)
+       job_flow.set_task_instance_group(@task_group) if @task_group.present?
+     end
+
+     # Configures bootstrap actions that will be run when each instance is
+     # launched. EMR config is an XML file of Hadoop settings stored on S3.
+     # These are applied to each node by a bootstrap action.
+     def configure_bootstrap_actions(job_flow, emr_config = nil)
+       bootstrap_scripts = config_setting('bootstrap_scripts')
+
+       if bootstrap_scripts.present?
+         bootstrap_scripts.each do |script_uri|
+           action = Elasticity::BootstrapAction.new(script_uri, '', '')
+           job_flow.add_bootstrap_action(action)
+         end
+       end
+
+       if emr_config.present?
+         action = Elasticity::HadoopFileBootstrapAction.new(emr_config)
+         job_flow.add_bootstrap_action(action)
+       end
+     end
+
+     # Returns whether cluster.yml specifies a task group.
+     def has_task_group?
+       task_config = config_for_group('task')
+       task_config.has_key?('instance_count') && task_config['instance_count'] > 0
+     end
+
+     # Describes an instance group.
+     def instance_group_desc(group)
+       if group.present?
+         if group.market == 'SPOT'
+           price = "(Spot: #{group.bid_price})"
+         else
+           price = '(On Demand)'
+         end
+
+         "#{group.count} #{group.type} #{price}"
+       else
+         '--'
+       end
+     end
+
+     # Configures an instance group with the instance type, number of instances
+     # and the bid price if spot instances are to be used.
+     def instance_group(group_name)
+       config = config_for_group(group_name)
+
+       instance_group = Elasticity::InstanceGroup.new
+       instance_group.role = group_name.upcase
+       instance_group.type = config['instance_type']
+
+       if config.has_key?('instance_count') && config['instance_count'] > 0
+         instance_group.count = config['instance_count']
+       end
+
+       if config['use_spot_instances'] == true
+         instance_group.set_spot_instances(config['bid_price'])
+       end
+
+       instance_group
+     end
+
+     # Returns the config settings for an instance group.
+     def config_for_group(group_name)
+       config_setting("#{group_name}_instance_group")
+     end
+
+     # Returns a config setting from cluster.yml.
+     def config_setting(key_name)
+       config = Config.new
+       config.load_config('cluster')[key_name]
+     end
+   end
+ end
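
A minimal usage sketch of the class above (hedged: a populated ~/.elasticrawl/cluster.yml is assumed, and `job` stands in for any Elasticrawl job object responding to job_name, job_desc and log_uri):

    cluster = Elasticrawl::Cluster.new
    puts cluster.cluster_desc               # instance groups read from cluster.yml
    job_flow = cluster.create_job_flow(job) # an Elasticity::JobFlow, not yet launched
    job_flow.run                            # submits the job flow to EMR
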
@@ -0,0 +1,86 @@
+ module Elasticrawl
+   # Represents an Elastic MapReduce job flow that combines the results of
+   # multiple Elasticrawl Parse jobs. Parse jobs write their results per
+   # segment. Combine jobs aggregate parse results into a single set of files.
+   #
+   # Inherits from Job which is the ActiveRecord model class.
+   class CombineJob < Job
+     # Takes in an array of parse jobs that are to be combined. Creates a single
+     # job step whose input paths are the outputs of the parse jobs.
+     def set_input_jobs(input_jobs)
+       segment_count = 0
+       input_paths = []
+
+       input_jobs.each do |job_name|
+         input_job = Job.where(:job_name => job_name,
+                               :type => 'Elasticrawl::ParseJob').first_or_initialize
+         step_count = input_job.job_steps.count
+
+         if step_count > 0
+           segment_count += step_count
+           input_paths << set_input_path(input_job)
+         end
+       end
+
+       self.job_name = set_job_name
+       self.job_desc = set_job_desc(segment_count)
+       job_steps.push(create_job_step(input_paths.join(',')))
+     end
+
+     # Runs the job by calling the Elastic MapReduce API.
+     def run
+       emr_config = job_config['emr_config']
+       job_flow_id = run_job_flow(emr_config)
+
+       if job_flow_id.present?
+         self.job_flow_id = job_flow_id
+         self.save
+         self.result_message
+       end
+     end
+
+     # Returns the S3 location for storing Elastic MapReduce job logs.
+     def log_uri
+       s3_path = "/logs/2-combine/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     private
+     # Returns a single job step. The input paths are a CSV list of parse
+     # job outputs.
+     def create_job_step(input_paths)
+       JobStep.create(:job => self,
+                      :input_paths => input_paths,
+                      :output_path => set_output_path)
+     end
+
+     # Returns the S3 location for reading a parse job. A wildcard is
+     # used for the segment names. The input filter depends on the output
+     # file type of the parse job and what type of compression is used.
+     def set_input_path(input_job)
+       job_name = input_job.job_name
+       input_filter = job_config['input_filter']
+
+       s3_path = "/data/1-parse/#{job_name}/segments/*/#{input_filter}"
+       build_s3_uri(s3_path)
+     end
+
+     # Returns the S3 location for storing the combine job results.
+     def set_output_path
+       s3_path = "/data/2-combine/#{self.job_name}/"
+       build_s3_uri(s3_path)
+     end
+
+     # Sets the job description which forms part of the Elastic MapReduce
+     # job flow name.
+     def set_job_desc(segment_count)
+       "Combining: #{segment_count} segments"
+     end
+
+     # Returns the combine job configuration from ~/.elasticrawl/jobs.yml.
+     def job_config
+       config = Config.new
+       config.load_config('jobs')['steps']['combine']
+     end
+   end
+ end
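
A sketch of driving a combine job (hedged: the job names are placeholders for parse jobs already recorded in the local database):

    job = Elasticrawl::CombineJob.new
    job.set_input_jobs(['1389789645620', '1389789645621'])  # placeholder parse job names
    puts job.confirm_message                                # cluster and job configuration
    puts job.run                                            # launches the EMR job flow
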
@@ -0,0 +1,242 @@
+ module Elasticrawl
+   # Represents the current configuration which is persisted to
+   # ~/.elasticrawl/ and contains 3 configuration files.
+   #
+   # aws.yml - AWS access credentials unless stored in the environment
+   # variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
+   # cluster.yml - Elastic MapReduce cluster config including instance groups.
+   # jobs.yml - Elastic MapReduce jobs config and the S3 bucket used for
+   # storing data and logs.
+   #
+   # This directory also contains the Elasticrawl SQLite database.
+   class Config
+     CONFIG_DIR = '.elasticrawl'
+     DATABASE_FILE = 'elasticrawl.sqlite3'
+     TEMPLATES_DIR = '../../templates'
+     TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml']
+
+     attr_reader :access_key_id
+     attr_reader :secret_access_key
+
+     # Sets the AWS access credentials needed for the S3 and EMR API calls.
+     def initialize(access_key_id = nil, secret_access_key = nil)
+       # Credentials have been provided to the init command.
+       @access_key_id = access_key_id
+       @secret_access_key = secret_access_key
+
+       # If credentials are not set then check if they are available in aws.yml.
+       if dir_exists?
+         config = load_config('aws')
+         key = config['access_key_id']
+         secret = config['secret_access_key']
+
+         @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
+         @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
+       end
+
+       # If credentials are still not set then check AWS environment variables.
+       @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
+       @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']
+
+       # Set AWS credentials for use when accessing the S3 API.
+       AWS.config(:access_key_id => @access_key_id,
+                  :secret_access_key => @secret_access_key)
+     end
+
+     # Returns the location of the config directory.
+     def config_dir
+       File.join(Dir.home, CONFIG_DIR)
+     end
+
+     # Checks if the configuration directory exists.
+     def dir_exists?
+       Dir.exists?(config_dir)
+     end
+
+     # Loads a YAML configuration file.
+     def load_config(config_file)
+       if dir_exists?
+         begin
+           config_file = File.join(config_dir, "#{config_file}.yml")
+           config = YAML::load(File.open(config_file))
+
+         rescue StandardError => e
+           raise FileAccessError, e.message
+         end
+       else
+         raise ConfigDirMissingError, 'Config dir missing. Run init command'
+       end
+     end
+
+     # Loads the sqlite database. If no database exists it will be created
+     # and the database migrations will be run.
+     def load_database
+       if dir_exists?
+         config = {
+           'adapter' => 'sqlite3',
+           'database' => File.join(config_dir, DATABASE_FILE),
+           'pool' => 5,
+           'timeout' => 5000
+         }
+
+         begin
+           ActiveRecord::Base.establish_connection(config)
+           ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
+             '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil)
+
+         rescue StandardError => e
+           raise DatabaseAccessError, e.message
+         end
+       else
+         raise ConfigDirMissingError, 'Config dir missing. Run init command'
+       end
+     end
+
+     # Checks if an S3 bucket name is in use.
+     def bucket_exists?(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         s3.buckets[bucket_name].exists?
+
+       rescue AWS::S3::Errors::SignatureDoesNotMatch => e
+         raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Creates the S3 bucket and config directory. Deploys the config templates
+     # and creates the sqlite database.
+     def create(bucket_name)
+       create_bucket(bucket_name)
+       deploy_templates(bucket_name)
+       load_database
+
+       status_message(bucket_name, 'created')
+     end
+
+     # Deletes the S3 bucket and config directory.
+     def delete
+       bucket_name = load_config('jobs')['s3_bucket_name']
+       delete_bucket(bucket_name)
+       delete_config_dir
+
+       status_message(bucket_name, 'deleted')
+     end
+
+     # Displayed by destroy command to confirm deletion.
+     def delete_warning
+       bucket_name = load_config('jobs')['s3_bucket_name']
+
+       message = ['WARNING:']
+       message << "Bucket s3://#{bucket_name} and its data will be deleted"
+       message << "Config dir #{config_dir} will be deleted"
+
+       message.join("\n")
+     end
+
+     # Displayed by init command.
+     def access_key_prompt
+       prompt = "Enter AWS Access Key ID:"
+       prompt += " [#{@access_key_id}]" if @access_key_id.present?
+
+       prompt
+     end
+
+     # Displayed by init command.
+     def secret_key_prompt
+       prompt = "Enter AWS Secret Access Key:"
+       prompt += " [#{@secret_access_key}]" if @secret_access_key.present?
+
+       prompt
+     end
+
+     private
+     # Creates a bucket using the S3 API.
+     def create_bucket(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         s3.buckets.create(bucket_name)
+
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Deletes a bucket and its contents using the S3 API.
+     def delete_bucket(bucket_name)
+       begin
+         s3 = AWS::S3.new
+         bucket = s3.buckets[bucket_name]
+         bucket.delete!
+
+       rescue StandardError => e
+         raise S3AccessError, e.message
+       end
+     end
+
+     # Creates config directory and copies config templates into it.
+     # Saves S3 bucket name to jobs.yml and AWS credentials to aws.yml.
+     def deploy_templates(bucket_name)
+       begin
+         Dir.mkdir(config_dir, 0755) if dir_exists? == false
+
+         TEMPLATE_FILES.each do |template_file|
+           FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
+                        File.join(config_dir, template_file))
+         end
+
+         save_config('jobs', { 'BUCKET_NAME' => bucket_name })
+         save_aws_config
+
+       rescue StandardError => e
+         raise FileAccessError, e.message
+       end
+     end
+
+     # Saves AWS access credentials to aws.yml unless they are configured as
+     # environment variables.
+     def save_aws_config
+       env_key = ENV['AWS_ACCESS_KEY_ID']
+       env_secret = ENV['AWS_SECRET_ACCESS_KEY']
+
+       creds = {}
+       creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
+       creds['SECRET_ACCESS_KEY'] = @secret_access_key \
+         unless @secret_access_key == env_secret
+
+       save_config('aws', creds)
+     end
+
+     # Saves config values by overwriting placeholder values in template.
+     def save_config(template, params)
+       config_file = File.join(config_dir, "#{template}.yml")
+       config = File.read(config_file)
+
+       params.map { |key, value| config = config.gsub(key, value) }
+
+       File.open(config_file, 'w') { |file| file.write(config) }
+     end
+
+     # Deletes the config directory including its contents.
+     def delete_config_dir
+       begin
+         FileUtils.rm_r(config_dir) if dir_exists?
+
+       rescue StandardError => e
+         raise FileAccessError, e.message
+       end
+     end
+
+     # Notifies user of results of init or destroy commands.
+     def status_message(bucket_name, state)
+       message = ['', "Bucket s3://#{bucket_name} #{state}"]
+       message << "Config dir #{config_dir} #{state}"
+
+       state = 'complete' if state == 'created'
+       message << "Config #{state}"
+
+       message.join("\n")
+     end
+   end
+ end
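
A sketch of the init-style flow this class supports (hedged: key_id, secret and the bucket name are placeholders; credentials may equally come from the AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY environment variables):

    config = Elasticrawl::Config.new(key_id, secret)   # key_id/secret are placeholders
    unless config.bucket_exists?('my-elasticrawl-data')
      puts config.create('my-elasticrawl-data')        # bucket, config templates, sqlite db
    end
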
@@ -0,0 +1,114 @@
+ module Elasticrawl
+   # Represents a web crawl released by the Common Crawl Foundation.
+   # Each crawl is split into multiple crawl segments and is stored
+   # in the S3 public datasets bucket.
+   class Crawl < ActiveRecord::Base
+     has_many :crawl_segments
+
+     COMMON_CRAWL_BUCKET = 'aws-publicdatasets'
+     COMMON_CRAWL_PATH = 'common-crawl/crawl-data/'
+     SEGMENTS_PATH = '/segments/'
+     MAX_SEGMENTS = 256
+
+     # Returns the status of all saved crawls and the current job history.
+     def self.status(show_all = false)
+       status = ['Crawl Status']
+       Crawl.all.map { |crawl| status << crawl.status }
+
+       if show_all == true
+         header = 'Job History'
+         jobs = Job.where('job_flow_id is not null').order(:id => :desc)
+       else
+         header = 'Job History (last 10)'
+         jobs = Job.where('job_flow_id is not null').order(:id => :desc).limit(10)
+       end
+
+       status << ['', header]
+       jobs.map { |job| status << job.history }
+
+       status.join("\n")
+     end
+
+     # Returns the status of the current crawl.
+     def status
+       total = self.crawl_segments.count
+       remaining = CrawlSegment.where(:crawl_id => self.id,
+                                      :parse_time => nil).count
+       parsed = total - remaining
+       status = self.crawl_name
+       status += " Segments: to parse #{remaining}, "
+       status += "parsed #{parsed}, total #{total}"
+     end
+
+     # Checks for crawl segments in the database. If none are found then checks
+     # the S3 API and creates any segments that are found.
+     def has_segments?
+       if self.crawl_segments.count == 0
+         segment_count = create_segments
+         result = segment_count > 0
+       else
+         result = true
+       end
+     end
+
+     # Creates crawl segments from their S3 paths and returns the segment count.
+     def create_segments
+       segment_paths = s3_segment_paths(self.crawl_name)
+       save if segment_paths.count > 0
+       segment_paths.map { |s3_path| create_segment(s3_path) }
+
+       segment_paths.count
+     end
+
+     # Returns the list of segments from the database.
+     def select_segments(segments_list)
+       CrawlSegment.where(:segment_name => segments_list)
+     end
+
+     # Returns the next segments to be parsed. The maximum is 256 as this is
+     # the maximum number of steps for an Elastic MapReduce job flow.
+     def next_segments(max_segments = nil)
+       max_segments = MAX_SEGMENTS if max_segments.nil?
+       max_segments = MAX_SEGMENTS if max_segments > MAX_SEGMENTS
+
+       self.crawl_segments.where(:parse_time => nil).limit(max_segments)
+     end
+
+     # Resets parse time of all parsed segments to null so they will be parsed
+     # again. Returns the updated crawl status.
+     def reset
+       segments = CrawlSegment.where('crawl_id = ? and parse_time is not null',
+                                     self.id)
+       segments.map { |segment| segment.update_attribute(:parse_time, nil) }
+
+       status
+     end
+
+     private
+     # Creates a crawl segment based on its S3 path if it does not exist.
+     def create_segment(s3_path)
+       segment_name = s3_path.split('/').last
+       segment_s3_uri = URI::Generic.build(:scheme => 's3',
+                                           :host => COMMON_CRAWL_BUCKET,
+                                           :path => "/#{s3_path}").to_s
+
+       segment = CrawlSegment.where(:crawl_id => self.id,
+                                    :segment_name => segment_name,
+                                    :segment_s3_uri => segment_s3_uri).first_or_create
+     end
+
+     # Returns a list of S3 paths for the crawl name.
+     def s3_segment_paths(crawl_name)
+       s3_segment_tree(crawl_name).children.collect(&:prefix)
+     end
+
+     # Calls the S3 API and returns the tree structure for the crawl name.
+     def s3_segment_tree(crawl_name)
+       crawl_path = [COMMON_CRAWL_PATH, crawl_name, SEGMENTS_PATH].join
+
+       s3 = AWS::S3.new
+       bucket = s3.buckets[COMMON_CRAWL_BUCKET]
+       bucket.as_tree(:prefix => crawl_path)
+     end
+   end
+ end
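
A sketch of typical use (hedged: the crawl name is illustrative; any crawl published under common-crawl/crawl-data/ works the same way):

    crawl = Elasticrawl::Crawl.where(:crawl_name => 'CC-MAIN-2014-10').first_or_initialize
    if crawl.has_segments?                # populates segments from the S3 API on first use
      segments = crawl.next_segments(5)   # up to 5 unparsed segments, capped at 256
    end
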
@@ -0,0 +1,8 @@
+ module Elasticrawl
+   # Represents a segment of a web crawl released by the Common Crawl Foundation.
+   # Each segment contains archive, metadata and text files.
+   class CrawlSegment < ActiveRecord::Base
+     belongs_to :crawl
+     has_many :job_steps
+   end
+ end
@@ -0,0 +1,22 @@
+ module Elasticrawl
+   # Base error class extends standard error.
+   class Error < StandardError; end
+
+   # AWS access credentials are invalid.
+   class AWSCredentialsInvalidError < Error; end
+
+   # Config directory does not exist.
+   class ConfigDirMissingError < Error; end
+
+   # Database error accessing sqlite database.
+   class DatabaseAccessError < Error; end
+
+   # Error accessing AWS Elastic MapReduce API.
+   class ElasticMapReduceAccessError < Error; end
+
+   # Error accessing config directory.
+   class FileAccessError < Error; end
+
+   # Error accessing AWS S3 API.
+   class S3AccessError < Error; end
+ end
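
Because every class above extends Elasticrawl::Error, a caller such as the CLI can trap the whole hierarchy with a single rescue; a minimal sketch:

    begin
      Elasticrawl::Config.new.load_database
    rescue Elasticrawl::Error => e
      warn e.message   # e.g. "Config dir missing. Run init command"
    end
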
@@ -0,0 +1,68 @@
+ module Elasticrawl
+   # The base job class that is extended by ParseJob and CombineJob.
+   class Job < ActiveRecord::Base
+     has_many :job_steps
+
+     # Displays a confirmation message showing the configuration of the
+     # Elastic MapReduce job flow and cluster.
+     def confirm_message
+       cluster = Cluster.new
+       message = []
+
+       message[0] = 'Job configuration'
+       message[1] = self.job_desc
+       message[2] = ''
+       message[3] = cluster.cluster_desc
+       message.join("\n")
+     end
+
+     # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
+     # launched successfully.
+     def result_message
+       "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
+     end
+
+     # Displays the history of the current job. Called by the status command.
+     def history
+       launch_time = "Launched: #{self.created_at.strftime('%Y-%m-%d %H:%M:%S')}"
+       "#{self.job_name} #{launch_time} #{self.job_desc}"
+     end
+
+     protected
+     # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID.
+     def run_job_flow(emr_config)
+       cluster = Cluster.new
+       job_flow = cluster.create_job_flow(self, emr_config)
+
+       job_steps.each do |step|
+         job_flow.add_step(step.job_flow_step(job_config))
+       end
+
+       begin
+         job_flow.run
+
+       rescue StandardError => e
+         raise ElasticMapReduceAccessError, e.message
+       end
+     end
+
+     # Returns an S3 location for storing either data or logs.
+     def build_s3_uri(s3_path)
+       URI::Generic.build(:scheme => 's3',
+                          :host => bucket_name,
+                          :path => s3_path).to_s
+     end
+
+     # Returns the S3 bucket name configured by the user using the init command.
+     def bucket_name
+       config = Config.new
+       config.load_config('jobs')['s3_bucket_name']
+     end
+
+     # Sets the job name which is the current Unix timestamp in milliseconds.
+     # This is the same naming format used for Common Crawl segment names.
+     def set_job_name
+       (Time.now.to_f * 1000).to_i.to_s
+     end
+   end
+ end
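
Subclasses supply the pieces these template methods rely on (a job_config, a log_uri and a run method); a minimal hypothetical subclass, for illustration only:

    class NullJob < Elasticrawl::Job
      def run
        self.job_name = set_job_name   # e.g. "1389789645620"
        self.job_desc = 'No-op job'
        run_job_flow(nil)              # would submit an empty job flow to EMR
      end

      def log_uri
        build_s3_uri("/logs/null/#{job_name}/")
      end

      private

      # run_job_flow passes this to each step; {} suffices for a job with no steps.
      def job_config
        {}
      end
    end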