elasticrawl 1.0.0
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/.gitignore +21 -0
- data/.travis.yml +5 -0
- data/Cheffile +14 -0
- data/Cheffile.lock +37 -0
- data/Gemfile +3 -0
- data/LICENSE +22 -0
- data/README.md +232 -0
- data/Rakefile +11 -0
- data/Vagrantfile +58 -0
- data/bin/elasticrawl +141 -0
- data/db/migrate/201401051536_create_crawls.rb +10 -0
- data/db/migrate/201401051855_create_crawl_segments.rb +14 -0
- data/db/migrate/201401101723_create_jobs.rb +14 -0
- data/db/migrate/201401141606_create_job_steps.rb +11 -0
- data/elasticrawl.gemspec +35 -0
- data/lib/elasticrawl/cluster.rb +128 -0
- data/lib/elasticrawl/combine_job.rb +86 -0
- data/lib/elasticrawl/config.rb +242 -0
- data/lib/elasticrawl/crawl.rb +114 -0
- data/lib/elasticrawl/crawl_segment.rb +8 -0
- data/lib/elasticrawl/error.rb +22 -0
- data/lib/elasticrawl/job.rb +68 -0
- data/lib/elasticrawl/job_step.rb +46 -0
- data/lib/elasticrawl/parse_job.rb +84 -0
- data/lib/elasticrawl/version.rb +3 -0
- data/lib/elasticrawl.rb +21 -0
- data/spec/fixtures/aws.yml +4 -0
- data/spec/fixtures/cluster.yml +44 -0
- data/spec/fixtures/jobs.yml +31 -0
- data/spec/spec_helper.rb +35 -0
- data/spec/unit/cluster_spec.rb +54 -0
- data/spec/unit/combine_job_spec.rb +97 -0
- data/spec/unit/config_spec.rb +17 -0
- data/spec/unit/crawl_segment_spec.rb +27 -0
- data/spec/unit/crawl_spec.rb +137 -0
- data/spec/unit/job_spec.rb +10 -0
- data/spec/unit/job_step_spec.rb +60 -0
- data/spec/unit/parse_job_spec.rb +130 -0
- data/templates/aws.yml +7 -0
- data/templates/cluster.yml +44 -0
- data/templates/jobs.yml +31 -0
- metadata +315 -0
data/lib/elasticrawl/cluster.rb
@@ -0,0 +1,128 @@
+module Elasticrawl
+  # Configures the cluster settings for the job flow that will be launched.
+  # These settings are loaded from ~/.elasticrawl/cluster.yml.
+  class Cluster
+    def initialize
+      @master_group = instance_group('master')
+      @core_group = instance_group('core')
+      @task_group = instance_group('task') if has_task_group?
+    end
+
+    # Returns a configured job flow to the calling job.
+    def create_job_flow(job, emr_config = nil)
+      config = Config.new
+      job_flow = Elasticity::JobFlow.new(config.access_key_id,
+                                         config.secret_access_key)
+      job_flow.name = "Job Name: #{job.job_name} #{job.job_desc}"
+      job_flow.log_uri = job.log_uri
+
+      configure_job_flow(job_flow)
+      configure_instances(job_flow)
+      configure_bootstrap_actions(job_flow, emr_config)
+
+      job_flow
+    end
+
+    # Describes the instances that will be launched. This is used by the
+    # job confirmation messages.
+    def cluster_desc
+      cluster_desc = <<-HERE
+Cluster configuration
+Master: #{instance_group_desc(@master_group)}
+Core: #{instance_group_desc(@core_group)}
+Task: #{instance_group_desc(@task_group)}
+      HERE
+    end
+
+    private
+    # Set job flow properties from settings in cluster.yml.
+    def configure_job_flow(job_flow)
+      ec2_key_name = config_setting('ec2_key_name')
+      placement = config_setting('placement')
+      emr_ami_version = config_setting('emr_ami_version')
+
+      job_flow.ec2_key_name = ec2_key_name if ec2_key_name.present?
+      job_flow.placement = placement if placement.present?
+      job_flow.ami_version = emr_ami_version if emr_ami_version.present?
+    end
+
+    # Configures the instances that will be launched. The master group has
+    # a single node. The task group is optional.
+    def configure_instances(job_flow)
+      job_flow.set_master_instance_group(@master_group)
+      job_flow.set_core_instance_group(@core_group)
+      job_flow.set_task_instance_group(@task_group) if @task_group.present?
+    end
+
+    # Configures bootstrap actions that will be run when each instance is
+    # launched. EMR config is an XML file of Hadoop settings stored on S3.
+    # These are applied to each node by a bootstrap action.
+    def configure_bootstrap_actions(job_flow, emr_config = nil)
+      bootstrap_scripts = config_setting('bootstrap_scripts')
+
+      if bootstrap_scripts.present?
+        bootstrap_scripts.each do |script_uri|
+          action = Elasticity::BootstrapAction.new(script_uri, '', '')
+          job_flow.add_bootstrap_action(action)
+        end
+      end
+
+      if emr_config.present?
+        action = Elasticity::HadoopFileBootstrapAction.new(emr_config)
+        job_flow.add_bootstrap_action(action)
+      end
+    end
+
+    # Returns whether cluster.yml specifies a task group.
+    def has_task_group?
+      task_config = config_for_group('task')
+      task_config.has_key?('instance_count') && task_config['instance_count'] > 0
+    end
+
+    # Describes an instance group.
+    def instance_group_desc(group)
+      if group.present?
+        if group.market == 'SPOT'
+          price = "(Spot: #{group.bid_price})"
+        else
+          price = '(On Demand)'
+        end
+
+        "#{group.count} #{group.type} #{price}"
+      else
+        '--'
+      end
+    end
+
+    # Configures an instance group with the instance type, # of instances and
+    # the bid price if spot instances are to be used.
+    def instance_group(group_name)
+      config = config_for_group(group_name)
+
+      instance_group = Elasticity::InstanceGroup.new
+      instance_group.role = group_name.upcase
+      instance_group.type = config['instance_type']
+
+      if config.has_key?('instance_count') && config['instance_count'] > 0
+        instance_group.count = config['instance_count']
+      end
+
+      if config['use_spot_instances'] == true
+        instance_group.set_spot_instances(config['bid_price'])
+      end
+
+      instance_group
+    end
+
+    # Returns the config settings for an instance group.
+    def config_for_group(group_name)
+      config_setting("#{group_name}_instance_group")
+    end
+
+    # Returns a config setting from cluster.yml.
+    def config_setting(key_name)
+      config = Config.new
+      config.load_config('cluster')[key_name]
+    end
+  end
+end
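To make the cluster class above concrete, here is a minimal usage sketch (not part of the diff). It assumes the `init` command has already written `~/.elasticrawl/cluster.yml`, and the `job` variable stands in for any persisted job record:

```ruby
require 'elasticrawl'

# Reads the instance groups from ~/.elasticrawl/cluster.yml.
cluster = Elasticrawl::Cluster.new

# Print the master/core/task groups that would be launched.
puts cluster.cluster_desc

# With a persisted ParseJob or CombineJob record in hand, a caller could
# build and launch the EMR job flow:
# job_flow = cluster.create_job_flow(job)
# job_flow.run
```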
data/lib/elasticrawl/combine_job.rb
@@ -0,0 +1,86 @@
+module Elasticrawl
+  # Represents an Elastic MapReduce job flow that combines the results of
+  # multiple Elasticrawl Parse jobs. Parse jobs write their results per
+  # segment. Combine jobs aggregate parse results into a single set of files.
+  #
+  # Inherits from Job which is the ActiveRecord model class.
+  class CombineJob < Job
+    # Takes in an array of parse jobs that are to be combined. Creates a single
+    # job step whose input paths are the outputs of the parse jobs.
+    def set_input_jobs(input_jobs)
+      segment_count = 0
+      input_paths = []
+
+      input_jobs.each do |job_name|
+        input_job = Job.where(:job_name => job_name,
+                              :type => 'Elasticrawl::ParseJob').first_or_initialize
+        step_count = input_job.job_steps.count
+
+        if step_count > 0
+          segment_count += step_count
+          input_paths << set_input_path(input_job)
+        end
+      end
+
+      self.job_name = set_job_name
+      self.job_desc = set_job_desc(segment_count)
+      job_steps.push(create_job_step(input_paths.join(',')))
+    end
+
+    # Runs the job by calling the Elastic MapReduce API.
+    def run
+      emr_config = job_config['emr_config']
+      job_flow_id = run_job_flow(emr_config)
+
+      if job_flow_id.present?
+        self.job_flow_id = job_flow_id
+        self.save
+        self.result_message
+      end
+    end
+
+    # Returns the S3 location for storing Elastic MapReduce job logs.
+    def log_uri
+      s3_path = "/logs/2-combine/#{self.job_name}/"
+      build_s3_uri(s3_path)
+    end
+
+    private
+    # Returns a single job step. The input paths are a CSV list of parse
+    # job outputs.
+    def create_job_step(input_paths)
+      JobStep.create(:job => self,
+                     :input_paths => input_paths,
+                     :output_path => set_output_path)
+    end
+
+    # Returns the S3 location for reading a parse job. A wildcard is
+    # used for the segment names. The input filter depends on the output
+    # file type of the parse job and what type of compression is used.
+    def set_input_path(input_job)
+      job_name = input_job.job_name
+      input_filter = job_config['input_filter']
+
+      s3_path = "/data/1-parse/#{job_name}/segments/*/#{input_filter}"
+      build_s3_uri(s3_path)
+    end
+
+    # Returns the S3 location for storing the combine job results.
+    def set_output_path
+      s3_path = "/data/2-combine/#{self.job_name}/"
+      build_s3_uri(s3_path)
+    end
+
+    # Sets the job description which forms part of the Elastic MapReduce
+    # job flow name.
+    def set_job_desc(segment_count)
+      "Combining: #{segment_count} segments"
+    end
+
+    # Returns the combine job configuration from ~/.elasticrawl/jobs.yml.
+    def job_config
+      config = Config.new
+      config.load_config('jobs')['steps']['combine']
+    end
+  end
+end
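As a hypothetical illustration of the flow above (the job names are made-up millisecond timestamps, matching the `set_job_name` format in the Job base class), a combine job might be driven like this:

```ruby
require 'elasticrawl'

Elasticrawl::Config.new.load_database   # connect to the SQLite database

# Aggregate the per-segment output of two previously run parse jobs.
job = Elasticrawl::CombineJob.new
job.set_input_jobs(['1389789645620', '1389789645621'])

puts job.confirm_message   # cluster and job configuration
# puts job.run             # would launch the EMR job flow and print its ID
```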
data/lib/elasticrawl/config.rb
@@ -0,0 +1,242 @@
+module Elasticrawl
+  # Represents the current configuration which is persisted to
+  # ~/.elasticrawl/ and contains 3 configuration files.
+  #
+  # aws.yml - AWS access credentials unless stored in the environment
+  # variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.
+  # cluster.yml - Elastic MapReduce cluster config including instance groups.
+  # jobs.yml - Elastic MapReduce jobs config and the S3 bucket used for
+  # storing data and logs.
+  #
+  # This directory also contains the Elasticrawl SQLite database.
+  class Config
+    CONFIG_DIR = '.elasticrawl'
+    DATABASE_FILE = 'elasticrawl.sqlite3'
+    TEMPLATES_DIR = '../../templates'
+    TEMPLATE_FILES = ['aws.yml', 'cluster.yml', 'jobs.yml']
+
+    attr_reader :access_key_id
+    attr_reader :secret_access_key
+
+    # Sets the AWS access credentials needed for the S3 and EMR API calls.
+    def initialize(access_key_id = nil, secret_access_key = nil)
+      # Credentials have been provided to the init command.
+      @access_key_id = access_key_id
+      @secret_access_key = secret_access_key
+
+      # If credentials are not set then check if they are available in aws.yml.
+      if dir_exists?
+        config = load_config('aws')
+        key = config['access_key_id']
+        secret = config['secret_access_key']
+
+        @access_key_id ||= key unless key == 'ACCESS_KEY_ID'
+        @secret_access_key ||= secret unless secret == 'SECRET_ACCESS_KEY'
+      end
+
+      # If credentials are still not set then check AWS environment variables.
+      @access_key_id ||= ENV['AWS_ACCESS_KEY_ID']
+      @secret_access_key ||= ENV['AWS_SECRET_ACCESS_KEY']
+
+      # Set AWS credentials for use when accessing the S3 API.
+      AWS.config(:access_key_id => @access_key_id,
+                 :secret_access_key => @secret_access_key)
+    end
+
+    # Returns the location of the config directory.
+    def config_dir
+      File.join(Dir.home, CONFIG_DIR)
+    end
+
+    # Checks if the configuration directory exists.
+    def dir_exists?
+      Dir.exists?(config_dir)
+    end
+
+    # Loads a YAML configuration file.
+    def load_config(config_file)
+      if dir_exists?
+        begin
+          config_file = File.join(config_dir, "#{config_file}.yml")
+          config = YAML::load(File.open(config_file))
+
+        rescue StandardError => e
+          raise FileAccessError, e.message
+        end
+      else
+        raise ConfigDirMissingError, 'Config dir missing. Run init command'
+      end
+    end
+
+    # Loads the sqlite database. If no database exists it will be created
+    # and the database migrations will be run.
+    def load_database
+      if dir_exists?
+        config = {
+          'adapter' => 'sqlite3',
+          'database' => File.join(config_dir, DATABASE_FILE),
+          'pool' => 5,
+          'timeout' => 5000
+        }
+
+        begin
+          ActiveRecord::Base.establish_connection(config)
+          ActiveRecord::Migrator.migrate(File.join(File.dirname(__FILE__), \
+            '../../db/migrate'), ENV['VERSION'] ? ENV['VERSION'].to_i : nil)
+
+        rescue StandardError => e
+          raise DatabaseAccessError, e.message
+        end
+      else
+        raise ConfigDirMissingError, 'Config dir missing. Run init command'
+      end
+    end
+
+    # Checks if an S3 bucket name is in use.
+    def bucket_exists?(bucket_name)
+      begin
+        s3 = AWS::S3.new
+        s3.buckets[bucket_name].exists?
+
+      rescue AWS::S3::Errors::SignatureDoesNotMatch => e
+        raise AWSCredentialsInvalidError, 'AWS access credentials are invalid'
+      rescue StandardError => e
+        raise S3AccessError, e.message
+      end
+    end
+
+    # Creates the S3 bucket and config directory. Deploys the config templates
+    # and creates the sqlite database.
+    def create(bucket_name)
+      create_bucket(bucket_name)
+      deploy_templates(bucket_name)
+      load_database
+
+      status_message(bucket_name, 'created')
+    end
+
+    # Deletes the S3 bucket and config directory.
+    def delete
+      bucket_name = load_config('jobs')['s3_bucket_name']
+      delete_bucket(bucket_name)
+      delete_config_dir
+
+      status_message(bucket_name, 'deleted')
+    end
+
+    # Displayed by destroy command to confirm deletion.
+    def delete_warning
+      bucket_name = load_config('jobs')['s3_bucket_name']
+
+      message = ['WARNING:']
+      message << "Bucket s3://#{bucket_name} and its data will be deleted"
+      message << "Config dir #{config_dir} will be deleted"
+
+      message.join("\n")
+    end
+
+    # Displayed by init command.
+    def access_key_prompt
+      prompt = "Enter AWS Access Key ID:"
+      prompt += " [#{@access_key_id}]" if @access_key_id.present?
+
+      prompt
+    end
+
+    # Displayed by init command.
+    def secret_key_prompt
+      prompt = "Enter AWS Secret Access Key:"
+      prompt += " [#{@secret_access_key}]" if @secret_access_key.present?
+
+      prompt
+    end
+
+    private
+    # Creates a bucket using the S3 API.
+    def create_bucket(bucket_name)
+      begin
+        s3 = AWS::S3.new
+        s3.buckets.create(bucket_name)
+
+      rescue StandardError => e
+        raise S3AccessError, e.message
+      end
+    end
+
+    # Deletes a bucket and its contents using the S3 API.
+    def delete_bucket(bucket_name)
+      begin
+        s3 = AWS::S3.new
+        bucket = s3.buckets[bucket_name]
+        bucket.delete!
+
+      rescue StandardError => e
+        raise S3AccessError, e.message
+      end
+    end
+
+    # Creates config directory and copies config templates into it.
+    # Saves S3 bucket name to jobs.yml and AWS credentials to aws.yml.
+    def deploy_templates(bucket_name)
+      begin
+        Dir.mkdir(config_dir, 0755) if dir_exists? == false
+
+        TEMPLATE_FILES.each do |template_file|
+          FileUtils.cp(File.join(File.dirname(__FILE__), TEMPLATES_DIR, template_file),
+                       File.join(config_dir, template_file))
+        end
+
+        save_config('jobs', { 'BUCKET_NAME' => bucket_name })
+        save_aws_config
+
+      rescue StandardError => e
+        raise FileAccessError, e.message
+      end
+    end
+
+    # Saves AWS access credentials to aws.yml unless they are configured as
+    # environment variables.
+    def save_aws_config
+      env_key = ENV['AWS_ACCESS_KEY_ID']
+      env_secret = ENV['AWS_SECRET_ACCESS_KEY']
+
+      creds = {}
+      creds['ACCESS_KEY_ID'] = @access_key_id unless @access_key_id == env_key
+      creds['SECRET_ACCESS_KEY'] = @secret_access_key \
+        unless @secret_access_key == env_secret
+
+      save_config('aws', creds)
+    end
+
+    # Saves config values by overwriting placeholder values in template.
+    def save_config(template, params)
+      config_file = File.join(config_dir, "#{template}.yml")
+      config = File.read(config_file)
+
+      params.map { |key, value| config = config.gsub(key, value) }
+
+      File.open(config_file, 'w') { |file| file.write(config) }
+    end
+
+    # Deletes the config directory including its contents.
+    def delete_config_dir
+      begin
+        FileUtils.rm_r(config_dir) if dir_exists?
+
+      rescue StandardError => e
+        raise FileAccessError, e.message
+      end
+    end
+
+    # Notifies user of results of init or destroy commands.
+    def status_message(bucket_name, state)
+      message = ['', "Bucket s3://#{bucket_name} #{state}"]
+      message << "Config dir #{config_dir} #{state}"
+
+      state = 'complete' if state == 'created'
+      message << "Config #{state}"
+
+      message.join("\n")
+    end
+  end
+end
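A minimal sketch of the init flow this class implements, assuming valid credentials; the bucket name is an illustrative placeholder and must be globally unique:

```ruby
require 'elasticrawl'

config = Elasticrawl::Config.new('MY-ACCESS-KEY-ID', 'MY-SECRET-ACCESS-KEY')

bucket = 'elasticrawl-example-bucket'   # hypothetical bucket name
unless config.bucket_exists?(bucket)
  # Creates the bucket, deploys the config templates to ~/.elasticrawl/
  # and creates the SQLite database, then returns a status message.
  puts config.create(bucket)
end
```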
data/lib/elasticrawl/crawl.rb
@@ -0,0 +1,114 @@
+module Elasticrawl
+  # Represents a web crawl released by the Common Crawl Foundation.
+  # Each crawl is split into multiple crawl segments and is stored
+  # in the S3 public datasets bucket.
+  class Crawl < ActiveRecord::Base
+    has_many :crawl_segments
+
+    COMMON_CRAWL_BUCKET = 'aws-publicdatasets'
+    COMMON_CRAWL_PATH = 'common-crawl/crawl-data/'
+    SEGMENTS_PATH = '/segments/'
+    MAX_SEGMENTS = 256
+
+    # Returns the status of all saved crawls and the current job history.
+    def self.status(show_all = false)
+      status = ['Crawl Status']
+      Crawl.all.map { |crawl| status << crawl.status }
+
+      if show_all == true
+        header = 'Job History'
+        jobs = Job.where('job_flow_id is not null').order(:id => :desc)
+      else
+        header = 'Job History (last 10)'
+        jobs = Job.where('job_flow_id is not null').order(:id => :desc).limit(10)
+      end
+
+      status << ['', header]
+      jobs.map { |job| status << job.history }
+
+      status.join("\n")
+    end
+
+    # Returns the status of the current crawl.
+    def status
+      total = self.crawl_segments.count
+      remaining = CrawlSegment.where(:crawl_id => self.id,
+                                     :parse_time => nil).count
+      parsed = total - remaining
+      status = self.crawl_name
+      status += " Segments: to parse #{remaining}, "
+      status += "parsed #{parsed}, total #{total}"
+    end
+
+    # Checks for crawl segments in the database. If none are found then checks
+    # the S3 API and creates any segments that are found.
+    def has_segments?
+      if self.crawl_segments.count == 0
+        segment_count = create_segments
+        result = segment_count > 0
+      else
+        result = true
+      end
+    end
+
+    # Creates crawl segments from their S3 paths and returns the segment count.
+    def create_segments
+      segment_paths = s3_segment_paths(self.crawl_name)
+      save if segment_paths.count > 0
+      segment_paths.map { |s3_path| create_segment(s3_path) }
+
+      segment_paths.count
+    end
+
+    # Returns the list of segments from the database.
+    def select_segments(segments_list)
+      CrawlSegment.where(:segment_name => segments_list)
+    end
+
+    # Returns the next segments to be parsed. The maximum is 256 as this is
+    # the maximum number of steps for an Elastic MapReduce job flow.
+    def next_segments(max_segments = nil)
+      max_segments = MAX_SEGMENTS if max_segments.nil?
+      max_segments = MAX_SEGMENTS if max_segments > MAX_SEGMENTS
+
+      self.crawl_segments.where(:parse_time => nil).limit(max_segments)
+    end
+
+    # Resets parse time of all parsed segments to null so they will be parsed
+    # again. Returns the updated crawl status.
+    def reset
+      segments = CrawlSegment.where('crawl_id = ? and parse_time is not null',
+                                    self.id)
+      segments.map { |segment| segment.update_attribute(:parse_time, nil) }
+
+      status
+    end
+
+    private
+    # Creates a crawl segment based on its S3 path if it does not exist.
+    def create_segment(s3_path)
+      segment_name = s3_path.split('/').last
+      segment_s3_uri = URI::Generic.build(:scheme => 's3',
+                                          :host => COMMON_CRAWL_BUCKET,
+                                          :path => "/#{s3_path}").to_s
+
+      segment = CrawlSegment.where(:crawl_id => self.id,
+                                   :segment_name => segment_name,
+                                   :segment_s3_uri => segment_s3_uri).first_or_create
+    end
+
+    # Returns a list of S3 paths for the crawl name.
+    def s3_segment_paths(crawl_name)
+      s3_segment_tree(crawl_name).children.collect(&:prefix)
+    end
+
+    # Calls the S3 API and returns the tree structure for the crawl name.
+    def s3_segment_tree(crawl_name)
+      crawl_path = [COMMON_CRAWL_PATH, crawl_name, SEGMENTS_PATH].join
+
+      s3 = AWS::S3.new
+      bucket = s3.buckets[COMMON_CRAWL_BUCKET]
+      bucket.as_tree(:prefix => crawl_path)
+    end
+  end
+end
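A short sketch of how these methods fit together, assuming the config and database exist; the crawl name is illustrative:

```ruby
require 'elasticrawl'

Elasticrawl::Config.new.load_database

crawl = Elasticrawl::Crawl.where(:crawl_name => 'CC-MAIN-2013-48').first_or_initialize

# has_segments? lazily populates crawl_segments from the S3 API on first use.
if crawl.has_segments?
  puts crawl.status
  crawl.next_segments(3).each { |segment| puts segment.segment_name }
end
```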
data/lib/elasticrawl/error.rb
@@ -0,0 +1,22 @@
+module Elasticrawl
+  # Base error class extends standard error.
+  class Error < StandardError; end
+
+  # AWS access credentials are invalid.
+  class AWSCredentialsInvalidError < Error; end
+
+  # Config directory does not exist.
+  class ConfigDirMissingError < Error; end
+
+  # Database error accessing sqlite database.
+  class DatabaseAccessError < Error; end
+
+  # Error accessing AWS Elastic MapReduce API.
+  class ElasticMapReduceAccessError < Error; end
+
+  # Error accessing config directory.
+  class FileAccessError < Error; end
+
+  # Error accessing AWS S3 API.
+  class S3AccessError < Error; end
+end
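Since every failure surfaces as a subclass of `Elasticrawl::Error`, a caller can rescue narrowly or broadly; a hypothetical sketch:

```ruby
begin
  Elasticrawl::Config.new.load_config('jobs')
rescue Elasticrawl::ConfigDirMissingError => e
  puts e.message                            # "Config dir missing. Run init command"
rescue Elasticrawl::Error => e
  puts "Elasticrawl error: #{e.message}"    # any other wrapped failure
end
```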
data/lib/elasticrawl/job.rb
@@ -0,0 +1,68 @@
+module Elasticrawl
+  # The base job class that is extended by ParseJob and CombineJob.
+  class Job < ActiveRecord::Base
+    has_many :job_steps
+
+    # Displays a confirmation message showing the configuration of the
+    # Elastic MapReduce job flow and cluster.
+    def confirm_message
+      cluster = Cluster.new
+      message = []
+
+      message[0] = 'Job configuration'
+      message[1] = self.job_desc
+      message[2] = ''
+      message[3] = cluster.cluster_desc
+      message.join("\n")
+    end
+
+    # Displays the Job Name and Elastic MapReduce Job Flow ID if the job was
+    # launched successfully.
+    def result_message
+      "\nJob Name: #{self.job_name} Job Flow ID: #{self.job_flow_id}"
+    end
+
+    # Displays the history of the current job. Called by the status command.
+    def history
+      launch_time = "Launched: #{self.created_at.strftime('%Y-%m-%d %H:%M:%S')}"
+      "#{self.job_name} #{launch_time} #{self.job_desc}"
+    end
+
+    protected
+    # Calls the Elastic MapReduce API to create a Job Flow. Returns the Job Flow ID.
+    def run_job_flow(emr_config)
+      cluster = Cluster.new
+      job_flow = cluster.create_job_flow(self, emr_config)
+
+      job_steps.each do |step|
+        job_flow.add_step(step.job_flow_step(job_config))
+      end
+
+      begin
+        job_flow.run
+
+      rescue StandardError => e
+        raise ElasticMapReduceAccessError, e.message
+      end
+    end
+
+    # Returns an S3 location for storing either data or logs.
+    def build_s3_uri(s3_path)
+      URI::Generic.build(:scheme => 's3',
+                         :host => bucket_name,
+                         :path => s3_path).to_s
+    end
+
+    # Returns the S3 bucket name configured by the user using the init command.
+    def bucket_name
+      config = Config.new
+      config.load_config('jobs')['s3_bucket_name']
+    end
+
+    # Sets the job name which is the current Unix timestamp in milliseconds.
+    # This is the same naming format used for Common Crawl segment names.
+    def set_job_name
+      (Time.now.to_f * 1000).to_i.to_s
+    end
+  end
+end
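The ParseJob and CombineJob classes elsewhere in this release follow a common pattern on top of this base class; a hypothetical subclass (not in the gem) mainly supplies its own S3 layout and steps:

```ruby
class ExampleJob < Elasticrawl::Job
  def log_uri
    build_s3_uri("/logs/example/#{self.job_name}/")   # per-job log location
  end

  def run
    self.job_name = set_job_name            # Unix timestamp in milliseconds
    self.job_flow_id = run_job_flow(nil)    # calls the EMR API
    save
    result_message
  end
end
```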