monkeyshines 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,169 @@
|
|
1
|
+
require 'right_aws'
module Monkeyshines
  module Store
    # NOTE(review): this file ships as monkeyshines/repository/s3.rb and the
    # class subclasses Monkeyshines::Repository::Base, yet it is namespaced
    # under Monkeyshines::Store -- confirm whether `Store` or `Repository` is
    # the intended module before relying on the constant path.
    #
    # Large portions lifted from Thoughtbot's Paperclip gem.
    #
    # Amazon's S3 file hosting service is a scalable, easy place to store files for
    # distribution. You can find out more about it at http://aws.amazon.com/s3
    # There are a few S3-specific options for has_attached_file:
    # * +s3_credentials+: Takes a path, a File, or a Hash. The path (or File) must point
    #   to a YAML file containing the +access_key_id+ and +secret_access_key+ that Amazon
    #   gives you. You can 'environment-space' this just like you do to your
    #   database.yml file, so different environments can use different accounts:
    #     development:
    #       access_key_id: 123...
    #       secret_access_key: 123...
    #     test:
    #       access_key_id: abc...
    #       secret_access_key: abc...
    #     production:
    #       access_key_id: 456...
    #       secret_access_key: 456...
    #   This is not required, however, and the file may simply look like this:
    #     access_key_id: 456...
    #     secret_access_key: 456...
    #   In which case, those access keys will be used in all environments. You can also
    #   put your bucket name in this file, instead of adding it to the code directly.
    #   This is useful when you want the same account but a different bucket for
    #   development versus production.
    # * +s3_permissions+: This is a String that should be one of the "canned" access
    #   policies that S3 provides (more information can be found here:
    #   http://docs.amazonwebservices.com/AmazonS3/2006-03-01/RESTAccessPolicy.html#RESTCannedAccessPolicies)
    #   The default for Paperclip is "public-read".
    # * +s3_protocol+: The protocol for the URLs generated to your S3 assets. Can be either
    #   'http' or 'https'. Defaults to 'http' when your :s3_permissions are 'public-read' (the
    #   default), and 'https' when your :s3_permissions are anything else.
    # * +s3_headers+: A hash of headers such as {'Expires' => 1.year.from_now.httpdate}
    # * +bucket+: This is the name of the S3 bucket that will store your files. Remember
    #   that the bucket must be unique across all of Amazon S3. If the bucket does not exist
    #   Paperclip will attempt to create it. The bucket name will not be interpolated.
    #   You can define the bucket as a Proc if you want to determine it's name at runtime.
    #   Paperclip will call that Proc with attachment as the only argument.
    # * +s3_host_alias+: The fully-qualified domain name (FQDN) that is the alias to the
    #   S3 domain of your bucket. Used with the :s3_alias_url url interpolation. See the
    #   link in the +url+ entry for more information about S3 domains and buckets.
    # * +url+: There are three options for the S3 url. You can choose to have the bucket's name
    #   placed domain-style (bucket.s3.amazonaws.com) or path-style (s3.amazonaws.com/bucket).
    #   Lastly, you can specify a CNAME (which requires the CNAME to be specified as
    #   :s3_alias_url. You can read more about CNAMEs and S3 at
    #   http://docs.amazonwebservices.com/AmazonS3/latest/index.html?VirtualHosting.html
    #   Normally, this won't matter in the slightest and you can leave the default (which is
    #   path-style, or :s3_path_url). But in some cases paths don't work and you need to use
    #   the domain-style (:s3_domain_url). Anything else here will be treated like path-style.
    #   NOTE: If you use a CNAME for use with CloudFront, you can NOT specify https as your
    #   :s3_protocol; This is *not supported* by S3/CloudFront. Finally, when using the host
    #   alias, the :bucket parameter is ignored, as the hostname is used as the bucket name
    #   by S3.
    # * +path+: This is the key under the bucket in which the file will be stored. The
    #   URL will be constructed from the bucket and the path. This is what you will want
    #   to interpolate. Keys should be unique, like filenames, and despite the fact that
    #   S3 (strictly speaking) does not support directories, you can still use a / to
    #   separate parts of your file name.
    class S3Repository < Monkeyshines::Repository::Base
      attr_reader :bucket_name, :s3_host_alias, :s3_protocol

      # Builds an S3-backed repository.
      #
      # @param options [Hash] accepts :s3_credentials (path, File or Hash --
      #   see #find_credentials), :bucket (String or Proc), :s3_options,
      #   :s3_permissions, :s3_headers, :s3_protocol and :s3_host_alias, all
      #   described in the class comment above.
      def initialize options={}
        # BUG FIX: the original read @options throughout this method but never
        # assigned it, so every lookup below raised NoMethodError on nil.
        @options        = options
        @s3_credentials = parse_credentials(@options[:s3_credentials])
        @bucket         = @options[:bucket] || @s3_credentials[:bucket]
        @bucket         = @bucket.call(self) if @bucket.is_a?(Proc)
        # BUG FIX: attr_reader :bucket_name always returned nil because
        # @bucket_name was never assigned, so filename_pattern_tokens
        # interpolated an empty bucket into every URL.
        @bucket_name    = @bucket
        @s3_options     = @options[:s3_options]     || {}
        @s3_permissions = @options[:s3_permissions] || 'public-read'
        @s3_protocol    = @options[:s3_protocol]    || (@s3_permissions == 'public-read' ? 'http' : 'https')
        @s3_headers     = @options[:s3_headers]     || {}
        @s3_host_alias  = @options[:s3_host_alias]
      end

      #
      # Implementation of Monkeyshines::Repository
      #

      # True when an object with the given key exists in the bucket.
      def exists?(filename)
        s3_bucket.key(filename) ? true : false
      end

      # TODO: stub -- always returns nil. Presumably intended to report the
      # remote object's MD5 (its S3 ETag); implement or remove.
      def md5(filename)
      end

      #
      # s3 interface
      #

      # Use with Monkeyshines::Utils::FilenamePattern to generate urls to s3 files.
      # Ex:
      #   s3_url = FilenamePattern.new(":s3_path_url_base/path/to/file.ext", s3_repo.filename_pattern_tokens)
      #
      # Returns the three URL-base styles (domain-style, CNAME-alias, and
      # path-style) built from this repository's protocol / bucket / alias.
      def filename_pattern_tokens
        { :s3_domain_url_base => "#{s3_protocol}://#{bucket_name}.s3.amazonaws.com",
          :s3_alias_url_base  => "#{s3_protocol}://#{s3_host_alias}",
          :s3_path_url_base   => "#{s3_protocol}://s3.amazonaws.com/#{bucket_name}", }
      end

      # Memoized RightAws::S3 connection built from the parsed credentials.
      def s3
        @s3 ||= RightAws::S3.new(@s3_credentials[:access_key_id],
          @s3_credentials[:secret_access_key],
          @s3_options)
      end

      # Memoized bucket handle; the +true+ flag asks RightAws to create the
      # bucket if it does not already exist, with @s3_permissions applied.
      def s3_bucket
        @s3_bucket ||= s3.bucket(@bucket, true, @s3_permissions)
      end

      # Normalizes whatever was passed as :s3_credentials into a Hash.
      def parse_credentials creds
        find_credentials(creds)
      end

      # Accepts a File or String path to a YAML file, or a ready-made Hash;
      # raises ArgumentError for anything else.
      def find_credentials creds
        case creds
        when File
          YAML.load_file(creds.path)
        when String
          YAML.load_file(creds)
        when Hash
          creds
        else
          raise ArgumentError, "Credentials are not a path, file, or hash."
        end
      end
      private :find_credentials

      # # Returns representation of the data of the file assigned to the given
      # # style, in the format most representative of the current storage.
      # def to_file path
      #   @queued_for_write[path] || s3_bucket.key(path(style))
      # end
      # alias_method :to_io, :to_file
      #
      # def flush_writes #:nodoc:
      #   @queued_for_write.each do |style, file|
      #     begin
      #       log("saving #{path(style)}")
      #       key = s3_bucket.key(path(style))
      #       key.data = file
      #       key.put(nil, @s3_permissions, {'Content-type' => instance_read(:content_type)}.merge(@s3_headers))
      #     rescue RightAws::AwsError => e
      #       raise
      #     end
      #   end
      #   @queued_for_write = {}
      # end
      #
      # def flush_deletes #:nodoc:
      #   @queued_for_delete.each do |path|
      #     begin
      #       log("deleting #{path}")
      #       if file = s3_bucket.key(path)
      #         file.delete
      #       end
      #     rescue RightAws::AwsError
      #       # Ignore this.
      #     end
      #   end
      #   @queued_for_delete = []
      # end

    end

  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Monkeyshines
  # Namespace for the pluggable request-stream implementations. Extending
  # FactoryModule presumably supplies the `create`-style factory used by
  # Monkeyshines::Runner#create_source -- confirm against
  # monkeyshines/utils/factory_module.rb.
  module RequestStream
    extend FactoryModule
    # Lazily load each stream class on first constant reference.
    autoload :Base,                   'monkeyshines/request_stream/base'
    autoload :KlassRequestStream,     'monkeyshines/request_stream/klass_request_stream'
    # KlassHashRequestStream lives in the same file as KlassRequestStream.
    autoload :KlassHashRequestStream, 'monkeyshines/request_stream/klass_request_stream'
    autoload :SimpleRequestStream,    'monkeyshines/request_stream/simple_request_stream'
    # NOTE(review): monkeyshines/request_stream/beanstalk_queue is not listed
    # in the gem manifest -- referencing BeanstalkQueue will raise LoadError.
    autoload :BeanstalkQueue,         'monkeyshines/request_stream/beanstalk_queue'
    autoload :EdamameQueue,           'monkeyshines/request_stream/edamame_queue'
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Monkeyshines
  module RequestStream

    #
    # RequestStream::Base
    #
    # Common scaffolding for request streams: merges the caller's options over
    # the class defaults, then delegates iteration, #put and #skip! to the
    # backing #request_store (defined by subclasses).
    #
    class Base
      attr_accessor :options
      Base::DEFAULT_OPTIONS = {}

      # Combine defaults with the supplied options and log the result.
      def initialize _options={}
        merged = Base::DEFAULT_OPTIONS.deep_merge(_options)
        self.options = merged
        Log.debug "New #{self.class} as #{options.inspect}"
      end

      # Walk the underlying store, converting each raw entry into a request
      # object (via the subclass's #request_from_raw) before yielding it.
      def each *args, &block
        self.request_store.each(*args) do |*raw_req_args|
          yield request_from_raw(*raw_req_args)
        end
      end

      # Hand a request straight through to the backing store.
      def put *args
        request_store.put(*args)
      end

      # Ask the backing store to skip ahead (e.g. past already-seen entries).
      def skip! *args
        request_store.skip!(*args)
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Monkeyshines
  module RequestStream
    #
    # Watch for jobs in an Edamame priority queue
    # (http://mrflip.github.com/edamame)
    #
    # Adapts an Edamame::Broker so scrape requests can be enqueued (#put) and
    # consumed (#each) like any other request stream.
    #
    class EdamameQueue < Edamame::Broker
      # How long to wait for tasks
      # NOTE: cattr_accessor is class-level state, so assigning a per-instance
      # timeout in #initialize changes it for ALL EdamameQueue instances.
      cattr_accessor :queue_request_timeout
      self.queue_request_timeout = 5 * 60 # seconds
      # priority for search jobs if not otherwise given
      # NOTE(review): this constant is not referenced anywhere in this file;
      # #req_to_job hard-codes 66000 + 1000*req_generation instead. Confirm
      # which is intended.
      QUEUE_PRIORITY = 65536

      # Derives the beanstalk tube name from the global :handle config
      # (underscores become dashes) and optionally overrides the shared
      # queue_request_timeout.
      def initialize _options
        tube = Monkeyshines::CONFIG[:handle].to_s.gsub(/_/, '-')
        super _options.deep_merge( :tube => tube )
        if _options[:queue_request_timeout]
          Log.info "Setting timeout to #{_options[:queue_request_timeout]}"
          self.queue_request_timeout = _options[:queue_request_timeout]
        end
      end

      # def each klass, &block
      #   work(queue_request_timeout, klass) do |job|
      #     job.each_request(&block)
      #   end
      #   Log.info [queue, queue.beanstalk_stats]
      # end

      # Blocks on the queue (via Edamame::Broker#work), yielding each job's
      # type string and payload hash; logs queue stats when work returns.
      def each &block
        work(queue_request_timeout) do |job|
          yield job.obj['type'], job.obj
        end
        Log.info [queue, queue.beanstalk_stats]
      end

      # Wraps a scrape request in an Edamame::Job: the payload hash carries the
      # request's class name ('type') and a "Class-key" composite 'key';
      # priority grows with the request's generation.
      def req_to_job req, job_options={}
        obj_hash = req.to_hash.merge(
          'type' => req.class.to_s,
          'key'  => [req.class.to_s, req.key].join('-') )
        Edamame::Job.from_hash(job_options.merge("obj" => obj_hash,
            'priority' => (66000 + 1000*req.req_generation),
            'tube'     => tube ))
      end

      # Enqueue a job. Bare scrape requests are first converted via
      # #req_to_job; ready-made Beanstalk::Job / Edamame::Job objects pass
      # through untouched.
      def put job, *args
        job_options = args.extract_options!
        job = req_to_job(job, job_options) unless job.is_a?(Beanstalk::Job) || job.is_a?(Edamame::Job)
        # p [self.class, job.key, job.obj,job.scheduling, job_options, args]
        super job, *args
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Monkeyshines
  module RequestStream

    #
    # KlassRequestStream is an abstract factory for requests -- the first arg
    # gives the request type
    #
    # Each raw store entry names a request class; that class is looked up in
    # :klass_scope and instantiated with the remaining arguments.
    #
    class KlassRequestStream < Base
      attr_accessor :request_store, :klass_scope
      KlassRequestStream::DEFAULT_OPTIONS = {
        :store       => { :type => :flat_file_store },
        :klass_scope => Kernel,
      }

      # Merge class defaults, then build the backing store from the :store
      # sub-options and record the namespace used for class lookup.
      def initialize _options={}
        super(KlassRequestStream::DEFAULT_OPTIONS.deep_merge(_options))
        self.request_store = Monkeyshines::Store.create(options.merge(options[:store]))
        self.klass_scope   = options[:klass_scope]
      end

      #
      # use the first arg as a klass name
      # to create a scrape request using rest of args
      #
      def request_from_raw klass_name, *raw_req_args
        FactoryModule.get_class(klass_scope, klass_name).new(*raw_req_args)
      end
    end

    # Like KlassRequestStream, but each raw entry is (class name, hash) and
    # the request class is rehydrated via .from_hash rather than .new.
    class KlassHashRequestStream < KlassRequestStream
      def request_from_raw klass_name, hsh
        FactoryModule.get_class(klass_scope, klass_name).from_hash(hsh)
      end
    end

  end
end
|
39
|
+
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Monkeyshines
  module RequestStream

    #
    # SimpleRequestStream generates an instance of options[:klass] from each element of its store
    #
    # Unlike KlassRequestStream, the request class is fixed up front
    # (default Monkeyshines::ScrapeRequest) instead of read from each entry.
    #
    class SimpleRequestStream < KlassRequestStream
      attr_accessor :klass
      SimpleRequestStream::DEFAULT_OPTIONS = {
        :klass => Monkeyshines::ScrapeRequest,
      }

      # Merge defaults and remember the request class to instantiate.
      def initialize _options={}
        # CONSISTENCY FIX: the sibling streams (Base, KlassRequestStream)
        # combine defaults with deep_merge; plain #merge here would clobber
        # nested default hashes wholesale instead of merging them key-by-key.
        super SimpleRequestStream::DEFAULT_OPTIONS.deep_merge(_options)
        self.klass = options[:klass]
      end

      # Build one request of the configured class from a raw store entry.
      def request_from_raw *raw_req_args
        klass.new(*raw_req_args)
      end
    end

  end
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'yaml'
require 'monkeyshines/runner_core/options'

module Monkeyshines
  # Orchestrates one scrape: pulls requests from #source, fetches each with
  # #fetcher, writes results to #dest, and logs progress periodically.
  class Runner
    attr_accessor :options
    attr_accessor :fetcher      # fetches each request (see Monkeyshines::Fetcher)
    attr_accessor :source       # request stream to consume
    attr_accessor :dest         # store results are written to
    attr_accessor :periodic_log
    attr_accessor :sleep_time, :force_fetch

    # Baseline settings; overridden by Monkeyshines::CONFIG and the hashes
    # passed to #initialize (later wins -- see #prepare_options).
    DEFAULT_OPTIONS = {
      :source      => { :type => :simple_request_stream, },
      :dest        => { :type => :flat_file_store, :filemode => 'w'},
      :fetcher     => { :type => :http_fetcher, },
      :log         => { :dest => nil, :iters => 100, :time => 30 },
      :skip        => nil,
      :sleep_time  => 0.5,
      :force_fetch => false,
    }

    #
    # Assembles a MonkeyshinesRunner from the given plan.
    #
    # options_hashes is a hash tree of options to build each particular
    # component. The options are deep merged with the class and global defaults.
    #
    # The options for each of :fetcher, :request_stream and :dest are passed to
    # the Fetcher, RequestStream and Store factories respectively
    #
    def initialize *options_hashes
      prepare_options(*options_hashes)
      setup_main_log
      self.source      = create_source
      self.fetcher     = create_fetcher
      self.dest        = create_dest
      self.sleep_time  = options[:sleep_time]
      self.force_fetch = options[:force_fetch]
    end

    # Build the request stream from the :source options.
    def create_source
      Monkeyshines::RequestStream.create(options[:source])
    end

    # Build the result store from the :dest options.
    def create_dest
      Monkeyshines::Store.create(options[:dest])
    end

    # Build the fetcher from the :fetcher options.
    def create_fetcher
      Monkeyshines::Fetcher.create(options[:fetcher])
    end

    #
    # Deep merges:
    # * the DEFAULT_OPTIONS in runner.rb,
    # * the global Monkeyshines::CONFIG loaded from disk
    # * the options passed in as arguments
    #
    # Options appearing later win out.
    #
    # (Hash.deep_sum is presumably added by monkeyshines/extensions.rb --
    # confirm its later-wins merge semantics there.)
    def prepare_options *options_hashes
      self.options = Hash.deep_sum(
        Monkeyshines::Runner::DEFAULT_OPTIONS,
        Monkeyshines::CONFIG,
        *options_hashes
      )
    end

    #
    # * For each entry in #source,
    # ** create scrape_request(s)
    # ** fetch request (...if appropriate)
    # ** store result (...if fetch was successful)
    # ** do logging
    #
    def run
      Log.info "Beginning scrape itself"
      before_scrape()
      each_request do |req|
        next unless req           # streams may yield nil entries; skip them
        before_fetch(req)
        fetch_and_store(req)
        after_fetch(req)
        sleep sleep_time          # throttle between fetches
        req
      end
      after_scrape()
    end

    #
    # before_scrape filter chain.
    #
    # Skips the first options[:skip] entries of the source, if requested.
    def before_scrape
      source.skip!(options[:skip].to_i) if options[:skip]
    end

    #
    # enumerates requests
    #
    def each_request &block
      source.each(&block)
    end

    #
    # before_scrape filter chain.
    #
    # Hook for subclasses; no-op here.
    def before_fetch req
    end

    #
    # Fetch and store result
    #
    # The block's value feeds dest.set; note that `return` inside the block
    # exits fetch_and_store itself, so unhealthy responses store nothing.
    #
    def fetch_and_store req
      # some stores (eg.conditional) only call fetcher if url key is missing.
      dest.set(req.url, force_fetch) do
        response = fetcher.get(req)       # do the url fetch
        return unless response.healthy?   # don't store bad fetches
        [response.scraped_at, response]   # timestamp for bookkeeper, result for dest
      end
    end

    #
    # after_fetch
    #
    # Emits a log line at most as often as the periodic logger allows.
    def after_fetch req
      periodic_log.periodically{ self.log_line(req) }
    end

    #
    # after_scrape
    #
    # Releases the store and fetcher resources.
    def after_scrape
      dest.close
      fetcher.close
    end

    #
    # Logging
    #
    # When :log => :dest is set, redirects the GLOBAL $stdout and $stderr to
    # "<WORK_DIR>/log/<dest>-console.log" (appending) -- this affects the
    # whole process, not just this runner. WORK_DIR is defined elsewhere.
    def setup_main_log
      unless options[:log][:dest].blank?
        log_file = "%s/log/%s" % [WORK_DIR, options[:log][:dest]]
        FileUtils.mkdir_p(File.dirname(log_file))
        $stdout = $stderr = File.open( log_file+"-console.log", "a" )
      end
    end

    # Memoized periodic logger built from the :log options.
    def periodic_log
      @periodic_log ||= Monkeyshines::Monitor::PeriodicLogger.new(options[:log])
    end

    # Flat array summarizing a result for the periodic log: the dest store's
    # own log fields plus [code, url, first 81 chars of contents], or dashes
    # when there is no result.
    def log_line result
      result_log_line = result.blank? ? ['-','-','-'] : [result.response_code, result.url, result.contents.to_s[0..80]]
      [ dest.log_line, result_log_line ].flatten
    end


  end
end
|