parallel588_polipus 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,282 @@
1
+ # encoding: UTF-8
2
+ require 'net/https'
3
+ require 'polipus/page'
4
+ require 'zlib'
5
+ require 'http/cookie'
6
+
7
+ module Polipus
8
+ class HTTP
9
+ # Maximum number of redirects to follow on each get_response
10
+ REDIRECT_LIMIT = 5
11
+ RESCUABLE_ERRORS = [
12
+ EOFError,
13
+ Errno::ECONNREFUSED,
14
+ Errno::ECONNRESET,
15
+ Errno::EHOSTUNREACH,
16
+ Errno::EINVAL,
17
+ Errno::EPIPE,
18
+ Errno::ETIMEDOUT,
19
+ Net::HTTPBadResponse,
20
+ Net::HTTPHeaderSyntaxError,
21
+ Net::ProtocolError,
22
+ SocketError,
23
+ Timeout::Error,
24
+ Zlib::DataError,
25
+ Zlib::GzipFile::Error
26
+ ]
27
+
28
+ def initialize(opts = {})
29
+ @connections = {}
30
+ @connections_hits = {}
31
+ @opts = opts
32
+ end
33
+
34
+ #
35
+ # Fetch a single Page from the response of an HTTP request to *url*.
36
+ # Just gets the final destination page.
37
+ #
38
+ def fetch_page(url, referer = nil, depth = nil)
39
+ fetch_pages(url, referer, depth).last
40
+ end
41
+
42
+ #
43
+ # Create new Pages from the response of an HTTP request to *url*,
44
+ # including redirects
45
+ #
46
+ def fetch_pages(url, referer = nil, depth = nil)
47
+ url = URI(url)
48
+ pages = []
49
+ get(url, referer) do |response, code, location, redirect_to, response_time|
50
+ handle_compression response
51
+ pages << Page.new(location, body: response.body,
52
+ code: code,
53
+ headers: response.to_hash,
54
+ referer: referer,
55
+ depth: depth,
56
+ redirect_to: redirect_to,
57
+ response_time: response_time,
58
+ fetched_at: Time.now.to_i)
59
+ end
60
+
61
+ pages
62
+ rescue *RESCUABLE_ERRORS => e
63
+ if verbose?
64
+ puts e.inspect
65
+ puts e.backtrace
66
+ end
67
+
68
+ [Page.new(url, error: e, referer: referer, depth: depth)]
69
+ end
70
+
71
+ #
72
+ # The maximum number of redirects to follow
73
+ #
74
+ def redirect_limit
75
+ @opts[:redirect_limit] || REDIRECT_LIMIT
76
+ end
77
+
78
+ #
79
+ # The user-agent string which will be sent with each request,
80
+ # or nil if no such option is set
81
+ #
82
+ def user_agent
83
+ if @opts[:user_agent].respond_to?(:sample)
84
+ @opts[:user_agent].sample
85
+ else
86
+ @opts[:user_agent]
87
+ end
88
+ end
89
+
90
+ #
91
+ # The proxy address string
92
+ #
93
+ def proxy_host
94
+ @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
95
+ end
96
+
97
+ #
98
+ # The proxy port
99
+ #
100
+ def proxy_port
101
+ @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
102
+ end
103
+
104
+ #
105
+ # The proxy username
106
+ #
107
+ def proxy_user
108
+ @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
109
+ end
110
+
111
+ #
112
+ # The proxy password
113
+ #
114
+ def proxy_pass
115
+ #return proxy_host_port[3] unless @opts[:proxy_host_port].nil?
116
+ @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
117
+ end
118
+
119
+ #
120
+ # Shorthand to get proxy info with a single call
121
+ # It returns an array of ['addr', port, 'user', 'pass']
122
+ #
123
+ def proxy_host_port
124
+ @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
125
+ end
126
+
127
+ #
128
+ # HTTP read timeout in seconds
129
+ #
130
+ def read_timeout
131
+ @opts[:read_timeout]
132
+ end
133
+
134
+ #
135
+ # HTTP open timeout in seconds
136
+ #
137
+ def open_timeout
138
+ @opts[:open_timeout]
139
+ end
140
+
141
+ # Does this HTTP client accept cookies from the server?
142
+ #
143
+ def accept_cookies?
144
+ @opts[:accept_cookies]
145
+ end
146
+
147
+ def cookie_jar
148
+ @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
149
+ @opts[:cookie_jar]
150
+ end
151
+
152
+ private
153
+
154
+ #
155
+ # Retrieve HTTP responses for *url*, including redirects.
156
+ # Yields the response object, response code, and URI location
157
+ # for each response.
158
+ #
159
+ def get(url, referer = nil)
160
+ limit = redirect_limit
161
+ loc = url
162
+ loop do
163
+ # if redirected to a relative url, merge it with the host of the original
164
+ # request url
165
+ loc = url.merge(loc) if loc.relative?
166
+
167
+ response, response_time = get_response(loc, referer)
168
+ code = Integer(response.code)
169
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
170
+ yield response, code, loc, redirect_to, response_time
171
+ limit -= 1
172
+ break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
173
+ end
174
+ end
175
+
176
+ #
177
+ # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
178
+ #
179
+ def get_response(url, referer = nil)
180
+ full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
181
+
182
+ opts = {}
183
+ opts['User-Agent'] = user_agent if user_agent
184
+ opts['Referer'] = referer.to_s if referer
185
+ opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
186
+ opts['Accept-Encoding'] = 'gzip,deflate'
187
+
188
+ retries = 0
189
+ begin
190
+ start = Time.now
191
+ # format request
192
+ req = Net::HTTP::Get.new(full_path, opts)
193
+ # HTTP Basic authentication
194
+ req.basic_auth url.user, url.password if url.user
195
+ if @opts[:http_user]
196
+ req.basic_auth @opts[:http_user], @opts[:http_password]
197
+ end
198
+ # urls auth schema has higher priority
199
+ req.basic_auth url.user, url.password if url.user
200
+ response = connection(url).request(req)
201
+ finish = Time.now
202
+ response_time = ((finish - start) * 1000).round
203
+ cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
204
+ return response, response_time
205
+ rescue *RESCUABLE_ERRORS => e
206
+ puts e.inspect if verbose?
207
+ refresh_connection(url)
208
+ retries += 1
209
+ if retries < 3
210
+ retry
211
+ else
212
+ raise e
213
+ end
214
+ end
215
+ end
216
+
217
+ def connection(url)
218
+ @connections[url.host] ||= {}
219
+ @connections_hits[url.host] ||= {}
220
+
221
+ if @connections[url.host][url.port]
222
+ if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
223
+ @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
224
+ return refresh_connection url
225
+ end
226
+ @connections_hits[url.host][url.port] += 1
227
+ return @connections[url.host][url.port]
228
+ end
229
+
230
+ refresh_connection url
231
+ end
232
+
233
+ def refresh_connection(url)
234
+ if @opts[:logger] && proxy_host && proxy_port
235
+ @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
236
+ end
237
+
238
+ # Block has higher priority
239
+ unless @opts[:proxy_host_port].nil?
240
+ p_host, p_port, p_user, p_pass = proxy_host_port
241
+ else
242
+ p_host = proxy_host
243
+ p_port = proxy_port
244
+ p_user = proxy_user
245
+ p_pass = proxy_pass
246
+ end
247
+
248
+ http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)
249
+
250
+ http.read_timeout = read_timeout if read_timeout
251
+ http.open_timeout = open_timeout if open_timeout
252
+
253
+ if url.scheme == 'https'
254
+ http.use_ssl = true
255
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
256
+ end
257
+ @connections_hits[url.host][url.port] = 1
258
+ @connections[url.host][url.port] = http.start
259
+ end
260
+
261
+ def verbose?
262
+ @opts[:verbose]
263
+ end
264
+
265
+ #
266
+ # Allowed to connect to the requested url?
267
+ #
268
+ def allowed?(to_url, from_url)
269
+ to_url.host.nil? || (to_url.host == from_url.host)
270
+ end
271
+
272
+ def handle_compression(response)
273
+ case response['content-encoding']
274
+ when 'gzip', 'x-gzip'
275
+ body_io = StringIO.new(response.body)
276
+ response.body.replace Zlib::GzipReader.new(body_io).read
277
+ when 'deflate'
278
+ response.body.replace Zlib::Inflate.inflate(response.body)
279
+ end
280
+ end
281
+ end
282
+ end
@@ -0,0 +1,256 @@
1
+ # encoding: UTF-8
2
+ require 'nokogiri'
3
+ require 'json'
4
+ require 'ostruct'
5
+ require 'set'
6
+ require 'kconv'
7
+
8
module Polipus
  class Page
    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct it holds users defined data
    attr_accessor :user_data

    attr_accessor :aliases

    attr_accessor :domain_aliases

    # Whether the current page should be stored
    # Default: true
    attr_accessor :storable

    attr_accessor :fetched_at

    #
    # Create a new page
    #
    # @param url [String, URI] page location
    # @param params [Hash] optional attributes: :code, :headers, :aka,
    #   :referer, :depth, :redirect_to, :response_time, :body, :error,
    #   :domain_aliases, :fetched_at
    #
    def initialize(url, params = {})
      @url = URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      # A page counts as fetched once it carries a response code.
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      # Use || (not ||=) so the caller's params hash is never mutated.
      @domain_aliases = params[:domain_aliases] || []
      @storable = true
      @fetched_at = params[:fetched_at]
    end

    #
    # Array of distinct A tag HREFs from the page
    # (only links satisfying #in_domain?; memoized on first call)
    #
    def links
      return @links.to_a unless @links.nil?
      @links = Set.new
      return [] unless doc

      doc.search('//a[@href]').each do |a|
        u = a['href']
        next if u.nil? || u.empty?
        abs = to_absolute(u) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body
    # (nil when there is no body, the page is not HTML, or parsing fails)
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? rescue nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # '+' must be escaped: an unescaped /xhtml+xml/ matches "xhtml…lxml",
      # never the literal media type "application/xhtml+xml".
      content_type =~ %r{^(text/html|application/xhtml\+xml)\b}
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300...400).include?(@code)
    end

    #
    # Returns +true+ if the page is a HTTP success, returns +false+
    # otherwise.
    #
    def success?
      (200..206).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
                href = doc.search('//head/base/@href')
                URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor; URI.encode/URI.decode were removed in Ruby 3.0,
      # so use the equivalent RFC 2396 parser methods instead.
      stripped = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')
      link = URI::DEFAULT_PARSER.escape(URI::DEFAULT_PARSER.unescape(stripped))

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    # Serializable representation; headers are Marshal-dumped so
    # .from_hash can restore them exactly.
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched,
        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
        'fetched_at' => @fetched_at,
        'error' => @error.to_s
      }
    end

    # JSON form of #to_hash with empty/nil values pruned.
    # Accepts (and ignores) generator arguments so it is safe when
    # invoked by JSON.generate on a containing structure.
    def to_json(*_args)
      th = to_hash.dup
      th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    #
    # Returns +true+ if page is marked as storeable
    # +false+ otherwise
    # Default is +true+
    #
    def storable?
      @storable
    end

    # True when the page was fetched before (now - ttl) seconds ago;
    # pages without a fetch timestamp never expire.
    def expired?(ttl)
      return false if fetched_at.nil?
      (Time.now.to_i - ttl) > fetched_at
    end

    # Rebuild a Page from a #to_hash-style hash by assigning its
    # instance variables directly (bypasses initialize defaults).
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      {
        '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
        '@body' => hash['body'],
        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
        '@code' => hash['code'].to_i,
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched'],
        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
        '@fetched_at' => hash['fetched_at'],
        '@error' => hash['error']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    # Rebuild a Page from a #to_json string.
    def self.from_json(json)
      hash = JSON.parse json
      from_hash hash
    end
  end
end