polipus 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
  metadata.gz: !binary |-
- Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
+ ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
  data.tar.gz: !binary |-
- NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
+ OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
  SHA512:
  metadata.gz: !binary |-
- YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
- MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
- MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
+ MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
+ NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
+ N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
  data.tar.gz: !binary |-
- ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
- MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
- NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
+ OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
+ OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
+ MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
data/AUTHORS.md ADDED
@@ -0,0 +1,4 @@
+ # Authors
+
+ * [Francesco Laurita](francesco.laurita@gmail.com)
+ * [Tobias L. Maier](http://tobiasmaier.info/)
data/CHANGELOG.md ADDED
@@ -0,0 +1,20 @@
+ # Changelog
+
+ ## 0.3.0 (2015-06-02)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+ * Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+   [#24](https://github.com/taganaka/polipus/pull/24)
+ * Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+   For example a connectivity error.
+   See `/examples/error_handling.rb`
+   [#15](https://github.com/taganaka/polipus/issues/15)
+ * Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+ * Polipus supports now `robots.txt` directives.
+   Set the option `:obey_robots_txt` to `true`.
+   See `/examples/robots_txt_handling.rb`
+   [#30](https://github.com/taganaka/polipus/pull/30)
+ * Add support for GZIP and deflate compressed HTTP requests
+   [#26](https://github.com/taganaka/polipus/pull/26)
+ * Minor improvements to code style
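Taken together, the new 0.3.0 hooks compose naturally. A minimal sketch (the job name and URL are illustrative only; the bundled examples further down show the canonical usage):

    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/', obey_robots_txt: true) do |crawler|
      # Re-queue pages that failed with a connectivity error (Page#error is set)
      crawler.on_page_error do |page|
        page.storable = false
        crawler.add_to_queue(page)
      end

      # Only process responses with a 2xx code, using the new Page#success?
      crawler.on_page_downloaded do |page|
        next unless page.success?
        puts page.url
      end
    end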
data/Gemfile CHANGED
@@ -1,3 +1,3 @@
- source "http://rubygems.org"
+ source 'https://rubygems.org'
 
- gemspec
+ gemspec
data/examples/error_handling.rb ADDED
@@ -0,0 +1,22 @@
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # Handle connectivity errors
+   # Only runs when there is an error
+   crawler.on_page_error do |page|
+     # Don't store the page
+     page.storable = false
+     # Add the URL again to the queue
+     crawler.add_to_queue(page)
+   end
+
+   # In-place page processing
+   # Runs also when there was an error in the page
+   crawler.on_page_downloaded do |page|
+     # Skip block if there is an error
+     return if page.error
+
+     # A nokogiri object
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/examples/robots_txt_handling.rb ADDED
@@ -0,0 +1,13 @@
+ require 'polipus'
+
+ options = {
+   user_agent: 'Googlebot', # Act as Google bot
+   obey_robots_txt: true # Follow /robots.txt rules if any
+ }
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+   crawler.on_page_downloaded do |page|
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/lib/polipus.rb CHANGED
@@ -8,6 +8,7 @@ require "polipus/storage"
  require "polipus/url_tracker"
  require "polipus/plugin"
  require "polipus/queue_overflow"
+ require "polipus/robotex"
  require "thread"
  require "logger"
  require "json"
@@ -62,6 +63,7 @@ module Polipus
  :stats_enabled => false,
  # Cookies strategy
  :cookie_jar => nil,
+ # whether or not accept cookies
  :accept_cookies => false,
  # A set of hosts that should be considered parts of the same domain
  # Eg It can be used to follow links with and without 'www' domain
@@ -69,7 +71,9 @@ module Polipus
  # Mark a connection as staled after connection_max_hits request
  :connection_max_hits => nil,
  # Page TTL: mark a page as expired after ttl_page seconds
- :ttl_page => nil
+ :ttl_page => nil,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false
  }
 
  attr_reader :storage
@@ -110,6 +114,7 @@ module Polipus
  @skip_links_like = []
  @on_page_downloaded = []
  @on_before_save = []
+ @on_page_error = []
  @focus_crawl_block = nil
  @on_crawl_end = []
  @redis_factory = nil
@@ -122,8 +127,8 @@ module Polipus
 
  @urls = [urls].flatten.map{ |url| URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }
-
  @internal_queue = queue_factory
+ @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
 
  execute_plugin 'on_initialize'
 
@@ -139,14 +144,10 @@ module Polipus
  PolipusSignalHandler.enable
  overflow_items_controller if queue_overflow_adapter
 
- q = queue_factory
  @urls.each do |u|
- page = Page.new(u.to_s, :referer => '')
- page.user_data.p_seeded = true
- q << page.to_json
+ add_url(u) { |page| page.user_data.p_seeded = true }
  end
-
- return if q.empty?
+ return if @internal_queue.empty?
 
  execute_plugin 'on_crawl_start'
  @options[:workers].times do |worker_number|
@@ -194,27 +195,28 @@ module Polipus
  page = pages.last
  end
 
- # Execute on_before_save blocks
- @on_before_save.each {|e| e.call(page)} unless page.nil?
  execute_plugin 'on_after_download'
 
- @logger.warn {"Page #{page.url} has error: #{page.error}"} if page.error
+ if page.error
+ @logger.warn {"Page #{page.url} has error: #{page.error}"}
+ incr_error
+ @on_page_error.each {|e| e.call(page)}
+ end
 
- incr_error if page.error
+ # Execute on_before_save blocks
+ @on_before_save.each {|e| e.call(page)}
 
- if page && page.storable?
+ if page.storable?
  @storage.add page
  end
 
- if page
- @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
- @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
- end
-
+ @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
  incr_pages
 
  # Execute on_page_downloaded blocks
- @on_page_downloaded.each {|e| e.call(page)} unless page.nil?
+ @on_page_downloaded.each {|e| e.call(page)}
 
  if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
  links_for(page).each do |url_to_visit|
@@ -264,6 +266,7 @@ module Polipus
  self
  end
 
+ # A block of code will be executed when crawl session is over
  def on_crawl_end(&block)
  @on_crawl_end << block
  self
@@ -276,6 +279,12 @@ module Polipus
  self
  end
 
+ # A block of code will be executed whether a page contains an error
+ def on_page_error(&block)
+ @on_page_error << block
+ self
+ end
+
  # A block of code will be executed
  # on every page downloaded. The code is used to extract urls to visit
  # see links_for method
@@ -313,9 +322,18 @@ module Polipus
  @redis ||= redis_factory_adapter
  end
 
+ def add_to_queue(page)
+ if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+ add_url(page.url, referer: page.referer, depth: page.depth)
+ else
+ add_url(page)
+ end
+ end
+
  # Enqueue an url, no matter what
- def add_url url
- page = Page.new(url)
+ def add_url(url, params = {})
+ page = Page.new(url, params)
+ yield(page) if block_given?
  @internal_queue << page.to_json
  end
 
@@ -329,7 +347,11 @@ module Polipus
  private
  # URLs enqueue policy
  def should_be_visited?(url, with_tracker = true)
+
  case
+ # robots.txt
+ when !allowed_by_robot?(url)
+ false
  # Check against whitelist pattern matching
  when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
  false
@@ -368,6 +390,17 @@ module Polipus
  @storage.exists?(page) && !page_expired?(page)
  end
 
+ #
+ # Returns +true+ if we are obeying robots.txt and the link
+ # is granted access in it. Always returns +true+ when we are
+ # not obeying robots.txt.
+ #
+ def allowed_by_robot?(link)
+ return true if @robots.nil?
+ @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+ end
+
+
  # The url is enqueued for a later visit
  def enqueue url_to_visit, current_page, queue
  page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
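The reworked queueing API above accepts params and an optional block; a rough usage sketch (the crawler variable, URL, failed_page, and the user_data field are illustrative only):

    # add_url now builds the Page from params and yields it before enqueueing
    crawler.add_url('http://rubygems.org/gems', referer: '') do |page|
      page.user_data.seeded_manually = true  # user_data is an OpenStruct
    end

    # add_to_queue re-enqueues an existing Page, keeping its referer and depth
    crawler.add_to_queue(failed_page)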
data/lib/polipus/http.rb CHANGED
@@ -7,6 +7,21 @@ module Polipus
  class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECT_LIMIT = 5
+ RESCUABLE_ERRORS = [
+ EOFError,
+ Errno::ECONNREFUSED,
+ Errno::ECONNRESET,
+ Errno::EHOSTUNREACH,
+ Errno::EINVAL,
+ Errno::EPIPE,
+ Errno::ETIMEDOUT,
+ Net::HTTPBadResponse,
+ Net::HTTPHeaderSyntaxError,
+ Net::ProtocolError,
+ SocketError,
+ Timeout::Error,
+ Zlib::DataError
+ ]
 
  def initialize(opts = {})
  @connections = {}
@@ -30,13 +45,8 @@ module Polipus
  url = URI(url)
  pages = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
- body = response.body.dup
- if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
- gzip = Zlib::GzipReader.new(StringIO.new(body))
- body = gzip.read
- end
-
- pages << Page.new(location, :body => body,
+ handle_compression response
+ pages << Page.new(location, :body => response.body,
  :code => code,
  :headers => response.to_hash,
  :referer => referer,
@@ -47,13 +57,13 @@
  end
 
  pages
- rescue StandardError => e
+ rescue *RESCUABLE_ERRORS => e
  if verbose?
  puts e.inspect
  puts e.backtrace
  end
 
- [Page.new(url, :error => e)]
+ [Page.new(url, error: e, referer: referer, depth: depth)]
  end
 
  #
@@ -154,7 +164,7 @@ module Polipus
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
- opts['Accept-Encoding'] = 'gzip'
+ opts['Accept-Encoding'] = 'gzip,deflate'
 
 
  retries = 0
@@ -169,8 +179,7 @@
  response_time = ((finish - start) * 1000).round
  cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
  return response, response_time
- rescue StandardError => e
-
+ rescue *RESCUABLE_ERRORS => e
  puts e.inspect if verbose?
  refresh_connection(url)
  retries += 1
@@ -229,8 +238,14 @@
  to_url.host.nil? || (to_url.host == from_url.host)
  end
 
- def gzip_enabled?
- @opts[:gzip_enabled]
+ def handle_compression response
+ case response["content-encoding"]
+ when "gzip", "x-gzip"
+ body_io = StringIO.new(response.body)
+ response.body.replace Zlib::GzipReader.new(body_io).read
+ when "deflate"
+ response.body.replace Zlib::Inflate.inflate(response.body)
+ end
  end
 
  end
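For reference, the decompression strategy used by handle_compression can be sketched in isolation; the helper name below is hypothetical and not part of the gem:

    require 'zlib'
    require 'stringio'

    # Decompress an HTTP body according to its Content-Encoding header value
    def decompress_body(body, encoding)
      case encoding
      when 'gzip', 'x-gzip'
        Zlib::GzipReader.new(StringIO.new(body)).read
      when 'deflate'
        Zlib::Inflate.inflate(body)
      else
        body
      end
    end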
data/lib/polipus/page.rb CHANGED
@@ -17,8 +17,7 @@ module Polipus
  attr_reader :error
  # Integer response code of the page
  attr_accessor :code
- # Depth of this page from the root of the crawl. This is not necessarily the
- # shortest path; use PageStore#shortest_paths! to find that value.
+ # Depth of this page from the root of the crawl.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
@@ -41,7 +40,7 @@
  # Create a new page
  #
  def initialize(url, params = {})
- @url = url.kind_of?(URI) ? url : URI(url)
+ @url = URI(url)
  @code = params[:code]
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ['']
@@ -130,6 +129,14 @@
  (300..307).include?(@code)
  end
 
+ #
+ # Returns +true+ if the page is a HTTP success, returns +false+
+ # otherwise.
+ #
+ def success?
+ (200..206).include?(@code)
+ end
+
  #
  # Returns +true+ if the page was not found (returned 404 code),
  # returns +false+ otherwise.
@@ -192,7 +199,8 @@
  'response_time' => @response_time,
  'fetched' => @fetched,
  'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
- 'fetched_at' => @fetched_at
+ 'fetched_at' => @fetched_at,
+ 'error' => @error
  }
  end
 
@@ -230,7 +238,8 @@
  '@response_time' => hash['response_time'].to_i,
  '@fetched' => hash['fetched'],
  '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
- '@fetched_at' => hash['fetched_at']
+ '@fetched_at' => hash['fetched_at'],
+ '@error' => hash['error']
  }.each do |var, value|
  page.instance_variable_set(var, value)
  end
@@ -242,4 +251,4 @@
  self.from_hash hash
  end
  end
- end
+ end
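With 'error' now included in the serialized hash, an error should survive a round trip through Page#to_hash and Page.from_hash. A small sketch, assuming from_hash accepts the hash produced by to_hash as the surrounding code suggests (URL and error string are illustrative):

    page = Polipus::Page.new('http://rubygems.org/', error: 'connection refused')
    restored = Polipus::Page.from_hash(page.to_hash)
    restored.error  # => "connection refused"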
data/lib/polipus/robotex.rb ADDED
@@ -0,0 +1,154 @@
+ require 'open-uri'
+ require 'uri'
+ require 'timeout'
+ module Polipus
+
+   # Original code taken from
+   # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+   class Robotex
+
+     DEFAULT_TIMEOUT = 3
+     VERSION = '1.0.0'
+
+     attr_reader :user_agent
+
+     class ParsedRobots
+
+       def initialize(uri, user_agent)
+         io = Robotex.get_robots_txt(uri, user_agent)
+         if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+           io = StringIO.new("User-agent: *\nAllow: /\n")
+         end
+
+         @disallows = {}
+         @allows = {}
+         @delays = {}
+         agent = /.*/
+         io.each do |line|
+           next if line =~ /^\s*(#.*|$)/
+           arr = line.split(":")
+           key = arr.shift
+           value = arr.join(":").strip
+           value.strip!
+           case key.downcase
+           when "user-agent"
+             agent = to_regex(value)
+           when "allow"
+             unless value.empty?
+               @allows[agent] ||= []
+               @allows[agent] << to_regex(value)
+             end
+           when "disallow"
+             unless value.empty?
+               @disallows[agent] ||= []
+               @disallows[agent] << to_regex(value)
+             end
+           when "crawl-delay"
+             @delays[agent] = value.to_i
+           end
+         end
+         @parsed = true
+       end
+
+       def allowed?(uri, user_agent)
+         return true unless @parsed
+         allowed = true
+         uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+         path = uri.request_uri
+
+         @allows.each do |key, value|
+           unless allowed
+             if user_agent =~ key
+               value.each do |rule|
+                 if path =~ rule
+                   allowed = true
+                 end
+               end
+             end
+           end
+         end
+
+         @disallows.each do |key, value|
+           if user_agent =~ key
+             value.each do |rule|
+               if path =~ rule
+                 allowed = false
+               end
+             end
+           end
+         end
+
+         return allowed
+       end
+
+       def delay(user_agent)
+         @delays.each do |agent, delay|
+           return delay if agent =~ user_agent
+         end
+         nil
+       end
+
+       protected
+
+       def to_regex(pattern)
+         pattern = Regexp.escape(pattern)
+         pattern.gsub!(Regexp.escape("*"), ".*")
+         Regexp.compile("^#{pattern}")
+       end
+     end
+
+     def self.get_robots_txt(uri, user_agent)
+       begin
+         Timeout::timeout(Robotex.timeout) do
+           URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+         end
+       rescue Timeout::Error
+         STDERR.puts "robots.txt request timed out"
+       end
+     end
+
+     def self.timeout=(t)
+       @timeout = t
+     end
+
+     def self.timeout
+       @timeout || DEFAULT_TIMEOUT
+     end
+
+     def initialize(user_agent = nil)
+       user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+       @user_agent = user_agent
+       @last_accessed = Time.at(1)
+       @parsed = {}
+     end
+
+     def parse_host(uri)
+       uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+       @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+     end
+
+     #
+     # Download the server's robots.txt, and return try if we are allowed to acces the url, false otherwise
+     #
+     def allowed?(uri)
+       parse_host(uri).allowed?(uri, @user_agent)
+     end
+
+     #
+     # Return the value of the Crawl-Delay directive, or nil if none
+     def delay(uri)
+       parse_host(uri).delay(@user_agent)
+     end
+
+     #
+     # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+     #
+     def delay!(uri)
+       delay = delay(uri)
+       sleep delay - (Time.now - @last_accessed) if !!delay
+       @last_accessed = Time.now
+     end
+
+   end
+ end
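The vendored Robotex class can also be used on its own; a brief sketch (URL and agent string are illustrative):

    robots = Polipus::Robotex.new('MyCrawler/1.0')
    url = 'http://rubygems.org/gems'
    if robots.allowed?(url)
      robots.delay!(url)  # sleep to honor Crawl-Delay, if the site sets one
      # ... fetch the page
    end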