upton 0.2.7 → 0.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
- data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
+ metadata.gz: 0bc8fddf34dc974bde7491e7dd311eb09b5d393e
+ data.tar.gz: b8a8010408cd715b010406163cd14e45045af2d6
  SHA512:
- metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
- data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
+ metadata.gz: 0c5cdda936dcaf7a045afbc6cb317fc463191823a13d585732717f6ddfb3d4970a94c51df0324a343022c938702ca8b0fdbbf9e8b54fb0cc5fafec1dd8af8276
+ data.tar.gz: e5f2bd0c9f9ba843607b0ac7816c84df21cc6acbb0de13ec5918e3edb866fa41d7e6e9b39d4d0af7ea74c0ebf4628240ee783612edc3370842f860039ccc6465
lib/upton.rb CHANGED
@@ -3,55 +3,56 @@
  require 'nokogiri'
  require 'uri'
  require 'restclient'
- require_relative './utils'
+ require_relative 'upton/utils'
+ require_relative 'upton/downloader'
 
  ##
  # This module contains a scraper called Upton
  ##
  module Upton
  ##
- # *Upton* is a framework for easy web-scraping with a useful debug mode
- # that doesn't hammer your target's servers. It does the repetitive parts of
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
+ # that doesn't hammer your target's servers. It does the repetitive parts of
  # writing scrapers, so you only have to write the unique parts for each site.
  #
  # Upton operates on the theory that, for most scraping projects, you need to
  # scrape two types of pages:
- #
- # 1. Index pages, which list instance pages. For example, a job search
+ #
+ # 1. Index pages, which list instance pages. For example, a job search
  # site's search page or a newspaper's homepage.
  # 2. Instance pages, which represent the goal of your scraping, e.g.
  # job listings or news articles.
  #
  # Upton::Scraper can be used as-is for basic use-cases by:
- # 1. specifying the pages to be scraped in `new` as an index page
+ # 1. specifying the pages to be scraped in `new` as an index page
  # or as an Array of URLs.
- # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
  # block from Upton::Utils.
- # For more complicated cases; subclass Upton::Scraper
+ # For more complicated cases; subclass Upton::Scraper
  # e.g. +MyScraper < Upton::Scraper+ and override various methods.
  ##
  class Scraper
+ EMPTY_STRING = ''
 
- attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
+ attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+ :paginated, :pagination_param, :pagination_max_pages
 
  ##
  # This is the main user-facing method for a basic scraper.
- # Call +scrape+ with a block; this block will be called on
+ # Call +scrape+ with a block; this block will be called on
  # the text of each instance page, (and optionally, its URL and its index
  # in the list of instance URLs returned by +get_index+).
  ##
- def scrape &blk
- unless self.url_array
- self.url_array = self.get_index
- end
+ def scrape(&blk)
+ self.url_array = self.get_index unless self.url_array
  self.scrape_from_list(self.url_array, blk)
  end
 
  ##
  # +index_url_or_array+: A list of string URLs, OR
  # the URL of the page containing the list of instances.
- # +selector+: The XPath expression or CSS selector that specifies the
- # anchor elements within the page, if a url is specified for
+ # +selector+: The XPath expression or CSS selector that specifies the
+ # anchor elements within the page, if a url is specified for
  # the previous argument.
  # +selector_method+: Deprecated and ignored. Next breaking release will
  # remove this option.x
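For context, here is a minimal sketch of the basic usage the class comment above describes. The URLs and the selector are illustrative only, not part of the gem:

    require 'upton'
    require 'nokogiri'

    # Either point the scraper at an index page plus a CSS/XPath selector for instance links...
    scraper = Upton::Scraper.new("http://example.com/jobs", "a.job-link")
    # ...or hand it the instance URLs directly as an Array.
    # scraper = Upton::Scraper.new(["http://example.com/jobs/1", "http://example.com/jobs/2"])

    # The block receives each instance page's HTML (and, optionally, its URL and index).
    titles = scraper.scrape do |html, url, index|
      Nokogiri::HTML(html).css("h1").text
    end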
@@ -68,7 +69,7 @@ module Upton
  # the String passed is of CSS/XPath notation
 
  def initialize(index_url_or_array, selector="", selector_method=:deprecated)
-
+
  #if first arg is a valid URL, do already-written stuff;
  #if it's not (or if it's a list?) don't bother with get_index, etc.
  #e.g. Scraper.new(["http://jeremybmerrill.com"])
@@ -80,6 +81,7 @@ module Upton
  @index_url = index_url_or_array
  @index_selector = selector
  end
+
  # If true, then Upton prints information about when it gets
  # files from the internet and when it gets them from its stash.
  @verbose = false
@@ -89,26 +91,32 @@ module Upton
  # version.
  # You may want to set @debug to false for production (but maybe not).
  # You can also control stashing behavior on a per-call basis with the
- # optional second argument to get_page, if, for instance, you want to
+ # optional second argument to get_page, if, for instance, you want to
  # stash certain instance pages, e.g. based on their modification date.
  @debug = true
  # Index debug does the same, but for index pages.
  @index_debug = false
 
- # In order to not hammer servers, Upton waits for, by default, 30
+ # In order to not hammer servers, Upton waits for, by default, 30
  # seconds between requests to the remote server.
  @sleep_time_between_requests = 30 #seconds
 
+ # If true, then Upton will attempt to scrape paginated index pages
+ @paginated = false
+ # Default query string parameter used to specify the current page
+ @pagination_param = 'page'
+ # Default number of paginated pages to scrape
+ @pagination_max_pages = 2
+
+
  # Folder name for stashes, if you want them to be stored somewhere else,
  # e.g. under /tmp.
  @stash_folder ||= "stashes"
- unless Dir.exists?(@stash_folder)
- FileUtils.mkdir_p(@stash_folder)
- end
+ FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
  end
 
  ##
- # If instance pages are paginated, <b>you must override</b>
+ # If instance pages are paginated, <b>you must override</b>
  # this method to return the next URL, given the current URL and its index.
  #
  # If instance pages aren't paginated, there's no need to override this.
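The defaults set in the initializer above are all exposed as public accessors. A hypothetical tuning of a scraper instance might look like this (the URL, selector, and values are examples only):

    scraper = Upton::Scraper.new("http://example.com/listings", "a.listing")
    scraper.verbose = true                    # log fetches and stash hits
    scraper.debug = true                      # stash instance pages (the default)
    scraper.index_debug = true                # stash index pages as well
    scraper.sleep_time_between_requests = 5   # seconds between live requests (default 30)
    scraper.stash_folder = "stashes"          # where stashed pages are kept
    scraper.paginated = true                  # new in 0.2.8: walk paginated index pages
    scraper.pagination_param = "p"            # query parameter to increment (default "page")
    scraper.pagination_max_pages = 3          # stop after this many index pages (default 2)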
@@ -119,22 +127,42 @@ module Upton
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
  ##
  def next_instance_page_url(url, pagination_index)
- ""
+ EMPTY_STRING
  end
 
  ##
- # If index pages are paginated, <b>you must override</b>
- # this method to return the next URL, given the current URL and its index.
- #
- # If index pages aren't paginated, there's no need to override this.
+ # Return the next URL to scrape, given the current URL and its index.
  #
  # Recursion stops if the fetching URL returns an empty string or an error.
  #
- # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
+ # If @paginated is not set (the default), this method returns an empty string.
+ #
+ # If @paginated is set, this method will return the next pagination URL
+ # to scrape using @pagination_param and the pagination_index.
+ #
+ # If the pagination_index is greater than @pagination_max_pages, then the
+ # method will return an empty string.
+ #
+ # Override this method to handle pagination is an alternative way
+ # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
  # ought to return "http://whatever.com/articles?page=2"
+ #
  ##
  def next_index_page_url(url, pagination_index)
- ""
+ return EMPTY_STRING unless @paginated
+
+ if pagination_index > @pagination_max_pages
+ puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+ EMPTY_STRING
+ else
+ uri = URI.parse(url)
+ query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+ # update the pagination query string parameter
+ query[@pagination_param] = pagination_index
+ uri.query = URI.encode_www_form(query)
+ puts "Next index pagination url is #{uri}" if @verbose
+ uri.to_s
+ end
  end
 
  ##
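To make the new behavior concrete, here is a sketch of what next_index_page_url returns once pagination is enabled; the URLs are made up:

    scraper = Upton::Scraper.new("http://example.com/articles?page=1", "a.article-link")
    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => ""  (pagination is off by default)

    scraper.paginated = true
    scraper.pagination_max_pages = 5
    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => "http://example.com/articles?page=2"
    scraper.next_index_page_url("http://example.com/articles?page=5", 6)
    # => ""  (past @pagination_max_pages)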
@@ -142,13 +170,10 @@ module Upton
  ##
  def scrape_to_csv filename, &blk
  require 'csv'
- unless self.url_array
- self.url_array = self.get_index
- end
+ self.url_array = self.get_index unless self.url_array
  CSV.open filename, 'wb' do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- puts document.inspect
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
@@ -161,13 +186,10 @@ module Upton
 
  def scrape_to_tsv filename, &blk
  require 'csv'
- unless self.url_array
- self.url_array = self.get_index
- end
+ self.url_array = self.get_index unless self.url_array
  CSV.open filename, 'wb', :col_sep => "\t" do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- puts document.inspect
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
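As a minimal sketch of the CSV path (the URL, selector, and filename are illustrative), each Array returned from the block becomes one row of output:

    scraper = Upton::Scraper.new("http://example.com/articles", "a.article-link")
    scraper.scrape_to_csv("articles.csv") do |html, url, index|
      doc = Nokogiri::HTML(html)
      [url, doc.css("h1").text]   # one Array per instance page becomes one CSV row
    end
    # scrape_to_tsv works the same way, writing tab-separated output instead.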
@@ -181,70 +203,20 @@ module Upton
  protected
 
  ##
- # Actually fetches the page
- ##
- def fetch_page(url, options={})
- RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
- end
-
- ##
- # Handles getting pages with RestClient or getting them from the local stash.
- #
- # Uses a kludge (because rest-client is outdated) to handle encoding.
+ # Handles getting pages with Downlader, which handles stashing.
  ##
  def get_page(url, stash=false, options={})
- return "" if url.empty?
-
- #the filename for each stashed version is a cleaned version of the URL.
- if stash && File.exists?( url_to_filename(url, options) )
- puts "usin' a stashed copy of " + url if @verbose
- resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
- else
- begin
- puts "getting " + url if @verbose
- sleep @sleep_time_between_requests
- resp = fetch_page(url, options)
-
- #this is silly, but rest-client needs to get on their game.
- #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
- if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
- charset = if set = resp.net_http_res.type_params['charset']
- set
- elsif content_type == 'text/xml'
- 'us-ascii'
- elsif content_type.split('/').first == 'text'
- 'iso-8859-1'
- end
- resp.force_encoding(charset) if charset
- end
-
- rescue RestClient::ResourceNotFound
- puts "404 error, skipping: #{url}" if @verbose
- resp = ""
- rescue RestClient::InternalServerError
- puts "500 Error, skipping: #{url}" if @verbose
- resp = ""
- rescue URI::InvalidURIError
- puts "Invalid URI: #{url}" if @verbose
- resp = ""
- rescue RestClient::RequestTimeout
- "Timeout: #{url}" if @verbose
- retry
- end
- if stash
- puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
- open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
- end
+ return EMPTY_STRING if url.empty?
+ resp_and_cache = Downloader.new(url, {:cache => stash, :verbose => @verbose}.merge(options)).get
+ if resp_and_cache[:from_resource]
+ puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+ sleep @sleep_time_between_requests
  end
- resp
+ resp_and_cache[:resp]
  end
 
- def url_to_filename(url, options={})
- File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
- end
 
-
- ##
+ ##
  # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
  # resolve_url resolves them to absolute urls.
  # absolute_url_str must be a URL, as a string, that is absolute.
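The new get_page above delegates fetching and stashing to the Downloader class added in this release. A rough sketch of the contract it relies on (the URL is illustrative):

    result = Upton::Downloader.new("http://example.com/a.html", :cache => true, :verbose => true).get
    result[:resp]           # the page body, or "" after a 404, 500, or invalid URI
    result[:from_resource]  # true only when stashing is disabled and the page was fetched live,
                            # which is the only case where get_page sleeps between requests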
@@ -258,7 +230,7 @@ module Upton
  return href.to_s if href.absolute?
 
  #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
- URI.join(absolute_url, href).to_s
+ URI.join(absolute_url, href).to_s
  end
 
  ##
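The resolution above is plain URI.join against the index URL; for example (an assumed index URL):

    URI.join("http://example.com/listings/index.html", "item-42.html").to_s
    # => "http://example.com/listings/item-42.html"
    URI.join("http://example.com/listings/index.html", "http://other.example.org/x").to_s
    # => "http://other.example.org/x"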
@@ -272,7 +244,7 @@ module Upton
  end
 
  ##
- # Using the XPath expression or CSS selector and selector_method that
+ # Using the XPath expression or CSS selector and selector_method that
  # uniquely identifies the links in the index, return those links as strings. ##
  def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
  # for now, override selector_method with :search, which will work with either CSS or XPath
@@ -285,20 +257,18 @@ module Upton
  # Does @index_url stay unaltered for the lifetime of the Upton instance?
  # It seems to at this point, but that may be something that gets
  # deprecated later
- #
- # So for now, @index_url is used in conjunction with resolve_url
+ #
+ # So for now, @index_url is used in conjunction with resolve_url
  # to make sure that this method returns absolute urls
  # i.e. this method expects @index_url to always have an absolute address
  # for the lifetime of an Upton instance
  def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
  # for now, override selector_method with :search, which will work with either CSS or XPath
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+ Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
  href = a_element["href"]
- u = resolve_url( href, @index_url) unless href.nil?
- unless u == href
- puts "resolved #{href} to #{u}"
- end
- u
+ resolved_url = resolve_url( href, @index_url) unless href.nil?
+ puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+ resolved_url
  end
  end
 
@@ -309,13 +279,13 @@ module Upton
  ##
  def get_index_pages(url, pagination_index, options={})
  resp = self.get_page(url, @index_debug, options)
- if !resp.empty?
+ unless resp.empty?
  next_url = self.next_index_page_url(url, pagination_index + 1)
  # resolve to absolute url
  #
  next_url = resolve_url(next_url, url)
  unless next_url == url
- next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
+ next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
  resp += next_resp
  end
  end
@@ -324,20 +294,20 @@ module Upton
 
  ##
  # Returns the instance at `url`.
- #
+ #
  # If the page is stashed, returns that, otherwise, fetches it from the web.
  #
- # If an instance is paginated, returns the concatenated output of each
+ # If an instance is paginated, returns the concatenated output of each
  # page, e.g. if a news article has two pages.
  ##
  def get_instance(url, pagination_index=0, options={})
  resp = self.get_page(url, @debug, options)
- if !resp.empty?
+ if !resp.empty?
  next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-
- # next_url = resolve_url(next_url, url)
+
+ #next_url = resolve_url(next_url, url)
  unless next_url == url
- next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
+ next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
  resp += next_resp
  end
  end
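For paginated instance pages, the doc comments above say to override next_instance_page_url in a subclass. A hypothetical subclass (the site's ?page= scheme is assumed, not part of the gem) might look like:

    class ArticleScraper < Upton::Scraper
      # Fetch up to three pages of each article by rewriting a ?page=N parameter.
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 3        # empty string stops the recursion
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end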
lib/upton/downloader.rb ADDED
@@ -0,0 +1,126 @@
+ require "fileutils"
+ require "open-uri"
+ require "tmpdir"
+ require "restclient"
+
+ module Upton
+
+ # This class is used internally to download and cache the webpages
+ # that are requested.
+ #
+ # By default, the cache location is the output of `Dir.tmpdir`/upton.
+ # The Dir.tmpdir returns the temporary directory of the operating system.
+ # By default, the stashed files have a non-human-readable md5-based filename.
+ # If `readable_stash_filenames` is true, they will have human-readable names.
+ class Downloader
+
+ MAX_FILENAME_LENGTH = 130 #for unixes, win xp+
+ EMPTY_STRING = ''
+
+ attr_reader :uri, :cache_location, :verbose
+ def initialize(uri, options = {})
+ @uri = uri
+ @cache = options.fetch(:cache) { true }
+ @cache_location = File.absolute_path(options[:cache_location] || "#{Dir.tmpdir}/upton")
+ @verbose = options[:verbose] || false
+ @readable_stash_filenames = options[:readable_filenames] || false
+ initialize_cache!
+ end
+
+ def get
+ if cache_enabled?
+ puts "Stashing enabled. Will try reading #{uri} data from cache." if @verbose
+ download_from_cache!
+ else
+ puts "Stashing disabled. Will download from the internet." if @verbose
+ from_resource = true
+ resp = download_from_resource!
+ {:resp => resp, :from_resource => from_resource }
+ end
+ end
+
+ private
+
+ def download_from_resource!
+ begin
+ puts "Downloading from #{uri}" if @verbose
+ resp = RestClient.get(uri)
+ puts "Downloaded #{uri}" if @verbose
+ rescue RestClient::ResourceNotFound
+ puts "404 error, skipping: #{uri}" if @verbose
+ rescue RestClient::InternalServerError
+ puts "500 Error, skipping: #{uri}" if @verbose
+ rescue URI::InvalidURIError
+ puts "Invalid URI: #{uri}" if @verbose
+ rescue RestClient::RequestTimeout
+ puts "Timeout: #{uri}" if @verbose
+ retry
+ end
+ resp ||= EMPTY_STRING
+ end
+
+ def download_from_cache!
+ resp = if cached_file_exists?
+ puts "Cache of #{uri} available" if @verbose
+ from_resource = false
+ open(cached_file).read
+ else
+ if @verbose
+ if @readable_stash_filenames
+ puts "Cache of #{uri} unavailable at #{filename_from_uri}. Will download from the internet"
+ else
+ puts "Cache of #{uri} unavailable. Will download from the internet"
+ end
+ end
+ from_resource = false
+ download_from_resource!
+ end
+ unless cached_file_exists?
+ if @verbose
+ if @readable_stash_filenames
+ puts "Writing #{uri} data to the cache at #{cached_file}"
+ else
+ puts "Writing #{uri} data to the cache"
+ end
+ end
+ File.write(cached_file, resp)
+ end
+ {:resp => resp, :from_resource => from_resource }
+ end
+
+ def cache_enabled?
+ !!@cache
+ end
+
+ def filename_from_uri
+ @readable_stash_filenames ? readable_filename_from_uri : hashed_filename_from_uri
+ end
+
+ def hashed_filename_from_uri
+ Digest::MD5.hexdigest(uri)
+ end
+
+ def readable_filename_from_uri
+ html = "html"
+ clean_url_max_length = MAX_FILENAME_LENGTH - html.length - cache_location.size
+ clean_url = uri.gsub(/[^A-Za-z0-9\-_]/, "")[0...clean_url_max_length]
+ "#{clean_url}.#{html}"
+ end
+
+ def cached_file
+ "#{cache_location}/#{filename_from_uri}"
+ end
+
+ def cached_file_exists?
+ File.exists?(cached_file)
+ end
+
+ def initialize_cache!
+ unless Dir.exists?(cache_location)
+ Dir.mkdir(cache_location)
+ FileUtils.chmod 0700, cache_location
+ end
+ end
+
+ end
+ end
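Although Scraper drives it internally through get_page, the class can also be exercised directly. A sketch using the options read in initialize above (the URL and cache path are examples only):

    downloader = Upton::Downloader.new("http://example.com/articles?page=1",
                                       :cache => true,                    # the default
                                       :cache_location => "/tmp/my-upton-cache",
                                       :readable_filenames => true,       # human-readable stash names
                                       :verbose => true)
    page = downloader.get
    page[:resp]           # the HTML, read from the cache when a stashed copy exists
    page[:from_resource]  # false here, since caching is enabled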