upton 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
-  data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
+  metadata.gz: 0bc8fddf34dc974bde7491e7dd311eb09b5d393e
+  data.tar.gz: b8a8010408cd715b010406163cd14e45045af2d6
 SHA512:
-  metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
-  data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
+  metadata.gz: 0c5cdda936dcaf7a045afbc6cb317fc463191823a13d585732717f6ddfb3d4970a94c51df0324a343022c938702ca8b0fdbbf9e8b54fb0cc5fafec1dd8af8276
+  data.tar.gz: e5f2bd0c9f9ba843607b0ac7816c84df21cc6acbb0de13ec5918e3edb866fa41d7e6e9b39d4d0af7ea74c0ebf4628240ee783612edc3370842f860039ccc6465
lib/upton.rb CHANGED
@@ -3,55 +3,56 @@
 require 'nokogiri'
 require 'uri'
 require 'restclient'
-require_relative './utils'
+require_relative 'upton/utils'
+require_relative 'upton/downloader'
 
 ##
 # This module contains a scraper called Upton
 ##
 module Upton
   ##
-  # *Upton* is a framework for easy web-scraping with a useful debug mode
-  # that doesn't hammer your target's servers. It does the repetitive parts of
+  # *Upton* is a framework for easy web-scraping with a useful debug mode
+  # that doesn't hammer your target's servers. It does the repetitive parts of
   # writing scrapers, so you only have to write the unique parts for each site.
   #
   # Upton operates on the theory that, for most scraping projects, you need to
   # scrape two types of pages:
-  #
-  # 1. Index pages, which list instance pages. For example, a job search
+  #
+  # 1. Index pages, which list instance pages. For example, a job search
   #    site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   #    job listings or news articles.
   #
   # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
+  # 1. specifying the pages to be scraped in `new` as an index page
   #    or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
   #    block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
+  # For more complicated cases, subclass Upton::Scraper,
   # e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
   class Scraper
+    EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages
 
     ##
     # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
+    # Call +scrape+ with a block; this block will be called on
     # the text of each instance page (and optionally, its URL and its index
     # in the list of instance URLs returned by +get_index+).
     ##
-    def scrape &blk
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
       self.scrape_from_list(self.url_array, blk)
     end
 
     ##
     # +index_url_or_array+: A list of string URLs, OR
     #   the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    #   anchor elements within the page, if a url is specified for
+    # +selector+: The XPath expression or CSS selector that specifies the
+    #   anchor elements within the page, if a url is specified for
     #   the previous argument.
     # +selector_method+: Deprecated and ignored. Next breaking release will
     #   remove this option.
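For orientation, here is a minimal sketch of the basic use-case the comments above describe; the index URL and CSS selector are illustrative, not taken from this diff:

    require 'upton'
    require 'nokogiri'

    # An index page listing links to articles; the selector picks the anchors.
    scraper = Upton::Scraper.new("http://www.example.com/news", "a.headline")

    # The block receives each instance page's HTML (and optionally its URL
    # and index); #scrape collects the block's return values.
    headlines = scraper.scrape do |html, url, index|
      Nokogiri::HTML(html).css("h1").text
    end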
@@ -68,7 +69,7 @@ module Upton
     # the String passed is of CSS/XPath notation
 
     def initialize(index_url_or_array, selector="", selector_method=:deprecated)
-
+
       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
       #e.g. Scraper.new(["http://jeremybmerrill.com"])
@@ -80,6 +81,7 @@ module Upton
         @index_url = index_url_or_array
         @index_selector = selector
       end
+
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
       @verbose = false
@@ -89,26 +91,32 @@ module Upton
       # version.
       # You may want to set @debug to false for production (but maybe not).
       # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
+      # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
       @debug = true
       # Index debug does the same, but for index pages.
       @index_debug = false
 
-      # In order to not hammer servers, Upton waits for, by default, 30
+      # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
       @sleep_time_between_requests = 30 #seconds
 
+      # If true, then Upton will attempt to scrape paginated index pages
+      @paginated = false
+      # Default query string parameter used to specify the current page
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape
+      @pagination_max_pages = 2
+
+
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
       @stash_folder ||= "stashes"
-      unless Dir.exists?(@stash_folder)
-        FileUtils.mkdir_p(@stash_folder)
-      end
+      FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
     end
 
     ##
-    # If instance pages are paginated, <b>you must override</b>
+    # If instance pages are paginated, <b>you must override</b>
     # this method to return the next URL, given the current URL and its index.
     #
     # If instance pages aren't paginated, there's no need to override this.
@@ -119,22 +127,42 @@ module Upton
     # e.g. +next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)+
     # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
     ##
     def next_instance_page_url(url, pagination_index)
-      ""
+      EMPTY_STRING
     end
 
     ##
-    # If index pages are paginated, <b>you must override</b>
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If index pages aren't paginated, there's no need to override this.
+    # Return the next URL to scrape, given the current URL and its index.
     #
     # Recursion stops if the fetching URL returns an empty string or an error.
     #
-    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination in an alternative way,
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
     # ought to return "http://whatever.com/articles?page=2"
+    #
     ##
     def next_index_page_url(url, pagination_index)
-      ""
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
     end
 
     ##
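A sketch of how the new built-in index pagination might be configured, using the accessors and defaults introduced above (the index URL and selector are hypothetical):

    scraper = Upton::Scraper.new("http://whatever.com/articles", "a.article-link")
    scraper.paginated = true            # defaults to false
    scraper.pagination_param = 'page'   # the default query-string parameter
    scraper.pagination_max_pages = 5    # the default is 2

    # With @paginated set, next_index_page_url("http://whatever.com/articles?page=1", 2)
    # returns "http://whatever.com/articles?page=2"; past page 5 it returns "".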
@@ -142,13 +170,10 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
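Per the branching above, a block that returns a list of rows writes one CSV row per inner list, while a flat list becomes a single row. A usage sketch (the filename, selector, and fields are illustrative):

    scraper.scrape_to_csv("output.csv") do |html, url, index|
      # One row per listing on the page: [[title, url], [title, url], ...]
      Nokogiri::HTML(html).css("h2.listing").map { |h| [h.text, url] }
    end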
@@ -161,13 +186,10 @@ module Upton
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
@@ -181,70 +203,20 @@ module Upton
     protected
 
     ##
-    # Actually fetches the page
-    ##
-    def fetch_page(url, options={})
-      RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
-    end
-
-    ##
-    # Handles getting pages with RestClient or getting them from the local stash.
-    #
-    # Uses a kludge (because rest-client is outdated) to handle encoding.
+    # Handles getting pages with Downloader, which handles stashing.
     ##
     def get_page(url, stash=false, options={})
-      return "" if url.empty?
-
-      #the filename for each stashed version is a cleaned version of the URL.
-      if stash && File.exists?( url_to_filename(url, options) )
-        puts "usin' a stashed copy of " + url if @verbose
-        resp = open( url_to_filename(url, options), 'r:UTF-8').read.encode("UTF-8", :invalid => :replace, :undef => :replace )
-      else
-        begin
-          puts "getting " + url if @verbose
-          sleep @sleep_time_between_requests
-          resp = fetch_page(url, options)
-
-          #this is silly, but rest-client needs to get on their game.
-          #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
-          if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
-            charset = if set = resp.net_http_res.type_params['charset']
-              set
-            elsif content_type == 'text/xml'
-              'us-ascii'
-            elsif content_type.split('/').first == 'text'
-              'iso-8859-1'
-            end
-            resp.force_encoding(charset) if charset
-          end
-
-        rescue RestClient::ResourceNotFound
-          puts "404 error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::InternalServerError
-          puts "500 Error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue URI::InvalidURIError
-          puts "Invalid URI: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::RequestTimeout
-          "Timeout: #{url}" if @verbose
-          retry
-        end
-        if stash
-          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
-          open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
-        end
+      return EMPTY_STRING if url.empty?
+      resp_and_cache = Downloader.new(url, {:cache => stash, :verbose => @verbose}.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
       end
-      resp
+      resp_and_cache[:resp]
     end
 
-    def url_to_filename(url, options={})
-      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
-    end
 
-
-    ##
+    ##
     # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
     # resolve_url resolves them to absolute urls.
     # absolute_url_str must be a URL, as a string, that is absolute.
@@ -258,7 +230,7 @@ module Upton
       return href.to_s if href.absolute?
 
       #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url, href).to_s
+      URI.join(absolute_url, href).to_s
     end
 
     ##
@@ -272,7 +244,7 @@ module Upton
     end
 
     ##
-    # Using the XPath expression or CSS selector and selector_method that
+    # Using the XPath expression or CSS selector and selector_method that
     # uniquely identifies the links in the index, return those links as strings. ##
     def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
@@ -285,20 +257,18 @@ module Upton
     # Does @index_url stay unaltered for the lifetime of the Upton instance?
     # It seems to at this point, but that may be something that gets
     # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
     # i.e. this method expects @index_url to always have an absolute address
     # for the lifetime of an Upton instance
     def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
         href = a_element["href"]
-        u = resolve_url( href, @index_url) unless href.nil?
-        unless u == href
-          puts "resolved #{href} to #{u}"
-        end
-        u
+        resolved_url = resolve_url( href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
       end
     end
 
@@ -309,13 +279,13 @@ module Upton
     ##
     def get_index_pages(url, pagination_index, options={})
       resp = self.get_page(url, @index_debug, options)
-      if !resp.empty?
+      unless resp.empty?
         next_url = self.next_index_page_url(url, pagination_index + 1)
         # resolve to absolute url
         #
         next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
+          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
           resp += next_resp
         end
       end
@@ -324,20 +294,20 @@ module Upton
 
     ##
     # Returns the instance at `url`.
-    #
+    #
     # If the page is stashed, returns that, otherwise, fetches it from the web.
     #
-    # If an instance is paginated, returns the concatenated output of each
+    # If an instance is paginated, returns the concatenated output of each
     # page, e.g. if a news article has two pages.
     ##
     def get_instance(url, pagination_index=0, options={})
       resp = self.get_page(url, @debug, options)
-      if !resp.empty?
+      if !resp.empty?
         next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-
-        # next_url = resolve_url(next_url, url)
+
+        #next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
+          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
           resp += next_resp
         end
       end
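Instance-page pagination still requires an override, as the doc comment for next_instance_page_url says; only index pagination gained built-in support in this release. A minimal sketch of such a subclass, using the hypothetical URL scheme from the comments:

    class MyScraper < Upton::Scraper
      # Follow ?page=2, ?page=3, ... and stop by returning an empty string.
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 5
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end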
lib/upton/downloader.rb ADDED
@@ -0,0 +1,127 @@
+require "fileutils"
+require "open-uri"
+require "tmpdir"
+require "restclient"
+require "digest/md5"
+
+module Upton
+
+  # This class is used internally to download and cache the webpages
+  # that are requested.
+  #
+  # By default, the cache location is the output of `Dir.tmpdir`/upton.
+  # `Dir.tmpdir` returns the operating system's temporary directory.
+  # By default, the stashed files have a non-human-readable md5-based filename.
+  # If the `:readable_filenames` option is true, they will have human-readable names.
+  class Downloader
+
+    MAX_FILENAME_LENGTH = 130 # for unixes, win xp+
+    EMPTY_STRING = ''
+
+    attr_reader :uri, :cache_location, :verbose
+    def initialize(uri, options = {})
+      @uri = uri
+      @cache = options.fetch(:cache) { true }
+      @cache_location = File.absolute_path(options[:cache_location] || "#{Dir.tmpdir}/upton")
+      @verbose = options[:verbose] || false
+      @readable_stash_filenames = options[:readable_filenames] || false
+      initialize_cache!
+    end
+
+    def get
+      if cache_enabled?
+        puts "Stashing enabled. Will try reading #{uri} data from cache." if @verbose
+        download_from_cache!
+      else
+        puts "Stashing disabled. Will download from the internet." if @verbose
+        from_resource = true
+        resp = download_from_resource!
+        {:resp => resp, :from_resource => from_resource }
+      end
+    end
+
+    private
+
+    def download_from_resource!
+      begin
+        puts "Downloading from #{uri}" if @verbose
+        resp = RestClient.get(uri)
+        puts "Downloaded #{uri}" if @verbose
+      rescue RestClient::ResourceNotFound
+        puts "404 error, skipping: #{uri}" if @verbose
+      rescue RestClient::InternalServerError
+        puts "500 Error, skipping: #{uri}" if @verbose
+      rescue URI::InvalidURIError
+        puts "Invalid URI: #{uri}" if @verbose
+      rescue RestClient::RequestTimeout
+        puts "Timeout: #{uri}" if @verbose
+        retry
+      end
+      resp ||= EMPTY_STRING
+    end
+
+    def download_from_cache!
+      resp = if cached_file_exists?
+        puts "Cache of #{uri} available" if @verbose
+        from_resource = false
+        open(cached_file).read
+      else
+        if @verbose
+          if @readable_stash_filenames
+            puts "Cache of #{uri} unavailable at #{filename_from_uri}. Will download from the internet"
+          else
+            puts "Cache of #{uri} unavailable. Will download from the internet"
+          end
+        end
+        from_resource = true # this response came over the network, so get_page can rate-limit
+        download_from_resource!
+      end
+      unless cached_file_exists?
+        if @verbose
+          if @readable_stash_filenames
+            puts "Writing #{uri} data to the cache at #{cached_file}"
+          else
+            puts "Writing #{uri} data to the cache"
+          end
+        end
+        File.write(cached_file, resp)
+      end
+      {:resp => resp, :from_resource => from_resource }
+    end
+
+    def cache_enabled?
+      !!@cache
+    end
+
+    def filename_from_uri
+      @readable_stash_filenames ? readable_filename_from_uri : hashed_filename_from_uri
+    end
+
+    def hashed_filename_from_uri
+      Digest::MD5.hexdigest(uri)
+    end
+
+    def readable_filename_from_uri
+      html = "html"
+      clean_url_max_length = MAX_FILENAME_LENGTH - html.length - cache_location.size
+      clean_url = uri.gsub(/[^A-Za-z0-9\-_]/, "")[0...clean_url_max_length]
+      "#{clean_url}.#{html}"
+    end
+
+    def cached_file
+      "#{cache_location}/#{filename_from_uri}"
+    end
+
+    def cached_file_exists?
+      File.exists?(cached_file)
+    end
+
+    def initialize_cache!
+      unless Dir.exists?(cache_location)
+        Dir.mkdir(cache_location)
+        FileUtils.chmod 0700, cache_location
+      end
+    end
+
+  end
+end
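Although Downloader is described as internal, its get method's contract is visible above: it returns a hash with :resp (the response body, or an empty string on failure) and :from_resource (whether the network was actually hit). A quick sketch of exercising it directly, with an illustrative URL:

    dl = Upton::Downloader.new("http://www.example.com/", :cache => true, :verbose => true)
    result = dl.get
    result[:resp]          # page HTML, read from the cache when present
    result[:from_resource] # true only when it was fetched over the network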