upton 0.3.0 → 0.3.1
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +7 -0
- data/lib/upton/downloader.rb +21 -1
- data/lib/upton/scraper.rb +330 -0
- data/lib/upton/version.rb +1 -1
- data/lib/upton.rb +7 -4
- metadata +24 -43
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
+  data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
+SHA512:
+  metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
+  data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
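The checksums.yaml file is new in this release because RubyGems 2.x records SHA1 and SHA512 digests of the gem's metadata.gz and data.tar.gz archives. A minimal sketch of reproducing such a digest locally, assuming the two archives have been extracted from upton-0.3.1.gem into the current directory (the paths are hypothetical):

    require 'digest'

    # Compute the same digests recorded in checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |archive|
      puts "SHA1   #{archive}: #{Digest::SHA1.file(archive).hexdigest}"
      puts "SHA512 #{archive}: #{Digest::SHA512.file(archive).hexdigest}"
    end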
data/lib/upton/downloader.rb
CHANGED
@@ -2,6 +2,7 @@ require "fileutils"
 require "open-uri"
 require "tmpdir"
 require "restclient"
+require_relative "./version"
 
 module Upton
 
@@ -88,11 +89,30 @@ module Upton
         puts "Writing #{uri} data to the cache"
       end
     end
-
+      commented_resp = add_comment(resp)
+      open(cached_file, 'w'){|f| f << commented_resp}
     end
     {:resp => resp, :from_resource => from_resource }
   end
 
+  def add_comment(resp)
+    # n = Nokogiri::HTML("<html></html>")
+    # c = Nokogiri::XML::Comment.new(n, "asdfasdf")
+    # n.root.add_child(c)
+    # <!----Retrieved by Upton from http://www.somesite.com on January 15 at 4:28 p.m.-->
+    msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
+    resp_html = Nokogiri::HTML(resp)
+    comment = Nokogiri::XML::Comment.new(resp_html, msg)
+    if resp_html.root.nil?
+      return resp
+    elsif resp_html.root.children.empty?
+      resp_html.root.add_child(comment)
+    else
+      resp_html.root.children.before(comment)
+    end
+    resp_html.to_html
+  end
+
   def cache_enabled?
     !!@cache
   end
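The new add_comment helper stamps each stashed page with a provenance comment before it is written to the cache: it parses the response with Nokogiri and inserts the comment as the first child of the document root, falling back to the raw response when there is no root. A standalone sketch of the same Nokogiri calls, with a made-up message string:

    require 'nokogiri'

    doc = Nokogiri::HTML("<html><body><p>hello</p></body></html>")
    comment = Nokogiri::XML::Comment.new(doc, " retrieved by Upton 0.3.1 ")
    doc.root.children.before(comment)  # insert before the root's first child
    puts doc.to_html  # the comment is now the first node inside <html>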
data/lib/upton/scraper.rb
ADDED
@@ -0,0 +1,330 @@
+require 'uri'
+require 'nokogiri'
+require_relative './downloader'
+
+module Upton
+  # Upton::Scraper can be used as-is for basic use-cases by:
+  # 1. specifying the pages to be scraped in `new` as an index page
+  #    or as an Array of URLs.
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
+  #    block from Upton::Utils.
+  # For more complicated cases, subclass Upton::Scraper,
+  # e.g. +MyScraper < Upton::Scraper+, and override various methods.
+  ##
+  class Scraper
+    EMPTY_STRING = ''
+
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+                  :pagination_interval
+
+    ##
+    # This is the main user-facing method for a basic scraper.
+    # Call +scrape+ with a block; this block will be called on
+    # the text of each instance page (and optionally, its URL and its index
+    # in the list of instance URLs returned by +get_index+).
+    ##
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
+      self.scrape_from_list(self.url_array, blk)
+    end
+
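For reference while reading the new file, this is the basic calling pattern the class supports. A minimal sketch, assuming a hypothetical index page whose article links match the CSS selector "a.article-link":

    require 'upton'
    require 'nokogiri'

    scraper = Upton::Scraper.new("http://example.com/articles", "a.article-link")
    scraper.verbose = true
    headlines = scraper.scrape do |html, url, index|
      # called once per instance page; unused trailing block args are simply dropped
      Nokogiri::HTML(html).search("h1").text.strip
    end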
+    ##
+    # +index_url_or_array+: A list of string URLs, OR
+    # the URL of the page containing the list of instances.
+    # +selector+: The XPath expression or CSS selector that specifies the
+    # anchor elements within the page, if a url is specified for
+    # the previous argument.
+    #
+    # These options are a shortcut. If you plan to override +get_index+, you
+    # do not need to set them.
+    # If you don't specify a selector, the first argument will be treated as a
+    # list of URLs.
+    ##
+    def initialize(index_url_or_array, selector="")
+
+      # if the first arg is a valid URL, do the already-written stuff;
+      # if it's not (or if it's a list?), don't bother with get_index, etc.
+      # e.g. Scraper.new(["http://jeremybmerrill.com"])
+
+      # TODO: rewrite this, because it's a little silly (i.e. there should be a more sensible division of how these arguments work)
+      if index_url_or_array.respond_to? :each_with_index
+        @url_array = index_url_or_array
+      else
+        @index_url = index_url_or_array
+        @index_selector = selector
+      end
+
+      # If true, then Upton prints information about when it gets
+      # files from the internet and when it gets them from its stash.
+      @verbose = false
+
+      # If true, then Upton fetches each instance page only once;
+      # future requests for that file are responded to with the locally stashed
+      # version.
+      # You may want to set @debug to false for production (but maybe not).
+      # You can also control stashing behavior on a per-call basis with the
+      # optional second argument to get_page if, for instance, you want to
+      # stash certain instance pages, e.g. based on their modification date.
+      @debug = true
+      # Index debug does the same, but for index pages.
+      @index_debug = false
+
+      # In order to not hammer servers, Upton waits, by default, 30
+      # seconds between requests to the remote server.
+      @sleep_time_between_requests = 30 # seconds
+
+      # If true, then Upton will attempt to scrape paginated index pages.
+      @paginated = false
+      # Default query string parameter used to specify the current page.
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape.
+      @pagination_max_pages = 2
+      # Default starting number for pagination (second page is this plus 1).
+      @pagination_start_index = 1
+      # Default value to increment the page number by.
+      @pagination_interval = 1
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+    end
+
+    ##
+    # If instance pages are paginated, <b>you must override</b>
+    # this method to return the next URL, given the current URL and its index.
+    #
+    # If instance pages aren't paginated, there's no need to override this.
+    #
+    # Recursion stops if the fetched URL returns an empty string or an error.
+    #
+    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
+    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+    ##
+    def next_instance_page_url(url, pagination_index)
+      EMPTY_STRING
+    end
+
+    ##
+    # Return the next URL to scrape, given the current URL and its index.
+    #
+    # Recursion stops if the fetched URL returns an empty string or an error.
+    #
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination in an alternative way,
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
+    # ought to return "http://whatever.com/articles?page=2"
+    ##
+    def next_index_page_url(url, pagination_index)
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
+    end
+
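next_index_page_url only rewrites the query string, so its behavior is easy to check in isolation. A sketch against a hypothetical paginated index (example.com and the selector are made up):

    scraper = Upton::Scraper.new("http://example.com/articles", "a")
    scraper.paginated = true
    scraper.pagination_max_pages = 3

    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => "http://example.com/articles?page=2"
    scraper.next_index_page_url("http://example.com/articles?page=3", 4)
    # => "" (past pagination_max_pages, so index fetching stops)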
+    ##
+    # Writes the scraped result to a CSV at the given filename.
+    ##
+    def scrape_to_csv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb' do |csv|
+        # this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        # self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
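The document[0].respond_to? :map check means a block may return either one row (a flat array) or a list of rows (an array of arrays). Reusing the scraper from the earlier sketch, with a hypothetical filename, a block that emits one row per instance page:

    scraper.scrape_to_csv("headlines.csv") do |html, url|
      doc = Nokogiri::HTML(html)
      [url, doc.search("h1").text.strip]  # a flat array: written as a single CSV row
    end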
+    def scrape_to_tsv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
+        # this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        # self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
+    protected
+
+    ##
+    # Handles getting pages with Downloader, which handles stashing.
+    ##
+    def get_page(url, stash=false, options={})
+      return EMPTY_STRING if url.nil? || url.empty? # url is nil if the <a> lacks an `href` attribute.
+      global_options = {
+        :cache => stash,
+        :verbose => @verbose
+      }
+      if @readable_filenames
+        global_options[:readable_filenames] = true
+      end
+      if @stash_folder
+        global_options[:readable_filenames] = true
+        global_options[:cache_location] = @stash_folder
+      end
+      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
+      end
+      resp_and_cache[:resp]
+    end
+
+
+    ##
+    # Sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html";
+    # resolve_url resolves them to absolute urls.
+    # absolute_url_str must be a string or URI that represents an absolute URL.
+    ##
+    def resolve_url(href_str, absolute_url_str)
+      if absolute_url_str.class <= URI::Generic
+        absolute_url = absolute_url_str.dup
+      else
+        begin
+          absolute_url = URI(absolute_url_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
+        end
+      end
+      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
+      if href_str.class <= URI::Generic
+        href = href_str.dup
+      else
+        begin
+          href = URI(href_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{href_str} must represent a valid relative or absolute URI"
+        end
+      end
+
+      # return href if href is already absolute
+      return href.to_s if href.absolute?
+
+      # TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
+      URI.join(absolute_url.to_s, href.to_s).to_s
+    end
+
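resolve_url is protected, but its contract is simple: join a possibly relative href against a known absolute base, passing absolute hrefs through untouched. A sketch using send to reach the protected method on the same hypothetical scraper (URLs are made up):

    scraper.send(:resolve_url, "about.html", "http://example.com/articles/index.html")
    # => "http://example.com/articles/about.html"
    scraper.send(:resolve_url, "http://other.example/x", "http://example.com/")
    # => "http://other.example/x" (already absolute, returned as-is)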
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_index
+      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
+    end
+
+    # TODO: Not sure the best way to handle this.
+    # Currently, #parse_index is called upon #get_index_pages,
+    # which itself is dependent on @index_url.
+    # Does @index_url stay unaltered for the lifetime of the Upton instance?
+    # It seems to at this point, but that may be something that gets
+    # deprecated later.
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
+    # to make sure that this method returns absolute urls,
+    # i.e. this method expects @index_url to always have an absolute address
+    # for the lifetime of an Upton instance.
+    def parse_index(text, selector)
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+        href = a_element["href"]
+        resolved_url = resolve_url(href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
+      end
+    end
+
+
+    ##
+    # Returns the concatenated output of each member of a paginated index,
+    # e.g. a site listing links with 2+ pages.
+    ##
+    def get_index_pages(url, pagination_index, pagination_interval, options={})
+      resps = [self.get_page(url, @index_debug, options)]
+      prev_url = url
+      while !resps.last.empty?
+        pagination_index += pagination_interval
+        next_url = self.next_index_page_url(url, pagination_index)
+        next_url = resolve_url(next_url, url)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @index_debug, options).to_s
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    ##
+    # Returns the instance at `url`.
+    #
+    # If the page is stashed, returns that; otherwise, fetches it from the web.
+    #
+    # If an instance is paginated, returns the concatenated output of each
+    # page, e.g. if a news article has two pages.
+    ##
+    def get_instance(url, pagination_index=0, options={})
+      resps = [self.get_page(url, @debug, options)]
+      pagination_index = pagination_index.to_i
+      prev_url = url
+      while !resps.last.empty?
+        next_url = self.next_instance_page_url(url, pagination_index + 1)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @debug, options)
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    # Just a helper for +scrape+.
+    def scrape_from_list(list, blk)
+      puts "Scraping #{list.size} instances" if @verbose
+      list.each_with_index.map do |instance_url, instance_index|
+        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+        instance_resps.each_with_index.map do |instance_resp, pagination_index|
+          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+        end
+      end.flatten(1)
+    end
+
+    # It's often useful to have this slug method for uniquely (almost certainly) identifying pages.
+    def slug(url)
+      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
+    end
+
+  end
+end
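For paginated instance pages, the class expects a subclass to supply next_instance_page_url, as the comments above note. A sketch of such an override, for a hypothetical site that paginates articles with a ?page= parameter and at most three pages:

    class ArticleScraper < Upton::Scraper
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 3
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end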
data/lib/upton/version.rb
CHANGED
data/lib/upton.rb
CHANGED
@@ -35,7 +35,8 @@ module Upton
   EMPTY_STRING = ''
 
   attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-    :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
+    :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+    :pagination_interval
 
   ##
   # This is the main user-facing method for a basic scraper.
@@ -101,6 +102,8 @@ module Upton
     @pagination_max_pages = 2
     # Default starting number for pagination (second page is this plus 1).
     @pagination_start_index = 1
+    # Default value to increment page number by
+    @pagination_interval = 1
 
     # Folder name for stashes, if you want them to be stored somewhere else,
     # e.g. under /tmp.
@@ -260,7 +263,7 @@ module Upton
   # comes from an API.
   ##
   def get_index
-    index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
+    index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
   end
 
   # TODO: Not sure the best way to handle this
@@ -288,11 +291,11 @@ module Upton
   # Returns the concatenated output of each member of a paginated index,
   # e.g. a site listing links with 2+ pages.
   ##
-  def get_index_pages(url, pagination_index, options={})
+  def get_index_pages(url, pagination_index, pagination_interval, options={})
     resps = [self.get_page(url, @index_debug, options)]
     prev_url = url
     while !resps.last.empty?
-      pagination_index += 1
+      pagination_index += pagination_interval
       next_url = self.next_index_page_url(url, pagination_index)
       next_url = resolve_url(next_url, url)
       break if next_url == prev_url || next_url.empty?
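The new @pagination_interval exists for indexes whose pagination parameter advances by something other than 1, e.g. an offset-style ?start= parameter. A sketch under that assumption (the site, selector, and parameter name are hypothetical):

    scraper = Upton::Scraper.new("http://example.com/results", "a.result")
    scraper.paginated = true
    scraper.pagination_param = 'start'
    scraper.pagination_start_index = 0
    scraper.pagination_interval = 10   # subsequent index pages use start=10, start=20, ...
    scraper.pagination_max_pages = 20  # compared against the raw parameter value, not a page count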
metadata
CHANGED
@@ -1,116 +1,102 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.0
-  prerelease:
+  version: 0.3.1
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -118,7 +104,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -126,33 +111,29 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -163,6 +144,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
+- lib/upton/scraper.rb
 - lib/upton/utils.rb
 - lib/upton/downloader.rb
 - lib/upton/version.rb
@@ -181,27 +163,26 @@ files:
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
    - !ruby/object:Gem::Version
      version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.14
 signing_key:
-specification_version:
+specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html