upton 0.3.6 → 1.0.0.prea
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/upton.rb +3 -329
- data/lib/upton/downloader.rb +1 -1
- data/lib/upton/scraper.rb +123 -76
- data/lib/upton/utils.rb +2 -4
- data/lib/upton/version.rb +1 -1
- data/spec/upton_spec.rb +98 -27
- metadata +50 -41
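
The headline change in this release is how a Scraper is built and what its block receives: 0.3.6 used `Upton::Scraper.new` and passed the block each page's raw HTML string, while 1.0.0.prea adds the class-method builders `Upton::Scraper.index` and `Upton::Scraper.instances` and passes a single parsed page object. A minimal before-and-after sketch; the URL and selector here are placeholders, not values from this diff:

    require 'upton'
    require 'nokogiri'

    # 0.3.6: the constructor takes an index URL (or an Array of URLs) and a
    # selector; the scrape block receives the raw HTML string plus its URL
    # and indexes, and must parse the HTML itself.
    scraper = Upton::Scraper.new("http://example.com/articles.html", "a.headline")
    heds = scraper.scrape do |html, url, index, pagination_index|
      Nokogiri::HTML(html).css("h1.article-title").text
    end

    # 1.0.0.prea: class methods build the scraper, and the block receives
    # one already-parsed document (see the updated specs below).
    scraper = Upton::Scraper.index("http://example.com/articles.html", "a.headline")
    heds = scraper.scrape { |doc| doc.css("h1.article-title").text }
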
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c446c20e57e387b365d9c5bcda546a1b48ebbcf1
+  data.tar.gz: a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 11d5e990c42441d5bf599952bf3d49289754f68da72a51b31d62c86281531960fc18adffb7e52c59fe37ac5e275c35e766ce922c9a1922294f032d2a5c7cbea7
+  data.tar.gz: a6cbe33126fe3506c2248d40677e88e3d4e545ec2dc3e6613b3d623232f2288eae1305f41ee1ce63a996ff64812558596a95f1b14dffb63bca71bd89562e9fe7
data/lib/upton.rb
CHANGED
@@ -1,10 +1,10 @@
 # encoding: UTF-8
 
-
-require 'uri'
-require 'restclient'
+require_relative 'upton/scraper'
 require_relative 'upton/utils'
+require_relative 'upton/version'
 require_relative 'upton/downloader'
+require_relative 'upton/scraper'
 
 ##
 # This module contains a scraper called Upton
@@ -22,332 +22,6 @@ module Upton
   # site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   # job listings or news articles.
-  #
-  # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
-  #    or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
-  #    block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
-  #    e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
-  class Scraper
-    EMPTY_STRING = ''
-
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
-                  :pagination_interval
-
-    ##
-    # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
-    # the text of each instance page, (and optionally, its URL and its index
-    # in the list of instance URLs returned by +get_index+).
-    ##
-    def scrape(&blk)
-      self.url_array = self.get_index unless self.url_array
-      blk = Proc.new{|x| x} if blk.nil?
-      self.scrape_from_list(self.url_array, blk)
-    end
-
-    ##
-    # +index_url_or_array+: A list of string URLs, OR
-    # the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    # anchor elements within the page, if a url is specified for
-    # the previous argument.
-    #
-    # These options are a shortcut. If you plan to override +get_index+, you
-    # do not need to set them.
-    # If you don't specify a selector, the first argument will be treated as a
-    # list of URLs.
-    ##
-    def initialize(index_url_or_array, selector="")
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
-      # If true, then Upton prints information about when it gets
-      # files from the internet and when it gets them from its stash.
-      @verbose = false
-
-      # If true, then Upton fetches each instance page only once
-      # future requests for that file are responded to with the locally stashed
-      # version.
-      # You may want to set @debug to false for production (but maybe not).
-      # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
-      # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
-      # Index debug does the same, but for index pages.
-      @index_debug = false
-
-      # In order to not hammer servers, Upton waits for, by default, 30
-      # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
-
-      # If true, then Upton will attempt to scrape paginated index pages
-      @paginated = false
-      # Default query string parameter used to specify the current page
-      @pagination_param = 'page'
-      # Default number of paginated pages to scrape
-      @pagination_max_pages = 2
-      # Default starting number for pagination (second page is this plus 1).
-      @pagination_start_index = 1
-      # Default value to increment page number by
-      @pagination_interval = 1
-
-      # Folder name for stashes, if you want them to be stored somewhere else,
-      # e.g. under /tmp.
-      if @stash_folder
-        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
-      end
-    end
-
-    ##
-    # If instance pages are paginated, <b>you must override</b>
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If instance pages aren't paginated, there's no need to override this.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
-    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
-    ##
-    def next_instance_page_url(url, pagination_index)
-      EMPTY_STRING
-    end
-
-    ##
-    # Return the next URL to scrape, given the current URL and its index.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # If @paginated is not set (the default), this method returns an empty string.
-    #
-    # If @paginated is set, this method will return the next pagination URL
-    # to scrape using @pagination_param and the pagination_index.
-    #
-    # If the pagination_index is greater than @pagination_max_pages, then the
-    # method will return an empty string.
-    #
-    # Override this method to handle pagination is an alternative way
-    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
-    # ought to return "http://whatever.com/articles?page=2"
-    #
-    ##
-    def next_index_page_url(url, pagination_index)
-      return url unless @paginated
-
-      if pagination_index > @pagination_max_pages
-        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
-        EMPTY_STRING
-      else
-        uri = URI.parse(url)
-        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-        # update the pagination query string parameter
-        query[@pagination_param] = pagination_index
-        uri.query = URI.encode_www_form(query)
-        puts "Next index pagination url is #{uri}" if @verbose
-        uri.to_s
-      end
-    end
-
-    ##
-    # Writes the scraped result to a CSV at the given filename.
-    ##
-    def scrape_to_csv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb' do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    def scrape_to_tsv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    protected
-
-    ##
-    # Handles getting pages with Downlader, which handles stashing.
-    ##
-    def get_page(url, stash=false, options={})
-      return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
-      global_options = {
-        :cache => stash,
-        :verbose => @verbose
-      }
-      if @readable_filenames
-        global_options[:readable_filenames] = true
-      end
-      if @stash_folder
-        global_options[:readable_filenames] = true
-        global_options[:cache_location] = @stash_folder
-      end
-      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
-      if resp_and_cache[:from_resource]
-        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
-        sleep @sleep_time_between_requests
-      end
-      resp_and_cache[:resp]
-    end
-
-
-    ##
-    # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
-    # resolve_url resolves them to absolute urls.
-    # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
-    ##
-    def resolve_url(href_str, absolute_url_str)
-      if absolute_url_str.class <= URI::Generic
-        absolute_url = absolute_url_str.dup
-      else
-        begin
-          absolute_url = URI(absolute_url_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
-        end
-      end
-      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
-      if href_str.class <= URI::Generic
-        href = href_str.dup
-      else
-        begin
-          href = URI(href_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
-        end
-      end
-
-      # return :href if :href is already absolute
-      return href.to_s if href.absolute?
-
-      #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url.to_s, href.to_s).to_s
-    end
-
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
-    # TODO: Not sure the best way to handle this
-    # Currently, #parse_index is called upon #get_index_pages,
-    # which itself is dependent on @index_url
-    # Does @index_url stay unaltered for the lifetime of the Upton instance?
-    # It seems to at this point, but that may be something that gets
-    # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
-    # to make sure that this method returns absolute urls
-    # i.e. this method expects @index_url to always have an absolute address
-    # for the lifetime of an Upton instance
-    def parse_index(text, selector)
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
-        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
-        resolved_url
-      end
-    end
-
-
-    ##
-    # Returns the concatenated output of each member of a paginated index,
-    # e.g. a site listing links with 2+ pages.
-    ##
-    def get_index_pages(original_url, pagination_index, pagination_interval, options={})
-      resps = []
-      prev_url = nil
-      while resps.empty? || !resps.last.empty?
-        next_url = self.next_index_page_url(original_url, pagination_index)
-        break if next_url.empty?
-
-        next_url = resolve_url(next_url, original_url)
-        break if next_url == prev_url
-
-        next_resp = self.get_page(next_url, @index_debug, options).to_s
-        prev_url = next_url
-        pagination_index += pagination_interval
-        resps << next_resp
-      end
-      resps
-    end
-
-    ##
-    # Returns the instance at `url`.
-    #
-    # If the page is stashed, returns that, otherwise, fetches it from the web.
-    #
-    # If an instance is paginated, returns the concatenated output of each
-    # page, e.g. if a news article has two pages.
-    ##
-    def get_instance(url, pagination_index=0, options={})
-      resps = [self.get_page(url, @debug, options)]
-      pagination_index = pagination_index.to_i
-      prev_url = url
-      while !resps.last.empty?
-        next_url = self.next_instance_page_url(url, pagination_index + 1)
-        break if next_url == prev_url || next_url.empty?
-
-        next_resp = self.get_page(next_url, @debug, options)
-        prev_url = next_url
-        resps << next_resp
-      end
-      resps
-    end
-
-    # Just a helper for +scrape+.
-    def scrape_from_list(list, blk)
-      puts "Scraping #{list.size} instances" if @verbose
-      list.each_with_index.map do |instance_url, instance_index|
-        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
-        instance_resps.each_with_index.map do |instance_resp, pagination_index|
-          blk.call(instance_resp, instance_url, instance_index, pagination_index)
-        end
-      end.flatten(1)
-    end
-
-    # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
-    def slug(url)
-      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
-    end
 
-  end
 end
data/lib/upton/downloader.rb
CHANGED
@@ -103,7 +103,7 @@ module Upton
       msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
       resp_html = Nokogiri::HTML(resp)
       comment = Nokogiri::XML::Comment.new(resp_html, msg)
-      if resp_html.root.nil?
+      if resp_html.root.nil?
        return resp
       elsif resp_html.root.children.empty?
         resp_html.root.add_child(comment)
data/lib/upton/scraper.rb
CHANGED
@@ -1,9 +1,10 @@
 require 'uri'
 require 'nokogiri'
 require_relative './downloader'
+require_relative './page'
 
 module Upton
-
+  # Upton::Scraper can be used as-is for basic use-cases by:
   # 1. specifying the pages to be scraped in `new` as an index page
   # or as an Array of URLs.
   # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
@@ -14,9 +15,8 @@ module Upton
   class Scraper
     EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
-
-                  :pagination_interval
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
+                  :stash_folder, :readable_filenames
 
     ##
     # This is the main user-facing method for a basic scraper.
@@ -25,8 +25,8 @@ module Upton
     # in the list of instance URLs returned by +get_index+).
     ##
     def scrape(&blk)
-      self.url_array = self.get_index unless self.url_array
-      self.scrape_from_list(self.url_array, blk)
+      get_indexes!
+      self.scrape_from_list(@instance_urls, blk)
     end
 
     ##
@@ -41,23 +41,10 @@ module Upton
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-    def initialize(index_url_or_array, selector="")
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
+    def initialize(options={})
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
-      @verbose = false
+      @verbose = options[:verbose] || false
 
       # If true, then Upton fetches each instance page only once
       # future requests for that file are responded to with the locally stashed
@@ -66,29 +53,77 @@ module Upton
       # You can also control stashing behavior on a per-call basis with the
       # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
+      @debug = options[:debug] || true
       # Index debug does the same, but for index pages.
-      @index_debug = false
+      @index_debug = options[:index_debug] || false
 
       # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
+      @sleep_time_between_requests = options[:sleep_time_between_requests] || 30 #seconds
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+
+      @indexes = []
+      @instance_urls = []
+    end
+
+    def index(index_url, selector, options={})
+      # for future:
+      @indexes ||= []
 
+      ##
+      # Pagination options are per-index page
+      #
       # If true, then Upton will attempt to scrape paginated index pages
-      @paginated = false
+      options[:paginated] ||= false
       # Default query string parameter used to specify the current page
-      @pagination_param = 'page'
+      options[:pagination_param] ||= 'page'
       # Default number of paginated pages to scrape
-      @pagination_max_pages = 2
+      options[:pagination_max_pages] ||= 2
       # Default starting number for pagination (second page is this plus 1).
-      @pagination_start_index = 1
+      options[:pagination_start_index] ||= 1
       # Default value to increment page number by
-      @pagination_interval = 1
-
-      # Folder name for stashes, if you want them to be stored somewhere else,
-      # e.g. under /tmp.
-      if @stash_folder
-        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      options[:pagination_interval] ||= 1
+      ##
+
+      @indexes << [index_url, selector, options]
+      # and actually go scrape the index page, populate @instances
+      self
+    end
+
+    def self.index(index_url, selector, options={})
+      scraper = self.new
+      scraper.index(index_url, selector, options)
+      scraper
+    end
+
+    def self.instances(instances, options={})
+      s = self.new
+      s.instance_variable_set(:@instance_urls, instances)
+      s
+    end
+
+    # does
+    # def add_instances(urls)
+    #   #for future:
+    #   # @instances += urls
+    #   # @instances.uniq!
+    #   @instance_urls ||= []
+    #   @instance_urls += urls
+    #   @instance_urls.uniq!
+    # end
+
+    def instances(urls=nil)
+      if urls.nil?
+        @instance_urls
+      else
+        @instance_urls ||= []
+        @instance_urls += urls
+        self
       end
     end
 
@@ -125,21 +160,14 @@ module Upton
     # ought to return "http://whatever.com/articles?page=2"
     #
     ##
-    def next_index_page_url(url, pagination_index)
-      return url unless @paginated
-
-      if pagination_index > @pagination_max_pages
-        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
-        EMPTY_STRING
-      else
-        uri = URI.parse(url)
-        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-        # update the pagination query string parameter
-        query[@pagination_param] = pagination_index
-        uri.query = URI.encode_www_form(query)
-        puts "Next index pagination url is #{uri}" if @verbose
-        uri.to_s
-      end
+    def next_index_page_url(url, pagination_param, pagination_index)
+      uri = URI.parse(url)
+      query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+      # update the pagination query string parameter
+      query[pagination_param] = pagination_index
+      uri.query = URI.encode_www_form(query)
+      puts "Next index pagination url is #{uri}" if @verbose
+      uri.to_s
     end
 
     ##
@@ -147,36 +175,46 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      self.url_array = self.get_index unless self.url_array
+      self.get_indexes!
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-      self.url_array = self.get_index unless self.url_array
+      get_indexes!
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
+    def +(other_scraper)
+      raise ArgumentError, "#{other_scraper.class} can't be coerced into Upton::Scraper" unless other_scraper.class <= Upton::Scraper
+      new_scraper = Scraper.new
+      new_indexes = @indexes + other_scraper.instance_variable_get(:@indexes)
+      new_instances = @instance_urls + other_scraper.instance_variable_get(:@instance_urls)
+      new_scraper.instance_variable_set(:@indexes, new_indexes)
+      new_scraper.instance_variable_set(:@instance_urls, new_instances)
+      new_scraper
+    end
+
     protected
 
     ##
@@ -217,6 +255,8 @@ module Upton
           absolute_url = URI(absolute_url_str).dup
         rescue URI::InvalidURIError
           raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+        rescue ArgumentError
+          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
         end
       end
       raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -237,15 +277,6 @@ module Upton
       URI.join(absolute_url.to_s, href.to_s).to_s
     end
 
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
     # TODO: Not sure the best way to handle this
     # Currently, #parse_index is called upon #get_index_pages,
     # which itself is dependent on @index_url
@@ -253,30 +284,31 @@ module Upton
     # It seems to at this point, but that may be something that gets
     # deprecated later
     #
-    # So for now, @index_url is used in conjunction with resolve_url
+    # So for now, index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
-    # i.e. this method expects @index_url to always have an absolute address
-    # for the lifetime of an Upton instance
-    def parse_index(text, selector)
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
+    def parse_index(text, selector, index_url)
+      Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
+        href = anchor["href"]
+        resolved_url = resolve_url( href, index_url) unless href.nil?
         puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
         resolved_url
       end
     end
 
-
     ##
     # Returns the concatenated output of each member of a paginated index,
     # e.g. a site listing links with 2+ pages.
     ##
-    def get_index_pages(url, pagination_index, pagination_interval, options={})
+    def get_index_pages(url, pagination_index, options={})
       resps = [self.get_page(url, @index_debug, options)]
+      return resps unless options[:paginated]
+
       prev_url = url
       while !resps.last.empty?
-        pagination_index += pagination_interval
-        next_url = self.next_index_page_url(url, pagination_index)
+        pagination_index += options[:pagination_interval]
+        break if pagination_index > options[:pagination_max_pages]
+
+        next_url = self.next_index_page_url(url, options[:pagination_param], pagination_index)
         next_url = resolve_url(next_url, url)
         break if next_url == prev_url || next_url.empty?
 
@@ -310,13 +342,28 @@ module Upton
       resps
     end
 
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_indexes!
+      @indexes.each do |index_url, index_selector, options|
+        #TODO: cope with pagination stuff per URL
+
+        @instance_urls += get_index_pages(index_url, options[:pagination_start_index], options).map{|page| parse_index(page, index_selector, index_url) }.flatten
+      end
+    end
+
+
     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
       list.each_with_index.map do |instance_url, instance_index|
         instance_resps = get_instance instance_url, nil, :instance_index => instance_index
         instance_resps.each_with_index.map do |instance_resp, pagination_index|
-          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+          page = Page.new(instance_resp, instance_url, instance_index, pagination_index)
+          blk.call(page)
         end
       end.flatten(1)
     end
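
Two parts of the new scraper.rb API above are easier to read outside diff form: pagination is now configured per index through the options hash of `index` (defaulting to the `page` parameter, a maximum of 2 pages, start index 1, interval 1), and scrapers compose with `+`, which concatenates both scrapers' index definitions and instance URL lists into a new Scraper. A sketch under those assumptions; the URLs and selectors are placeholders:

    require 'upton'

    # Per-index pagination options override the defaults listed above:
    search = Upton::Scraper.index(
      "http://example.com/search.html",
      ".results a.title-link",
      { :paginated => true, :pagination_param => "p", :pagination_max_pages => 3 }
    )

    # `+` merges @indexes and @instance_urls into a fresh Scraper:
    extras   = Upton::Scraper.instances(["http://example.com/one-off.html"])
    combined = search + extras
    combined.sleep_time_between_requests = 5
    results  = combined.scrape { |doc| doc.css("h1.article-title").text }
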
data/lib/upton/utils.rb
CHANGED
@@ -18,8 +18,7 @@ module Upton
     # present, is returned as the first row.
     ##
     def self.table(table_selector, deprecated=nil)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         output = []
         headers = html.search(table_selector).css("th").map &:text
         output << headers
@@ -33,8 +32,7 @@ module Upton
     # Scrapes any set of HTML elements into an Array.
     ##
     def self.list(list_selector, deprecated=nil)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         html.search(list_selector).map{|list_element| list_element.text }
       end
     end
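
The Utils change tracks the Scraper change: the Procs returned by `Utils.table` and `Utils.list` no longer parse the raw HTML themselves but expect an already-parsed document, so they can be handed straight to `scrape` or `scrape_to_csv` as the block. A sketch, assuming the new Page object answers Nokogiri-style `search` calls as the specs' `doc.css` usage suggests; the URL, selector, and filename are placeholders:

    require 'upton'

    scraper = Upton::Scraper.instances(["http://example.com/report.html"])
    # Utils.table returns a Proc; scrape_to_csv calls it on each page and
    # writes the resulting rows to the named CSV file.
    scraper.scrape_to_csv("report.csv", &Upton::Utils.table("table#data"))
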
data/lib/upton/version.rb
CHANGED
data/spec/upton_spec.rb
CHANGED
@@ -52,15 +52,14 @@ describe Upton do
     stub_request(:get, "www.example.com/sixfacts.html").
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
-      hed = doc.css('h1.article-title').text
+    heds = propubscraper.scrape do |doc|
+      doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     heds.should eql @headlines
@@ -87,14 +86,13 @@ describe Upton do
       to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
+    heds = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -105,7 +103,7 @@ describe Upton do
     stub_request(:get, "www.example.com/propublica.html").
       to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -120,7 +118,7 @@ describe Upton do
     stub_request(:get, "www.example.com/easttimor.html").
       to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -139,8 +137,6 @@ describe Upton do
   it "should scrape paginated pages" do
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -153,17 +149,21 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+    propubscraper = Upton::Scraper.index(
+      "http://www.example.com/propublica_search.html",
+      '.compact-list a.title-link',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     propubscraper.debug = true
     propubscraper.verbose = false
-    propubscraper.paginated = true
-    propubscraper.pagination_param = 'p'
-    propubscraper.pagination_max_pages = 3
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    results = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
+    results = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -177,7 +177,7 @@ describe Upton do
 
   it "should sleep after requests with caching disabled" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = false
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.should_receive(:sleep)
@@ -187,7 +187,7 @@ describe Upton do
   it "should sleep after uncached requests when caching is enabled" do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = true
     u.stash_folder = "test_stashes"
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
@@ -199,8 +199,6 @@ describe Upton do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -213,12 +211,15 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+    u = Upton::Scraper.index("http://www.example.com/propublica_search.html", '.nonexistent',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     u.index_debug = false
     u.debug = false
-    u.paginated = true
-    u.pagination_param = 'p'
-    u.pagination_max_pages = 3
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.stash_folder = "test_stashes"
 
@@ -234,7 +235,7 @@ describe Upton do
     stub_request(:get, "www.example.com").
       to_return(:body => '', :status => 200)
 
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.stash_folder = custom_cache_folder
     u.debug = true
@@ -245,6 +246,76 @@ describe Upton do
     expect(files).not_to be_empty
   end
 
+  it "should scrape in the basic case with the index method" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow instances to be set on a new Scraper" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.instances(["www.example.com/webinar.html",
+                                              "www.example.com/discussion.html",
+                                              "www.example.com/prosecutor.html",
+                                              "www.example.com/sixfacts.html"])
+
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow Scrapers to be added (indexes)" do
+    u = Upton::Scraper.index("http://www.example1.com", '.link')
+    w = Upton::Scraper.index("http://www.example2.com", '.link')
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
+  it "should allow Scrapers to be added (instances)" do
+    pending
+    u = Upton::Scraper.instances(["http://www.example1.com"])
+    w = Upton::Scraper.instances(["http://www.example2.com"])
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).should eql []
+    new_scraper.instance_variable_get(:@instance_urls).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
 
   before do
     Upton::Scraper.stub(:puts)
@@ -252,7 +323,7 @@ describe Upton do
 
   it "should be silent if verbose is false" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.verbose = false
     u.should_not_receive(:puts)
metadata
CHANGED
@@ -1,119 +1,127 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 1.0.0.prea
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
-      - !ruby/object:Gem::Version
-        version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
-      - !ruby/object:Gem::Version
-        version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
   that's easy to use for debugging and doesn't hammer servers by default.
 email: jeremybmerrill@jeremybmerrill.com
@@ -122,22 +130,22 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
-- lib/upton/downloader.rb
 - lib/upton/scraper.rb
 - lib/upton/utils.rb
+- lib/upton/downloader.rb
 - lib/upton/version.rb
-- spec/data/
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
+- spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
-- spec/upton_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
@@ -148,30 +156,31 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>'
     - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.0.14
 signing_key:
 specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/easttimor.html
 - spec/data/discussion.html
+- spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/sixfacts.html
 - spec/data/propublica-relative.html
-- spec/data/easttimor.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
+has_rdoc: true