RubyGems - upton - Versions diffs - 0.2.11 → 0.3.0 - Mend

upton 0.2.11 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

data/lib/upton.rb CHANGED

@@ -35,7 +35,7 @@ module Upton
     EMPTY_STRING = ''
     attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-      :paginated, :pagination_param, :pagination_max_pages, :readable_filenames
+      :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
     ##
     # This is the main user-facing method for a basic scraper.
@@ -54,21 +54,13 @@ module Upton
     # +selector+: The XPath expression or CSS selector that specifies the
     #              anchor elements within the page, if a url is specified for
     #              the previous argument.
-    # +selector_method+: Deprecated and ignored. Next breaking release will
-    #                      remove this option.x
     #
     # These options are a shortcut. If you plan to override +get_index+, you
     # do not need to set them.
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-    # DEPRECATION NOTE, re: selector_method
-      # the selector_method parameter is unneeded, as Nokogiri provides the
-      #  #search method, which picks a selector depending on whether
-      #  the String passed is of CSS/XPath notation
-    def initialize(index_url_or_array, selector="", selector_method=:deprecated)
+    def initialize(index_url_or_array, selector="")
       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
@@ -107,8 +99,9 @@ module Upton
       @pagination_param = 'page'
       # Default number of paginated pages to scrape
       @pagination_max_pages = 2
+      # Default starting number for pagination (second page is this plus 1).
+      @pagination_start_index = 1
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
       if @stash_folder
@@ -231,18 +224,34 @@ module Upton
     ##
     # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
     # resolve_url resolves them to absolute urls.
-    # absolute_url_str must be a URL, as a string, that is absolute.
+    # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
     ##
     def resolve_url(href_str, absolute_url_str)
-      absolute_url = URI(absolute_url_str).dup
+      if absolute_url_str.class <= URI::Generic
+        absolute_url = absolute_url_str.dup
+      else
+        begin
+          absolute_url = URI(absolute_url_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+        end
+      end
       raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
-      href = URI(href_str).dup
+      if href_str.class <= URI::Generic
+        href = href_str.dup
+      else
+        begin
+          href = URI(href_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
+        end
+      end
       # return :href if :href is already absolute
       return href.to_s if href.absolute?
       #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url, href).to_s
+      URI.join(absolute_url.to_s, href.to_s).to_s
     end
     ##
@@ -251,16 +260,7 @@ module Upton
     # comes from an API.
     ##
     def get_index
-      # TODO: Deprecate @index_Selector_method in next minor release
-      parse_index(get_index_pages(@index_url, 1), @index_selector)
-    end
-    ##
-    # Using the XPath expression or CSS selector and selector_method that
-    # uniquely identifies the links in the index, return those links as strings.    ##
-    def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
-      # for now, override selector_method with :search, which will work with either CSS or XPath
-      Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
+      index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
     end
     # TODO: Not sure the best way to handle this
@@ -274,8 +274,7 @@ module Upton
     # to make sure that this method returns absolute urls
     # i.e. this method expects @index_url to always have an absolute address
     # for the lifetime of an Upton instance
-    def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
-      # for now, override selector_method with :search, which will work with either CSS or XPath
+    def parse_index(text, selector)
       Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
         href = a_element["href"]
         resolved_url = resolve_url( href, @index_url) unless href.nil?
@@ -290,18 +289,19 @@ module Upton
     # e.g. a site listing links with 2+ pages.
     ##
     def get_index_pages(url, pagination_index, options={})
-      resp = self.get_page(url, @index_debug, options)
-      unless resp.empty?
-        next_url = self.next_index_page_url(url, pagination_index + 1)
-        # resolve to absolute url
-        #
+      resps = [self.get_page(url, @index_debug, options)]
+      prev_url = url
+      while !resps.last.empty?
+        pagination_index += 1
+        next_url = self.next_index_page_url(url, pagination_index)
         next_url = resolve_url(next_url, url)
-        unless next_url == url
-          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
-          resp += next_resp
-        end
+        break if next_url == prev_url || next_url.empty?
+        next_resp = self.get_page(next_url, @index_debug, options).to_s
+        prev_url = next_url
+        resps << next_resp
       end
-      resp
+      resps
     end
     ##
@@ -313,26 +313,29 @@ module Upton
     # page, e.g. if a news article has two pages.
     ##
     def get_instance(url, pagination_index=0, options={})
-      resp = self.get_page(url, @debug, options)
-      if !resp.empty?
-        next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-        #next_url = resolve_url(next_url, url)
-        unless next_url == url
-          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
-          resp += next_resp
-        end
+      resps = [self.get_page(url, @debug, options)]
+      pagination_index = pagination_index.to_i
+      prev_url = url
+      while !resps.last.empty?
+        next_url = self.next_instance_page_url(url, pagination_index + 1)
+        break if next_url == prev_url || next_url.empty?
+        next_resp = self.get_page(next_url, @debug, options)
+        prev_url = next_url
+        resps << next_resp
       end
-      resp
+      resps
     end
     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
       list.each_with_index.map do |instance_url, instance_index|
-        instance_resp = get_instance instance_url, nil, :instance_index => instance_index
-        blk.call(instance_resp, instance_url, instance_index)
-      end
+        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+        instance_resps.each_with_index.map do |instance_resp, pagination_index|
+          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+        end
+      end.flatten(1)
     end
     # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.

data/lib/upton/downloader.rb CHANGED

@@ -42,10 +42,14 @@ module Upton
     private
+    def make_request_for_resource!
+      RestClient.get(uri)
+    end
     def download_from_resource!
       begin
         puts "Downloading from #{uri}" if @verbose
-        resp = RestClient.get(uri)
+        resp = make_request_for_resource!
         puts "Downloaded #{uri}" if @verbose
       rescue RestClient::ResourceNotFound
         puts "404 error, skipping: #{uri}" if @verbose
@@ -73,7 +77,7 @@ module Upton
                   puts "Cache of #{uri} unavailable. Will download from the internet"
                 end
               end
-              from_resource = false
+              from_resource = true
               download_from_resource!
             end
       unless cached_file_exists?
@@ -84,7 +88,7 @@ module Upton
             puts "Writing #{uri} data to the cache"
           end
         end
-        File.write(cached_file, resp)
+        open(cached_file, 'w'){|f| f << resp}
       end
       {:resp => resp, :from_resource => from_resource }
     end

data/lib/upton/version.rb ADDED

@@ -0,0 +1,3 @@
+module Upton # :nodoc:
+    VERSION = '0.3.0'
+end

data/spec/upton_spec.rb CHANGED

@@ -54,8 +54,9 @@ describe Upton do
     propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
-    propubscraper.verbose = true
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -88,8 +89,9 @@ describe Upton do
     propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
-    propubscraper.verbose = true
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -105,8 +107,9 @@ describe Upton do
     propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
-    propubscraper.verbose = true
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
     list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a"))
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -119,10 +122,12 @@ describe Upton do
     propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
-    propubscraper.verbose = true
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
     table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
+    table.map{|outer| outer.map{|row| row.map{|cell| cell.gsub!("\n", '') } }} # cope with diff nokogiri versions differing behavior.
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     table.should eql @east_timor_prime_ministers
   end
@@ -148,11 +153,12 @@ describe Upton do
     propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
     propubscraper.debug = true
-    propubscraper.verbose = true
+    propubscraper.verbose = false
     propubscraper.paginated = true
     propubscraper.pagination_param = 'p'
     propubscraper.pagination_max_pages = 3
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
     results = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -167,15 +173,57 @@ describe Upton do
     Upton::Scraper.stub(:sleep)
   end
-  it "should sleep after uncached requests" do
+  it "should sleep after requests with caching disabled" do
     stub_request(:get, "www.example.com")
     u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.index_debug = false
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.should_receive(:sleep)
-    stub = stub_request(:get, "http://www.example.com")
     u.scrape
   end
+  it "should sleep after uncached requests when caching is enabled" do
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    stub_request(:get, "www.example.com")
+    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.index_debug = true
+    u.stash_folder = "test_stashes"
+    u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
+    u.should_receive(:sleep)
+    u.scrape
+  end
+  it "should sleep after paginated requests when caching is disabled" do
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    stub_request(:get, "www.example.com/propublica_search.html").
+      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=2").
+      to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=3").
+      to_return(:body => '', :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+    u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+    u.index_debug = false
+    u.debug = false
+    u.paginated = true
+    u.pagination_param = 'p'
+    u.pagination_max_pages = 3
+    u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
+    u.stash_folder = "test_stashes"
+    u.should_receive(:sleep).exactly(3).times #once for each search page, so 3.
+    u.scrape
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+  end
   it "should save to the designated stash folder" do
     custom_cache_folder = "#{Dir.tmpdir}/upton/test"
     FileUtils.rm_rf(custom_cache_folder)
@@ -183,17 +231,28 @@ describe Upton do
       to_return(:body => '', :status => 200)
     u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.sleep_time_between_requests = 0.0
     u.stash_folder = custom_cache_folder
     u.debug = true
     u.scrape do
       1+1
     end
-    puts [custom_cache_folder, custom_cache_folder + "/*", Dir.glob(custom_cache_folder)].inspect
     files = Dir.glob(custom_cache_folder)
     expect(files).not_to be_empty
   end
-  it "should be silent if verbose if false" do
-    pending
+  before do
+    Upton::Scraper.stub(:puts)
   end
+  it "should be silent if verbose is false" do
+    stub_request(:get, "www.example.com")
+    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.sleep_time_between_requests = 0.0
+    u.verbose = false
+    u.should_not_receive(:puts)
+    u.scrape
+  end
 end

metadata CHANGED

@@ -1,102 +1,116 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.2.11
+  version: 0.3.0
+  prerelease:
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-11-14 00:00:00.000000000 Z
+date: 2013-12-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.5.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -104,6 +118,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -111,29 +126,33 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -146,52 +165,54 @@ files:
 - lib/upton.rb
 - lib/upton/utils.rb
 - lib/upton/downloader.rb
-- spec/data/discussion.html
+- lib/upton/version.rb
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/propublica-relative.html
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/prosecutor.html
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
-metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version: 1.8.7
+      version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.3
+rubygems_version: 1.8.23
 signing_key:
-specification_version: 4
+specification_version: 3
 summary: A simple web-scraping framework
 test_files:
-- spec/data/discussion.html
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/propublica-relative.html
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/prosecutor.html
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb

checksums.yaml DELETED

@@ -1,7 +0,0 @@
----
-SHA1:
-  metadata.gz: 2ef1916db6e2fb734cb8ea7ed33eb5edb67b37e3
-  data.tar.gz: 2a9da49f8a47dfc9e1feab2138045f7aa49268d6
-SHA512:
-  metadata.gz: e94a228a8fb01c90c0e7535b106b2af4dd8983ea3e92b2813cd5d038c3985a5f55c5fbcac19ee5f16f3271ad9e390d426f0ad8ad7b0c08afdf3b9d745cff2738
-  data.tar.gz: f8b0475e022980cd6ca0eec6dc8512394723084ba59d0b47cd36c24c736fbfc4a58b52ce186a3f5b91c69fd1241dfaa9d57c5f71bf6867255426e0fd3f26ed0f