RubyGems - upton - Versions diffs - 0.2.7 → 0.2.8 - Mend

upton 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/lib/upton.rb +86 -116
data/lib/upton/downloader.rb +126 -0
data/lib/upton/utils.rb +43 -0
data/spec/data/propublica.html +269 -269
data/spec/data/propublica_search.html +388 -0
data/spec/data/propublica_search_page_2.html +375 -0
data/spec/spec_helper.rb +20 -0
data/spec/upton_downloader_spec.rb +75 -0
data/spec/upton_spec.rb +110 -47
metadata +26 -3
data/lib/utils.rb +0 -74

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,20 @@
+# This file was generated by the `rspec --init` command. Conventionally, all
+# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+# Require this file using `require "spec_helper"` to ensure that it is only
+# loaded once.
+#
+# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+require "webmock/rspec"
+RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
+  config.run_all_when_everything_filtered = true
+  config.filter_run :focus
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+  WebMock.disable_net_connect!(:allow_localhost => true)
+end

data/spec/upton_downloader_spec.rb ADDED

@@ -0,0 +1,75 @@
+require "spec_helper.rb"
+require_relative "../lib/upton/downloader.rb"
+describe Upton::Downloader do
+  def remove_default_cache_folder!
+    FileUtils.rm_rf(default_cache_folder)
+  end
+  def default_cache_folder
+    "#{Dir.tmpdir}/upton"
+  end
+  let(:cache) { Upton::Downloader.new("http://www.example.com") }
+  let(:uncache) { Upton::Downloader.new("http://www.example.com", cache: false ) }
+  context "When caching enabled" do
+    context "When disk cache is unavailable" do
+      before(:each) do
+        remove_default_cache_folder!
+      end
+      it "should download from the resource once" do
+        stub = stub_request(:get, "http://www.example.com")
+        cache.get
+        stub.should have_been_requested.once
+      end
+      it "should use the cache from the second request" do
+        stub = stub_request(:get, "http://www.example.com")
+        cache.get
+        cache.get
+        stub.should have_been_requested.once
+      end
+    end
+    context "cache available" do
+      it "should not make a http request" do
+        stub = stub_request(:get, "http://www.example.com")
+        cache.get
+        stub.should_not have_been_requested
+      end
+    end
+    context "Different urls should have different caches" do
+      let(:cache_one) { Upton::Downloader.new("http://www.example.com", cache: true) }
+      let(:cache_two) { Upton::Downloader.new("http://www.example.com?a=1&b=2", cache: true) }
+      it "should create two cached files inside the cache directory" do
+        remove_default_cache_folder!
+        stub_one = stub_request(:get, "http://www.example.com")
+        stub_two = stub_request(:get, "http://www.example.com?a=1&b=2")
+        cache_one.get
+        cache_two.get
+        Dir.entries(default_cache_folder).count.should eq(4)
+      end
+    end
+  end
+  context "When caching disabled" do
+    context "When #download is called twice" do
+      it "should make two requests" do
+        stub = stub_request(:get, "http://www.example.com")
+        uncache.get
+        uncache.get
+        stub.should have_been_requested.twice
+      end
+    end
+  end
+end

data/spec/upton_spec.rb CHANGED

@@ -5,53 +5,29 @@ require 'thin'
 require 'nokogiri'
 require 'restclient'
 require 'fileutils'
+require "spec_helper.rb"
 require './lib/upton'
 describe Upton do
   before :all do
-    #start the server
-    class Server
-      def call(env)
-        @root = File.expand_path(File.dirname(__FILE__))
-        path = Rack::Utils.unescape(env['PATH_INFO'])
-        path += 'index.html' if path == '/'
-        file = File.join(@root, "data", path)
-        params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
-        if File.exists?(file)
-          [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
-        else
-          [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
-        end
-      end
-    end
-    def start_test_server
-      @server_thread = Thread.new do
-        Rack::Handler::Thin.run ::Server.new, :Port => 9876
-      end
-      sleep(1) # wait a sec for the server to be booted
-    end
-    start_test_server()
-    @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
+    @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
                  "",
                  "A Prosecutor, a Wrongful Conviction and a Question of Justice",
                  "Six Facts Lost in the IRS Scandal"]
-    @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
-                        "How the IRS’s Nonprofit Division Got So Dysfunctional",
-                        "Sound, Fury and the IRS Mess",
-                        "The Most Important #Muckreads on Rape in the Military",
-                        "Congressmen to Hagel: Where Are the Missing War Records?",
-                        "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
-                        "A Prosecutor, a Wrongful Conviction and a Question of Justice",
-                        "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
+    @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
+                        "How the IRS’s Nonprofit Division Got So Dysfunctional",
+                        "Sound, Fury and the IRS Mess",
+                        "The Most Important #Muckreads on Rape in the Military",
+                        "Congressmen to Hagel: Where Are the Missing War Records?",
+                        "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
+                        "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+                        "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
                         "The Story Behind Our Hospital Interactive",
                         "irs-test-charts-for-embedding"]]
-    @east_timor_prime_ministers = [[
-                                    ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
+    @east_timor_prime_ministers = [[
+                                    ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
                                       "1", "2", "3", "4",],
                                     [],
                                     ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
@@ -59,12 +35,27 @@ describe Upton do
                                     ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
                                     ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
                                   ]]
+    @searchResults = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
+                 "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+                 "Six Facts Lost in the IRS Scandal"]
   end
   it "should scrape in the basic case" do
-    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+    propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
     propubscraper.verbose = true
+    propubscraper.sleep_time_between_requests = 0
     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -74,18 +65,31 @@ describe Upton do
     heds.should eql @headlines
   end
-  it 'should properly handle relative urls'  do
+  it 'should properly handle relative urls'  do
 # uses a modified page from the previous test in which the target
 # href, http://127.0.0.1:9876/prosecutors.html, has been changed
 # to a relative url
 #
-# Note: this test is a bit quirky, because it passes on the fact that
+# Note: this test is a bit quirky, because it passes on the fact that
 # the resolve_url creates a url identical to one that is already stashed ("prosecutors.html").
 # So it works, but because of a coupling to how Upton handles caching in the file system
-    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
+    stub_request(:get, "www.example.com/propublica-relative.html").
+      to_return(:body => File.new('./spec/data/propublica-relative.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
     propubscraper.verbose = true
+    propubscraper.sleep_time_between_requests = 0
     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -96,23 +100,82 @@ describe Upton do
   end
   it "should scrape a list properly with the list helper" do
-    propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
     propubscraper.verbose = true
-    list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
+    propubscraper.sleep_time_between_requests = 0
+    list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a"))
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     list.should eql @most_commented_heds
   end
   it "should scrape a table properly with the table helper" do
-    propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
+    stub_request(:get, "www.example.com/easttimor.html").
+      to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
+    propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
     propubscraper.verbose = true
+    propubscraper.sleep_time_between_requests = 0
     table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     table.should eql @east_timor_prime_ministers
   end
-  it "should test saving files with the right encoding"
-  it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
+  it "should test saving files with the right encoding" do
+    pending "finding a site that gives funny encodings"
+  end
+  it "should scrape paginated pages" do
+    stub_request(:get, "www.example.com/propublica_search.html").
+      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=2").
+      to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=3").
+      to_return(:body => '', :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+    propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+    propubscraper.debug = true
+    propubscraper.verbose = true
+    propubscraper.paginated = true
+    propubscraper.pagination_param = 'p'
+    propubscraper.pagination_max_pages = 3
+    propubscraper.sleep_time_between_requests = 0
+    results = propubscraper.scrape do |article_str|
+      doc = Nokogiri::HTML(article_str)
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    results.should eql @searchResults
+  end
+  before do
+    Upton::Scraper.stub(:sleep)
+  end
+  it "should sleep after uncached requests" do
+    stub_request(:get, "www.example.com")
+    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.should_receive(:sleep)
+    stub = stub_request(:get, "http://www.example.com")
+    u.scrape
+  end
+  it "should be silent if verbose if false" do
+    pending
+  end
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.2.7
+  version: 0.2.8
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-08-11 00:00:00.000000000 Z
+date: 2013-08-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
@@ -38,6 +38,20 @@ dependencies:
     - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: webmock
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
@@ -130,7 +144,9 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
-- lib/utils.rb
+- lib/upton/utils.rb
+- lib/upton/downloader.rb
+- spec/data/propublica_search_page_2.html
 - spec/data/webinar.html
 - spec/data/propublica-relative.html
 - spec/data/propublica.html
@@ -138,7 +154,10 @@ files:
 - spec/data/sixfacts.html
 - spec/data/discussion.html
 - spec/data/easttimor.html
+- spec/data/propublica_search.html
 - spec/upton_spec.rb
+- spec/spec_helper.rb
+- spec/upton_downloader_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
@@ -164,6 +183,7 @@ signing_key:
 specification_version: 4
 summary: A simple web-scraping framework
 test_files:
+- spec/data/propublica_search_page_2.html
 - spec/data/webinar.html
 - spec/data/propublica-relative.html
 - spec/data/propublica.html
@@ -171,5 +191,8 @@ test_files:
 - spec/data/sixfacts.html
 - spec/data/discussion.html
 - spec/data/easttimor.html
+- spec/data/propublica_search.html
 - spec/upton_spec.rb
+- spec/spec_helper.rb
+- spec/upton_downloader_spec.rb
 has_rdoc: true

data/lib/utils.rb DELETED

@@ -1,74 +0,0 @@
-# encoding: UTF-8
-##
-# This module contains a collection of helpers for Upton
-##
-module Upton
-  ##
-  # This class contains a collection of helpers for Upton
-  #
-  # Each method returns a Proc that (with an & ) can be used as the final
-  # argument to Upton's `scrape` and `scrape_to_csv`
-  ##
-  module Utils
-    ##
-    # Scrapes an HTML <table> element into an Array of Arrays. The header, if
-    # present, is returned as the first row.
-    ##
-    def self.table(table_selector, selector_method=:xpath)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
-        output = []
-        headers = html.send(selector_method, table_selector).css("th").map &:text
-        output << headers
-        table = html.send(selector_method, table_selector).css("tr").each{|tr| output << tr.css("td").map(&:text) }
-        output
-      end
-    end
-    ##
-    # Scrapes any set of HTML elements into an Array.
-    ##
-    def self.list(list_selector, selector_method=:xpath)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
-        html.send(selector_method, list_selector).map{|list_element| list_element.text }
-      end
-    end
-    ##
-    # Takes :_href and resolves it to an absolute URL according to
-    #  the supplied :_page_url. They can be either Strings or URI
-    #  instances.
-    #
-    # raises ArgumentError if either href or page_url is nil
-    # raises ArgumentError if page_url is not absolute
-    #
-    # returns: a String with absolute URL
-    def self.resolve_url(_href, _page_url)
-      page_url = URI(_page_url).dup
-      raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
-      href = URI(_href).dup
-      # return :href if :href is already absolute
-      return href.to_s if href.absolute?
-      # TODO: There may be edge cases worth considering
-      # but this should handle the following non-absolute href possibilities:
-      # //anothersite.com (keeps scheme, too!)
-      # /root/dir
-      # relative/dir
-      # ?query=2
-      # #bang
-      URI.join(page_url, href).to_s
-    end
-  end
-end