RubyGems - upton - Versions diffs - 0.2.6 → 0.2.7 - Mend

upton 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml +6 -14
data/lib/upton.rb +130 -35
data/lib/utils.rb +32 -2
data/{test → spec}/data/discussion.html +0 -0
data/{test → spec}/data/easttimor.html +0 -0
data/spec/data/propublica-relative.html +17 -0
data/{test → spec}/data/propublica.html +0 -0
data/{test → spec}/data/prosecutor.html +0 -0
data/{test → spec}/data/sixfacts.html +0 -0
data/{test → spec}/data/webinar.html +0 -0
data/spec/upton_spec.rb +118 -0
metadata +59 -29
data/test/test_upton.rb +0 -141

checksums.yaml CHANGED Viewed

@@ -1,15 +1,7 @@
 ---
-!binary "U0hBMQ==":
-  metadata.gz: !binary |-
-    OWFkOGUyYjcyNzA3ZWQ2YTNmYTZmMjJjOTc3NzJiMjY0MTllMDhhOA==
-  data.tar.gz: !binary |-
-    YTA5YTEyMzczZjNjYjVlYjNmNmUyZWM0MTA4Zjk2NTRjYWQwZjFjMg==
-!binary "U0hBNTEy":
-  metadata.gz: !binary |-
-    MjFhYjI5OTYwZGVlYTNlMmNhYTc1OWQ5ZGJmMzBlN2FiM2U4MzllMDM4Nzhk
-    MjJkMTczOGZjNWUwNDMyYmFlOGRkZDlhNjFkM2RlMzM1YjFmZTgyZWQ4MTBj
-    MjY2ZmFiYmZlOTc5YmE2YzFjMWE1YjVjZWY2MWMyYTczZmEwNGU=
-  data.tar.gz: !binary |-
-    YmFjOTllZjdlNWIwNzhhMGIxODQwOTI1Y2EwY2YzMTE1YWEzOTdkMWI3NDEy
-    ZjA1OTE1N2Q0OGYwOWEyYjVjMDM3ZWQ1NzlhZmU3NDZlNTAxNDJmZWZjZGFm
-    YjUxMzc3ZThkZDg1ZDdkMjgwM2UyODMwZTZiMjdjZDAyNjAxNTQ=
+SHA1:
+  metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
+  data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
+SHA512:
+  metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
+  data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a

data/lib/upton.rb CHANGED Viewed

@@ -28,7 +28,7 @@ module Upton
   # 2.  supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
   #      block from Upton::Utils.
   # For more complicated cases; subclass Upton::Scraper
-  #    e.g. +MyScraper < Upton::Scraper+ and overrdie various methods.
+  #    e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
   class Scraper
@@ -53,28 +53,32 @@ module Upton
     # +selector+: The XPath expression or CSS selector that specifies the
     #              anchor elements within the page, if a url is specified for
     #              the previous argument.
-    # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
+    # +selector_method+: Deprecated and ignored. Next breaking release will
+    #                      remove this option.
     #
-    # These options are a shortcut. If you plant to override +get_index+, you
+    # These options are a shortcut. If you plan to override +get_index+, you
     # do not need to set them.
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-    def initialize(index_url_or_array, selector="", selector_method=:xpath)
+    # DEPRECATION NOTE, re: selector_method
+      # the selector_method parameter is unneeded, as Nokogiri provides the
+      #  #search method, which picks a selector depending on whether
+      #  the String passed is of CSS/XPath notation
+    def initialize(index_url_or_array, selector="", selector_method=:deprecated)
       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
       #e.g. Scraper.new(["http://jeremybmerrill.com"])
       #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if selector.empty?
+      if index_url_or_array.respond_to? :each_with_index
         @url_array = index_url_or_array
-      elsif index_url_or_array =~ ::URI::ABS_URI
+      else
         @index_url = index_url_or_array
         @index_selector = selector
-        @index_selector_method = selector_method
-      else
-        raise ArgumentError
       end
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
@@ -97,9 +101,9 @@ module Upton
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
-      @stash_folder = "stashes"
+      @stash_folder ||= "stashes"
       unless Dir.exists?(@stash_folder)
-        Dir.mkdir(@stash_folder)
+        FileUtils.mkdir_p(@stash_folder)
       end
     end
@@ -114,7 +118,7 @@ module Upton
     # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
     # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
     ##
-    def next_instance_page_url(url, index)
+    def next_instance_page_url(url, pagination_index)
       ""
     end
@@ -129,7 +133,7 @@ module Upton
     # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
     # ought to return "http://whatever.com/articles?page=2"
     ##
-    def next_index_page_url(url, index)
+    def next_index_page_url(url, pagination_index)
       ""
     end
@@ -142,29 +146,64 @@ module Upton
         self.url_array = self.get_index
       end
       CSV.open filename, 'wb' do |csv|
-        self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
+        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          puts document.inspect
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+    def scrape_to_tsv filename, &blk
+      require 'csv'
+      unless self.url_array
+        self.url_array = self.get_index
+      end
+      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
+        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          puts document.inspect
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
       end
     end
     protected
+    ##
+    # Actually fetches the page
+    ##
+    def fetch_page(url, options={})
+      RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
+    end
     ##
     # Handles getting pages with RestClient or getting them from the local stash.
     #
     # Uses a kludge (because rest-client is outdated) to handle encoding.
     ##
-    def get_page(url, stash=false)
+    def get_page(url, stash=false, options={})
       return "" if url.empty?
       #the filename for each stashed version is a cleaned version of the URL.
-      if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
+      if stash && File.exists?( url_to_filename(url, options) )
         puts "usin' a stashed copy of " + url if @verbose
-        resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
+        resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
       else
         begin
           puts "getting " + url if @verbose
           sleep @sleep_time_between_requests
-          resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
+          resp = fetch_page(url, options)
           #this is silly, but rest-client needs to get on their game.
           #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
@@ -188,42 +227,95 @@ module Upton
         rescue URI::InvalidURIError
           puts "Invalid URI: #{url}" if @verbose
           resp = ""
+        rescue RestClient::RequestTimeout
+          "Timeout: #{url}" if @verbose
+          retry
         end
         if stash
           puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
-          open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
+          open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
         end
       end
       resp
     end
+    def url_to_filename(url, options={})
+      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
+    end
+    ##
+    # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
+    # resolve_url resolves them to absolute urls.
+    # absolute_url_str must be a URL, as a string, that is absolute.
+    ##
+    def resolve_url(href_str, absolute_url_str)
+      absolute_url = URI(absolute_url_str).dup
+      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
+      href = URI(href_str).dup
+      # return :href if :href is already absolute
+      return href.to_s if href.absolute?
+      #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
+      URI.join(absolute_url, href).to_s
+    end
     ##
     # Return a list of URLs for the instances you want to scrape.
     # This can optionally be overridden if, for example, the list of instances
     # comes from an API.
     ##
     def get_index
-      parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
+      # TODO: Deprecate @index_selector_method in next minor release
+      parse_index(get_index_pages(@index_url, 1), @index_selector)
     end
     ##
     # Using the XPath expression or CSS selector and selector_method that
-    # uniquely identifies the links in the index, return those links as strings.
-    ##
-    def parse_index(text, selector, selector_method=:xpath)
-      Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
+    # uniquely identifies the links in the index, return those links as strings.    ##
+    def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
+      # for now, override selector_method with :search, which will work with either CSS or XPath
+      Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
+    end
+    # TODO: Not sure the best way to handle this
+    # Currently, #parse_index is called upon #get_index_pages,
+    #  which itself is dependent on @index_url
+    # Does @index_url stay unaltered for the lifetime of the Upton instance?
+    # It seems to at this point, but that may be something that gets
+    #  deprecated later
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
+    # to make sure that this method returns absolute urls
+    # i.e. this method expects @index_url to always have an absolute address
+    # for the lifetime of an Upton instance
+    def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
+      # for now, override selector_method with :search, which will work with either CSS or XPath
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+        href = a_element["href"]
+        u = resolve_url( href, @index_url) unless href.nil?
+        unless u == href
+          puts "resolved #{href} to #{u}"
+        end
+        u
+      end
     end
     ##
     # Returns the concatenated output of each member of a paginated index,
     # e.g. a site listing links with 2+ pages.
     ##
-    def get_index_pages(url, index)
-      resp = self.get_page(url, @index_debug)
+    def get_index_pages(url, pagination_index, options={})
+      resp = self.get_page(url, @index_debug, options)
       if !resp.empty?
-        next_url = self.next_index_page_url(url, index + 1)
+        next_url = self.next_index_page_url(url, pagination_index + 1)
+        # resolve to absolute url
+        #
+        next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_index_pages(next_url, index + 1).to_s
+          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
           resp += next_resp
         end
       end
@@ -231,19 +323,21 @@ module Upton
     end
     ##
-    # Returns the article at `url`.
+    # Returns the instance at `url`.
     #
     # If the page is stashed, returns that, otherwise, fetches it from the web.
     #
     # If an instance is paginated, returns the concatenated output of each
     # page, e.g. if a news article has two pages.
     ##
-    def get_instance(url, index=0)
-      resp = self.get_page(url, @debug)
+    def get_instance(url, pagination_index=0, options={})
+      resp = self.get_page(url, @debug, options)
       if !resp.empty?
-        next_url = self.next_instance_page_url(url, index + 1)
+        next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
+#        next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_instance(next_url, index + 1).to_s
+          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
           resp += next_resp
         end
       end
@@ -253,8 +347,9 @@ module Upton
     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
-      list.each_with_index.map do |instance_url, index|
-        blk.call(get_instance(instance_url), instance_url, index)
+      list.each_with_index.map do |instance_url, instance_index|
+        instance_resp = get_instance instance_url, nil, :instance_index => instance_index
+        blk.call(instance_resp, instance_url, instance_index)
       end
     end

data/lib/utils.rb CHANGED Viewed

@@ -18,7 +18,6 @@ module Upton
     # present, is returned as the first row.
     ##
     def self.table(table_selector, selector_method=:xpath)
-      require 'csv'
       return Proc.new do |instance_html|
         html = ::Nokogiri::HTML(instance_html)
         output = []
@@ -34,11 +33,42 @@ module Upton
     # Scrapes any set of HTML elements into an Array.
     ##
     def self.list(list_selector, selector_method=:xpath)
-      require 'csv'
       return Proc.new do |instance_html|
         html = ::Nokogiri::HTML(instance_html)
         html.send(selector_method, list_selector).map{|list_element| list_element.text }
       end
     end
+    ##
+    # Takes :_href and resolves it to an absolute URL according to
+    #  the supplied :_page_url. They can be either Strings or URI
+    #  instances.
+    #
+    # raises ArgumentError if either href or page_url is nil
+    # raises ArgumentError if page_url is not absolute
+    #
+    # returns: a String with absolute URL
+    def self.resolve_url(_href, _page_url)
+      page_url = URI(_page_url).dup
+      raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
+      href = URI(_href).dup
+      # return :href if :href is already absolute
+      return href.to_s if href.absolute?
+      # TODO: There may be edge cases worth considering
+      # but this should handle the following non-absolute href possibilities:
+      # //anothersite.com (keeps scheme, too!)
+      # /root/dir
+      # relative/dir
+      # ?query=2
+      # #bang
+      URI.join(page_url, href).to_s
+    end
   end
 end

data/{test → spec}/data/discussion.html RENAMED Viewed

File without changes

data/{test → spec}/data/easttimor.html RENAMED Viewed

File without changes

data/spec/data/propublica-relative.html ADDED Viewed

@@ -0,0 +1,17 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>Document</title>
+</head>
+<body>
+<!-- refactored fixture for relative URL testing -->
+<h2><a href="iamnottobeselected.html" class="title-link">An unnecessary proof of concept but just for kicks</a></h2>
+<section id="river">
+  <h1><a href="prosecutor.html" class="title-link">A Prosecutor, a Wrongful Conviction and a Question of Justice</a></h1>
+  </section>
+</body>
+</html>

data/{test → spec}/data/propublica.html RENAMED Viewed

File without changes

data/{test → spec}/data/prosecutor.html RENAMED Viewed

File without changes

data/{test → spec}/data/sixfacts.html RENAMED Viewed

File without changes

data/{test → spec}/data/webinar.html RENAMED Viewed

File without changes

data/spec/upton_spec.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# encoding: UTF-8
+require 'rack'
+require 'thin'
+require 'nokogiri'
+require 'restclient'
+require 'fileutils'
+require './lib/upton'
+describe Upton do
+  before :all do
+    #start the server
+    class Server
+      def call(env)
+        @root = File.expand_path(File.dirname(__FILE__))
+        path = Rack::Utils.unescape(env['PATH_INFO'])
+        path += 'index.html' if path == '/'
+        file = File.join(@root, "data", path)
+        params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
+        if File.exists?(file)
+          [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
+        else
+          [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
+        end
+      end
+    end
+    def start_test_server
+      @server_thread = Thread.new do
+        Rack::Handler::Thin.run ::Server.new, :Port => 9876
+      end
+      sleep(1) # wait a sec for the server to be booted
+    end
+    start_test_server()
+    @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
+                 "",
+                 "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+                 "Six Facts Lost in the IRS Scandal"]
+    @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
+                        "How the IRS’s Nonprofit Division Got So Dysfunctional",
+                        "Sound, Fury and the IRS Mess",
+                        "The Most Important #Muckreads on Rape in the Military",
+                        "Congressmen to Hagel: Where Are the Missing War Records?",
+                        "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
+                        "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+                        "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
+                        "The Story Behind Our Hospital Interactive",
+                        "irs-test-charts-for-embedding"]]
+    @east_timor_prime_ministers = [[
+                                    ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
+                                      "1", "2", "3", "4",],
+                                    [],
+                                    ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
+                                    ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
+                                    ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
+                                    ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
+                                  ]]
+  end
+  it "should scrape in the basic case" do
+    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
+    propubscraper.debug = true
+    propubscraper.verbose = true
+    heds = propubscraper.scrape do |article_str|
+      doc = Nokogiri::HTML(article_str)
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+  it 'should properly handle relative urls'  do
+# uses a modified page from the previous test in which the target
+# href, http://127.0.0.1:9876/prosecutor.html, has been changed
+# to a relative url
+#
+# Note: this test is a bit quirky, because it passes only because
+# resolve_url creates a url identical to one that is already stashed ("prosecutor.html").
+# So it works, but only through a coupling to how Upton handles caching in the file system
+    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
+    propubscraper.debug = true
+    propubscraper.verbose = true
+    heds = propubscraper.scrape do |article_str|
+      doc = Nokogiri::HTML(article_str)
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql ["A Prosecutor, a Wrongful Conviction and a Question of Justice"]
+  end
+  it "should scrape a list properly with the list helper" do
+    propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
+    propubscraper.debug = true
+    propubscraper.verbose = true
+    list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    list.should eql @most_commented_heds
+  end
+  it "should scrape a table properly with the table helper" do
+    propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
+    propubscraper.debug = true
+    propubscraper.verbose = true
+    table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    table.should eql @east_timor_prime_ministers
+  end
+  it "should test saving files with the right encoding"
+  it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
+end

metadata CHANGED Viewed

@@ -1,69 +1,83 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.2.6
+  version: 0.2.7
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-07-22 00:00:00.000000000 Z
+date: 2013-08-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
@@ -84,14 +98,28 @@ dependencies:
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -103,13 +131,14 @@ extra_rdoc_files: []
 files:
 - lib/upton.rb
 - lib/utils.rb
-- test/data/discussion.html
-- test/data/easttimor.html
-- test/data/propublica.html
-- test/data/prosecutor.html
-- test/data/sixfacts.html
-- test/data/webinar.html
-- test/test_upton.rb
+- spec/data/webinar.html
+- spec/data/propublica-relative.html
+- spec/data/propublica.html
+- spec/data/prosecutor.html
+- spec/data/sixfacts.html
+- spec/data/discussion.html
+- spec/data/easttimor.html
+- spec/upton_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
@@ -120,26 +149,27 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: 1.8.7
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.5
+rubygems_version: 2.0.2
 signing_key:
 specification_version: 4
 summary: A simple web-scraping framework
 test_files:
-- test/data/discussion.html
-- test/data/easttimor.html
-- test/data/propublica.html
-- test/data/prosecutor.html
-- test/data/sixfacts.html
-- test/data/webinar.html
-- test/test_upton.rb
+- spec/data/webinar.html
+- spec/data/propublica-relative.html
+- spec/data/propublica.html
+- spec/data/prosecutor.html
+- spec/data/sixfacts.html
+- spec/data/discussion.html
+- spec/data/easttimor.html
+- spec/upton_spec.rb
 has_rdoc: true

data/test/test_upton.rb DELETED Viewed

@@ -1,141 +0,0 @@
-# encoding: UTF-8
-require 'test/unit'
-require 'rack'
-require 'thin'
-require 'nokogiri'
-require 'restclient'
-require './lib/upton'
-require 'fileutils'
-module Upton
-  module Test
-    # class ProPublicaScraper < Upton::Scraper
-    #   def initialize(a, b, c)
-    #     super
-    #     @verbose = false
-    #     @debug = false
-    #     @stash_folder = "test_stashes"
-    #   end
-    # end
-    class UptonTest < ::Test::Unit::TestCase
-      # def test_get_page
-      #TODO
-      # end
-      # def test_stash
-      #TODO
-      # end
-      def test_scrape
-        #this doesn't test stashing.
-        start_test_server()
-        headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
-                     "",
-                     "A Prosecutor, a Wrongful Conviction and a Question of Justice",
-                     "Six Facts Lost in the IRS Scandal"]
-        propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
-        propubscraper.debug = true
-        propubscraper.verbose = true
-        heds = propubscraper.scrape do |article_str|
-          doc = Nokogiri::HTML(article_str)
-          hed = doc.css('h1.article-title').text
-        end
-        assert_equal(heds, headlines)
-        FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
-      end
-      def test_encodings
-        skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
-      end
-      def test_stashing
-        skip "should test stashing, make sure we never send too many requests"
-      end
-      def test_scrape_list
-        #this doesn't test stashing.
-        #TODO: needs a website that has links to a multi-page list (or table)
-        start_test_server()
-        most_commented_heds = [["Six Facts Lost in the IRS Scandal",
-                            "How the IRS’s Nonprofit Division Got So Dysfunctional",
-                            "Sound, Fury and the IRS Mess",
-                            "The Most Important #Muckreads on Rape in the Military",
-                            "Congressmen to Hagel: Where Are the Missing War Records?",
-                            "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
-                            "A Prosecutor, a Wrongful Conviction and a Question of Justice",
-                            "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
-                            "The Story Behind Our Hospital Interactive",
-                            "irs-test-charts-for-embedding"]]
-        propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
-        propubscraper.debug = true
-        propubscraper.verbose = true
-        list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
-        assert_equal(list, most_commented_heds)
-        FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
-      end
-      def test_scrape_table
-        #this doesn't test stashing.
-        start_test_server()
-        east_timor_prime_ministers = [[
-                                        ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
-                                          "1", "2", "3", "4",],
-                                        [],
-                                        ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
-                                        ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
-                                        ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
-                                        ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
-                                      ]]
-        propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
-        propubscraper.debug = true
-        propubscraper.verbose = true
-        table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
-        assert_equal(table, east_timor_prime_ministers)
-        FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
-      end
-      private
-      def start_test_server
-        @server_thread = Thread.new do
-          Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
-        end
-        sleep(1) # wait a sec for the server to be booted
-      end
-    end
-    # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
-    class Server
-      def call(env)
-        @root = File.expand_path(File.dirname(__FILE__))
-        path = Rack::Utils.unescape(env['PATH_INFO'])
-        path += 'index.html' if path == '/'
-        file = File.join(@root, "data", path)
-        params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
-        if File.exists?(file)
-          [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
-        else
-          [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
-        end
-      end
-    end
-  end
-end