upton 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/upton.rb +186 -0
- data/test/data/discussion.html +620 -0
- data/test/data/propublica.html +1554 -0
- data/test/data/prosecutor.html +2223 -0
- data/test/data/sixfacts.html +2234 -0
- data/test/data/webinar.html +881 -0
- data/test/test_upton.rb +82 -0
- metadata +114 -0
    
        data/test/test_upton.rb
    ADDED
    
    | @@ -0,0 +1,82 @@ | |
| 1 | 
            +
            require 'test/unit'
         | 
| 2 | 
            +
            require 'rack'
         | 
| 3 | 
            +
            require 'thin'
         | 
| 4 | 
            +
            require 'nokogiri'
         | 
| 5 | 
            +
            require 'restclient'
         | 
| 6 | 
            +
            require 'upton'
         | 
| 7 | 
            +
            require 'fileutils'
         | 
| 8 | 
            +
             | 
| 9 | 
            +
            module Upton
         | 
| 10 | 
            +
              module Test
         | 
| 11 | 
            +
             | 
| 12 | 
            +
                # class ProPublicaScraper < Upton::Scraper
         | 
| 13 | 
            +
                #   def initialize(a, b, c)
         | 
| 14 | 
            +
                #     super
         | 
| 15 | 
            +
                #     @verbose = false
         | 
| 16 | 
            +
                #     @debug = false
         | 
| 17 | 
            +
                #     @stash_folder = "test_stashes"
         | 
| 18 | 
            +
                #   end
         | 
| 19 | 
            +
                # end
         | 
| 20 | 
            +
             | 
| 21 | 
            +
             | 
| 22 | 
            +
                # End-to-end test for Upton::Scraper.
                #
                # Boots a local Thin server (see #start_test_server) and points a
                # scraper at it, then checks the headline extracted from each page
                # the scraper yields.
                class UptonTest < ::Test::Unit::TestCase

                  # TODO: add test_get_page and test_stash.

                  # Scrapes the index page served by the local test server and
                  # compares each yielded article's <h1 class="article-title"> text
                  # against the expected headlines, in order.
                  def test_scrape
                    # this doesn't test stashing.
                    start_test_server

                    headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
                                 "Discussion: Military Lending and Debt",
                                 "A Prosecutor, a Wrongful Conviction and a Question of Justice",
                                 "Six Facts Lost in the IRS Scandal"]

                    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
                    propubscraper.verbose = false
                    propubscraper.debug = false

                    propubscraper.scrape do |article_str|
                      doc = Nokogiri::HTML(article_str)
                      hed = doc.css('h1.article-title').text
                      # assert_equal takes (expected, actual); keeping that order
                      # makes the failure message read correctly.
                      assert_equal(headlines.shift, hed)
                    end
                  ensure
                    # Runs even when an assertion fails, so a failed run does not
                    # leave a stash directory behind. File.directory? replaces
                    # Dir.exists?, which was removed in Ruby 3.2.
                    FileUtils.rm_r("test_stashes") if File.directory?("test_stashes")
                  end

                  private

                  # Starts the fixture server on port 9876 in a background thread and
                  # stores the thread in @server_thread. The thread is never stopped
                  # here.
                  def start_test_server
                    @server_thread = Thread.new do
                      Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
                    end
                    sleep(1) # wait a sec for the server to be booted
                  end
                end
         | 
| 61 | 
            +
             | 
| 62 | 
            +
             | 
| 63 | 
            +
             | 
| 64 | 
            +
                # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
         | 
| 65 | 
            +
                # Minimal Rack application that serves static files from a directory,
                # used as the HTTP fixture server for the scraper test.
                #
                # NOTE(review): the default root is this file's own directory, while the
                # gem's file list places the HTML fixtures under test/data/ -- confirm
                # the intended root against the fixture layout.
                class Server
                  # @param root [String] directory to serve files from; defaults to the
                  #   directory containing this file (the original behaviour).
                  def initialize(root = File.dirname(__FILE__))
                    @root = File.expand_path(root)
                  end

                  # Rack entry point.
                  #
                  # @param env [Hash] Rack environment; only 'PATH_INFO' is read.
                  # @return [Array] Rack response triplet: 200 with the file contents as
                  #   text/html when the requested path is a regular file inside the
                  #   root, otherwise 404 with a plain-text message.
                  def call(env)
                    path = Rack::Utils.unescape(env['PATH_INFO'])
                    path += 'index.html' if path == '/'
                    file = File.expand_path(File.join(@root, path))

                    # File.file? (rather than the removed File.exists?) also rejects
                    # directories, which File.read cannot open.
                    if inside_root?(file) && File.file?(file)
                      # The Rack SPEC requires the body to respond to #each, so the
                      # String is wrapped in an Array.
                      [200, { "Content-Type" => "text/html" }, [File.read(file)]]
                    else
                      [404, { 'Content-Type' => 'text/plain' }, ['file not found']]
                    end
                  end

                  private

                  # True when the already-expanded +file+ lies below the served root;
                  # blocks "/../" style requests from escaping it.
                  def inside_root?(file)
                    file.start_with?(File.join(@root, ''))
                  end
                end
         | 
| 81 | 
            +
              end
         | 
| 82 | 
            +
            end
         | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,114 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification 
         | 
| 2 | 
            +
            name: upton
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            +
              version: 0.1.0
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors: 
         | 
| 7 | 
            +
            - Jeremy B. Merrill
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
             | 
| 12 | 
            +
            date: 2013-05-29 00:00:00 Z
         | 
| 13 | 
            +
            dependencies: 
         | 
| 14 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 15 | 
            +
              name: rack
         | 
| 16 | 
            +
              prerelease: false
         | 
| 17 | 
            +
              requirement: &id001 !ruby/object:Gem::Requirement 
         | 
| 18 | 
            +
                requirements: 
         | 
| 19 | 
            +
                - &id002 
         | 
| 20 | 
            +
                  - ">="
         | 
| 21 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 22 | 
            +
                    version: "0"
         | 
| 23 | 
            +
              type: :development
         | 
| 24 | 
            +
              version_requirements: *id001
         | 
| 25 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 26 | 
            +
              name: thin
         | 
| 27 | 
            +
              prerelease: false
         | 
| 28 | 
            +
              requirement: &id003 !ruby/object:Gem::Requirement 
         | 
| 29 | 
            +
                requirements: 
         | 
| 30 | 
            +
                - *id002
         | 
| 31 | 
            +
              type: :development
         | 
| 32 | 
            +
              version_requirements: *id003
         | 
| 33 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 34 | 
            +
              name: nokogiri
         | 
| 35 | 
            +
              prerelease: false
         | 
| 36 | 
            +
              requirement: &id004 !ruby/object:Gem::Requirement 
         | 
| 37 | 
            +
                requirements: 
         | 
| 38 | 
            +
                - *id002
         | 
| 39 | 
            +
              type: :development
         | 
| 40 | 
            +
              version_requirements: *id004
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 42 | 
            +
              name: yard
         | 
| 43 | 
            +
              prerelease: false
         | 
| 44 | 
            +
              requirement: &id005 !ruby/object:Gem::Requirement 
         | 
| 45 | 
            +
                requirements: 
         | 
| 46 | 
            +
                - *id002
         | 
| 47 | 
            +
              type: :development
         | 
| 48 | 
            +
              version_requirements: *id005
         | 
| 49 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 50 | 
            +
              name: rest-client
         | 
| 51 | 
            +
              prerelease: false
         | 
| 52 | 
            +
              requirement: &id006 !ruby/object:Gem::Requirement 
         | 
| 53 | 
            +
                requirements: 
         | 
| 54 | 
            +
                - - ~>
         | 
| 55 | 
            +
                  - !ruby/object:Gem::Version 
         | 
| 56 | 
            +
                    version: 1.6.7
         | 
| 57 | 
            +
              type: :runtime
         | 
| 58 | 
            +
              version_requirements: *id006
         | 
| 59 | 
            +
            - !ruby/object:Gem::Dependency 
         | 
| 60 | 
            +
              name: nokogiri
         | 
| 61 | 
            +
              prerelease: false
         | 
| 62 | 
            +
              requirement: &id007 !ruby/object:Gem::Requirement 
         | 
| 63 | 
            +
                requirements: 
         | 
| 64 | 
            +
                - *id002
         | 
| 65 | 
            +
              type: :runtime
         | 
| 66 | 
            +
              version_requirements: *id007
         | 
| 67 | 
            +
            description: Don't re-write web scrapers every time. Upton gives you a scraper template that's easy to use for debugging and doesn't hammer servers by default.
         | 
| 68 | 
            +
            email: jeremy.merrill@propublica.org
         | 
| 69 | 
            +
            executables: []
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            extensions: []
         | 
| 72 | 
            +
             | 
| 73 | 
            +
            extra_rdoc_files: []
         | 
| 74 | 
            +
             | 
| 75 | 
            +
            files: 
         | 
| 76 | 
            +
            - lib/upton.rb
         | 
| 77 | 
            +
            - test/data/discussion.html
         | 
| 78 | 
            +
            - test/data/propublica.html
         | 
| 79 | 
            +
            - test/data/prosecutor.html
         | 
| 80 | 
            +
            - test/data/sixfacts.html
         | 
| 81 | 
            +
            - test/data/webinar.html
         | 
| 82 | 
            +
            - test/test_upton.rb
         | 
| 83 | 
            +
            homepage: http://github.com/propublica/upton
         | 
| 84 | 
            +
            licenses: 
         | 
| 85 | 
            +
            - MIT
         | 
| 86 | 
            +
            metadata: {}
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            post_install_message: 
         | 
| 89 | 
            +
            rdoc_options: []
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            require_paths: 
         | 
| 92 | 
            +
            - lib
         | 
| 93 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         | 
| 94 | 
            +
              requirements: 
         | 
| 95 | 
            +
              - - ">="
         | 
| 96 | 
            +
                - !ruby/object:Gem::Version 
         | 
| 97 | 
            +
                  version: 1.8.7
         | 
| 98 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         | 
| 99 | 
            +
              requirements: 
         | 
| 100 | 
            +
              - *id002
         | 
| 101 | 
            +
            requirements: []
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            rubyforge_project: 
         | 
| 104 | 
            +
            rubygems_version: 2.0.3
         | 
| 105 | 
            +
            signing_key: 
         | 
| 106 | 
            +
            specification_version: 4
         | 
| 107 | 
            +
            summary: A simple web-scraping framework
         | 
| 108 | 
            +
            test_files: 
         | 
| 109 | 
            +
            - test/data/discussion.html
         | 
| 110 | 
            +
            - test/data/propublica.html
         | 
| 111 | 
            +
            - test/data/prosecutor.html
         | 
| 112 | 
            +
            - test/data/sixfacts.html
         | 
| 113 | 
            +
            - test/data/webinar.html
         | 
| 114 | 
            +
            - test/test_upton.rb
         |