upton 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
1
+ require 'test/unit'
2
+ require 'rack'
3
+ require 'thin'
4
+ require 'nokogiri'
5
+ require 'restclient'
6
+ require 'upton'
7
+ require 'fileutils'
8
+
9
+ module Upton
10
+ module Test
11
+
12
+ # class ProPublicaScraper < Upton::Scraper
13
+ # def initialize(a, b, c)
14
+ # super
15
+ # @verbose = false
16
+ # @debug = false
17
+ # @stash_folder = "test_stashes"
18
+ # end
19
+ # end
20
+
21
+
22
# Integration test for Upton::Scraper. It boots Upton::Test::Server on
# 127.0.0.1:9876 and checks the headline found in every string that
# Upton::Scraper#scrape yields.
class UptonTest < ::Test::Unit::TestCase

  # TODO: def test_get_page
  # TODO: def test_stash

  # Points a scraper at /propublica.html with a CSS selector and asserts that
  # each yielded string has the next expected headline in its
  # h1.article-title element. This doesn't test stashing.
  #
  # NOTE(review): nothing here checks how many times the block ran; if
  # #scrape never yields, the test passes without asserting anything.
  def test_scrape
    start_test_server

    headlines = [
      "Webinar: How to Use Prescriber Checkup to Power Your Reporting",
      "Discussion: Military Lending and Debt",
      "A Prosecutor, a Wrongful Conviction and a Question of Justice",
      "Six Facts Lost in the IRS Scandal"
    ]

    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
    propubscraper.verbose = false
    propubscraper.debug = false

    propubscraper.scrape do |article_str|
      doc = Nokogiri::HTML(article_str)
      hed = doc.css('h1.article-title').text
      # assert_equal takes (expected, actual); keeping that order makes the
      # failure message label the two values correctly.
      assert_equal(headlines.shift, hed)
    end
  ensure
    # Runs even when an assertion above fails, so a failed run does not leave
    # the folder behind. File.directory? is used because Dir.exists? is gone
    # in current Rubies.
    FileUtils.rm_r("test_stashes") if File.directory?("test_stashes")
  end

  private

  # Runs Upton::Test::Server under Thin on port 9876 in a background thread,
  # kept in @server_thread. The thread is never stopped or joined in this
  # class.
  def start_test_server
    @server_thread = Thread.new do
      Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
    end
    sleep(1) # wait a sec for the server to be booted
  end
end
61
+
62
+
63
+
64
+ # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
65
# Minimal Rack application that serves files from the directory containing
# this source file. Used as a local HTTP server for the scraper test.
#
# NOTE(review): the gemspec lists the HTML fixtures under test/data/, while
# this class serves from the directory of this file -- confirm that the
# requested paths (e.g. /propublica.html) actually resolve.
class Server
  # Rack entry point.
  #
  # @param env [Hash] Rack environment; only 'PATH_INFO' is read.
  # @return [Array] [status, headers, body]. 200 with the file contents
  #   labelled text/html when the path names a regular file under the root,
  #   404 with a plain-text message otherwise. The body is always an Array
  #   because Rack requires a body that responds to #each.
  def call(env)
    @root = File.expand_path(File.dirname(__FILE__))
    path = Rack::Utils.unescape(env['PATH_INFO'])
    path += 'index.html' if path == '/'

    # Normalise the path so ".." segments are resolved before the check below.
    file = File.expand_path(@root + path)
    prefix = @root.end_with?(File::SEPARATOR) ? @root : @root + File::SEPARATOR

    # Serve regular files only, and only from inside the root: directories
    # and paths escaping the root (e.g. "/../x") get a 404.
    if file.start_with?(prefix) && File.file?(file)
      [200, {"Content-Type" => "text/html"}, [File.read(file)]]
    else
      [404, {'Content-Type' => 'text/plain'}, ['file not found']]
    end
  end
end
81
+ end
82
+ end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: upton
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy B. Merrill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2013-05-29 00:00:00 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rack
16
+ prerelease: false
17
+ requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - &id002
20
+ - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ type: :development
24
+ version_requirements: *id001
25
+ - !ruby/object:Gem::Dependency
26
+ name: thin
27
+ prerelease: false
28
+ requirement: &id003 !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - *id002
31
+ type: :development
32
+ version_requirements: *id003
33
+ - !ruby/object:Gem::Dependency
34
+ name: nokogiri
35
+ prerelease: false
36
+ requirement: &id004 !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - *id002
39
+ type: :development
40
+ version_requirements: *id004
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ prerelease: false
44
+ requirement: &id005 !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - *id002
47
+ type: :development
48
+ version_requirements: *id005
49
+ - !ruby/object:Gem::Dependency
50
+ name: rest-client
51
+ prerelease: false
52
+ requirement: &id006 !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ~>
55
+ - !ruby/object:Gem::Version
56
+ version: 1.6.7
57
+ type: :runtime
58
+ version_requirements: *id006
59
+ - !ruby/object:Gem::Dependency
60
+ name: nokogiri
61
+ prerelease: false
62
+ requirement: &id007 !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - *id002
65
+ type: :runtime
66
+ version_requirements: *id007
67
+ description: Don't re-write web scrapers every time. Upton gives you a scraper template that's easy to use for debugging and doesn't hammer servers by default.
68
+ email: jeremy.merrill@propublica.org
69
+ executables: []
70
+
71
+ extensions: []
72
+
73
+ extra_rdoc_files: []
74
+
75
+ files:
76
+ - lib/upton.rb
77
+ - test/data/discussion.html
78
+ - test/data/propublica.html
79
+ - test/data/prosecutor.html
80
+ - test/data/sixfacts.html
81
+ - test/data/webinar.html
82
+ - test/test_upton.rb
83
+ homepage: http://github.com/propublica/upton
84
+ licenses:
85
+ - MIT
86
+ metadata: {}
87
+
88
+ post_install_message:
89
+ rdoc_options: []
90
+
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 1.8.7
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - *id002
101
+ requirements: []
102
+
103
+ rubyforge_project:
104
+ rubygems_version: 2.0.3
105
+ signing_key:
106
+ specification_version: 4
107
+ summary: A simple web-scraping framework
108
+ test_files:
109
+ - test/data/discussion.html
110
+ - test/data/propublica.html
111
+ - test/data/prosecutor.html
112
+ - test/data/sixfacts.html
113
+ - test/data/webinar.html
114
+ - test/test_upton.rb