upton 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,82 @@
1
+ require 'test/unit'
2
+ require 'rack'
3
+ require 'thin'
4
+ require 'nokogiri'
5
+ require 'restclient'
6
+ require 'upton'
7
+ require 'fileutils'
8
+
9
+ module Upton
10
+ module Test
11
+
12
+ # class ProPublicaScraper < Upton::Scraper
13
+ # def initialize(a, b, c)
14
+ # super
15
+ # @verbose = false
16
+ # @debug = false
17
+ # @stash_folder = "test_stashes"
18
+ # end
19
+ # end
20
+
21
+
22
# Integration test for Upton::Scraper. It boots a local Thin server
# (Upton::Test::Server) on port 9876 and scrapes the pages that server
# returns.
class UptonTest < ::Test::Unit::TestCase

  # def test_get_page
  #   TODO
  # end

  # def test_stash
  #   TODO
  # end

  # Scrapes the index page served at /propublica.html and checks that every
  # article body handed to the block carries the expected headline, in order.
  # This doesn't test stashing.
  def test_scrape
    start_test_server

    headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
                 "Discussion: Military Lending and Debt",
                 "A Prosecutor, a Wrongful Conviction and a Question of Justice",
                 "Six Facts Lost in the IRS Scandal"]

    propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
    propubscraper.verbose = false
    propubscraper.debug = false

    propubscraper.scrape do |article_str|
      doc = Nokogiri::HTML(article_str)
      hed = doc.css('h1.article-title').text
      # assert_equal takes (expected, actual); this order keeps failure
      # messages the right way round.
      assert_equal(headlines.shift, hed)
    end

    # If the scraper never yields, none of the assertions above run; make
    # sure every expected headline was actually consumed.
    assert(headlines.empty?, "articles never scraped: #{headlines.inspect}")
  ensure
    # Remove stashed pages even when an assertion above fails.
    # File.directory? replaces Dir.exists?, which newer Rubies no longer have.
    FileUtils.rm_r("test_stashes") if File.directory?("test_stashes")
  end

  private

  # Boots the fixture server on port 9876 in a background thread and keeps
  # the thread in @server_thread.
  def start_test_server
    @server_thread = Thread.new do
      Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
    end
    sleep(1) # wait a sec for the server to be booted
  end
end
61
+
62
+
63
+
64
+ # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
65
# Minimal Rack application that serves files from the directory containing
# this source file. Used as a local fixture server for the scraper test.
class Server
  # Handles one Rack request.
  #
  # env - the Rack environment Hash; only 'PATH_INFO' is read.
  #
  # Returns a Rack response triplet [status, headers, body]. The body is an
  # Array of Strings because Rack requires the body to respond to #each.
  # Regular files under the served directory are returned as 200 text/html;
  # everything else (missing files, directories, paths that resolve outside
  # the served directory) is answered with 404 text/plain.
  def call(env)
    @root ||= File.expand_path(File.dirname(__FILE__))

    request_path = Rack::Utils.unescape(env['PATH_INFO'])
    request_path += 'index.html' if request_path == '/'

    # Normalise the path so that ".." segments cannot walk out of @root.
    file = File.expand_path(@root + request_path)
    inside_root = file.start_with?(File.join(@root, ''))

    # File.file? (rather than the removed File.exists?) also rejects
    # directories, which File.read cannot open.
    if inside_root && File.file?(file)
      [200, {"Content-Type" => "text/html"}, [File.read(file)]]
    else
      [404, {'Content-Type' => 'text/plain'}, ['file not found']]
    end
  end
end
81
+ end
82
+ end
metadata ADDED
@@ -0,0 +1,114 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: upton
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jeremy B. Merrill
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2013-05-29 00:00:00 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rack
16
+ prerelease: false
17
+ requirement: &id001 !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - &id002
20
+ - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: "0"
23
+ type: :development
24
+ version_requirements: *id001
25
+ - !ruby/object:Gem::Dependency
26
+ name: thin
27
+ prerelease: false
28
+ requirement: &id003 !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - *id002
31
+ type: :development
32
+ version_requirements: *id003
33
+ - !ruby/object:Gem::Dependency
34
+ name: nokogiri
35
+ prerelease: false
36
+ requirement: &id004 !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - *id002
39
+ type: :development
40
+ version_requirements: *id004
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ prerelease: false
44
+ requirement: &id005 !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - *id002
47
+ type: :development
48
+ version_requirements: *id005
49
+ - !ruby/object:Gem::Dependency
50
+ name: rest-client
51
+ prerelease: false
52
+ requirement: &id006 !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ~>
55
+ - !ruby/object:Gem::Version
56
+ version: 1.6.7
57
+ type: :runtime
58
+ version_requirements: *id006
59
+ - !ruby/object:Gem::Dependency
60
+ name: nokogiri
61
+ prerelease: false
62
+ requirement: &id007 !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - *id002
65
+ type: :runtime
66
+ version_requirements: *id007
67
+ description: Don't re-write web scrapers every time. Upton gives you a scraper template that's easy to use for debugging and doesn't hammer servers by default.
68
+ email: jeremy.merrill@propublica.org
69
+ executables: []
70
+
71
+ extensions: []
72
+
73
+ extra_rdoc_files: []
74
+
75
+ files:
76
+ - lib/upton.rb
77
+ - test/data/discussion.html
78
+ - test/data/propublica.html
79
+ - test/data/prosecutor.html
80
+ - test/data/sixfacts.html
81
+ - test/data/webinar.html
82
+ - test/test_upton.rb
83
+ homepage: http://github.com/propublica/upton
84
+ licenses:
85
+ - MIT
86
+ metadata: {}
87
+
88
+ post_install_message:
89
+ rdoc_options: []
90
+
91
+ require_paths:
92
+ - lib
93
+ required_ruby_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - ">="
96
+ - !ruby/object:Gem::Version
97
+ version: 1.8.7
98
+ required_rubygems_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - *id002
101
+ requirements: []
102
+
103
+ rubyforge_project:
104
+ rubygems_version: 2.0.3
105
+ signing_key:
106
+ specification_version: 4
107
+ summary: A simple web-scraping framework
108
+ test_files:
109
+ - test/data/discussion.html
110
+ - test/data/propublica.html
111
+ - test/data/prosecutor.html
112
+ - test/data/sixfacts.html
113
+ - test/data/webinar.html
114
+ - test/test_upton.rb