upton 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/upton.rb +55 -9
- data/test/data/easttimor.html +833 -0
- data/test/test_upton.rb +67 -8
- metadata +111 -67
- checksums.yaml +0 -7
data/test/test_upton.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'test/unit'
|
2
4
|
require 'rack'
|
3
5
|
require 'thin'
|
4
6
|
require 'nokogiri'
|
5
7
|
require 'restclient'
|
6
|
-
require 'upton'
|
8
|
+
require './lib/upton'
|
7
9
|
require 'fileutils'
|
8
10
|
|
9
11
|
module Upton
|
@@ -34,22 +36,79 @@ module Upton
|
|
34
36
|
start_test_server()
|
35
37
|
|
36
38
|
headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
|
37
|
-
"
|
39
|
+
"",
|
38
40
|
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
39
41
|
"Six Facts Lost in the IRS Scandal"]
|
40
42
|
|
41
43
|
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
|
42
|
-
propubscraper.
|
43
|
-
propubscraper.
|
44
|
+
propubscraper.debug = true
|
45
|
+
propubscraper.verbose = true
|
44
46
|
|
45
|
-
propubscraper.scrape do |article_str|
|
47
|
+
heds = propubscraper.scrape do |article_str|
|
46
48
|
doc = Nokogiri::HTML(article_str)
|
47
49
|
hed = doc.css('h1.article-title').text
|
48
|
-
assert_equal(hed, headlines.shift)
|
49
50
|
end
|
51
|
+
assert_equal(heds, headlines)
|
50
52
|
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
51
53
|
end
|
52
54
|
|
55
|
+
def test_encodings
|
56
|
+
skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_stashing
|
60
|
+
skip "should test stashing, make sure we never send too many requests"
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_scrape_list
|
64
|
+
#this doesn't test stashing.
|
65
|
+
#TODO: needs a website that has links to a multi-page list (or table)
|
66
|
+
start_test_server()
|
67
|
+
|
68
|
+
most_commented_heds = [["Six Facts Lost in the IRS Scandal",
|
69
|
+
"How the IRS’s Nonprofit Division Got So Dysfunctional",
|
70
|
+
"Sound, Fury and the IRS Mess",
|
71
|
+
"The Most Important #Muckreads on Rape in the Military",
|
72
|
+
"Congressmen to Hagel: Where Are the Missing War Records?",
|
73
|
+
"As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
|
74
|
+
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
75
|
+
"A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
|
76
|
+
"The Story Behind Our Hospital Interactive",
|
77
|
+
"irs-test-charts-for-embedding"]]
|
78
|
+
|
79
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
|
80
|
+
propubscraper.debug = true
|
81
|
+
propubscraper.verbose = true
|
82
|
+
list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
|
83
|
+
|
84
|
+
assert_equal(list, most_commented_heds)
|
85
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_scrape_table
|
89
|
+
#this doesn't test stashing.
|
90
|
+
start_test_server()
|
91
|
+
|
92
|
+
east_timor_prime_ministers = [[
|
93
|
+
["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
|
94
|
+
"1", "2", "3", "4",],
|
95
|
+
[],
|
96
|
+
["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
|
97
|
+
["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
|
98
|
+
["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
|
99
|
+
["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
|
100
|
+
]]
|
101
|
+
|
102
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
|
103
|
+
propubscraper.debug = true
|
104
|
+
propubscraper.verbose = true
|
105
|
+
table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
|
106
|
+
assert_equal(table, east_timor_prime_ministers)
|
107
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
|
53
112
|
private
|
54
113
|
def start_test_server
|
55
114
|
@server_thread = Thread.new do
|
@@ -67,12 +126,12 @@ module Upton
|
|
67
126
|
@root = File.expand_path(File.dirname(__FILE__))
|
68
127
|
path = Rack::Utils.unescape(env['PATH_INFO'])
|
69
128
|
path += 'index.html' if path == '/'
|
70
|
-
file = @root
|
129
|
+
file = File.join(@root, "data", path)
|
71
130
|
|
72
131
|
params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
|
73
132
|
|
74
133
|
if File.exists?(file)
|
75
|
-
[ 200, {"Content-Type" => "text/html"}, File.read(file) ]
|
134
|
+
[ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
|
76
135
|
else
|
77
136
|
[ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
|
78
137
|
end
|
metadata
CHANGED
@@ -1,114 +1,158 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
prerelease:
|
5
6
|
platform: ruby
|
6
|
-
authors:
|
7
|
+
authors:
|
7
8
|
- Jeremy B. Merrill
|
8
9
|
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2013-06-25 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
15
|
name: rack
|
16
|
-
|
17
|
-
|
18
|
-
requirements:
|
19
|
-
-
|
20
|
-
-
|
21
|
-
|
22
|
-
version: "0"
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
23
22
|
type: :development
|
24
|
-
version_requirements: *id001
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: thin
|
27
23
|
prerelease: false
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: thin
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
31
38
|
type: :development
|
32
|
-
version_requirements: *id003
|
33
|
-
- !ruby/object:Gem::Dependency
|
34
|
-
name: nokogiri
|
35
39
|
prerelease: false
|
36
|
-
|
37
|
-
|
38
|
-
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: nokogiri
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
39
54
|
type: :development
|
40
|
-
version_requirements: *id004
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: yard
|
43
55
|
prerelease: false
|
44
|
-
|
45
|
-
|
46
|
-
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: yard
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
47
70
|
type: :development
|
48
|
-
version_requirements: *id005
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: rest-client
|
51
71
|
prerelease: false
|
52
|
-
|
53
|
-
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rest-client
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
54
83
|
- - ~>
|
55
|
-
- !ruby/object:Gem::Version
|
84
|
+
- !ruby/object:Gem::Version
|
56
85
|
version: 1.6.7
|
57
86
|
type: :runtime
|
58
|
-
version_requirements: *id006
|
59
|
-
- !ruby/object:Gem::Dependency
|
60
|
-
name: nokogiri
|
61
87
|
prerelease: false
|
62
|
-
|
63
|
-
|
64
|
-
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.6.7
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: nokogiri
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
65
102
|
type: :runtime
|
66
|
-
|
67
|
-
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
description: Don't re-write web scrapers every time. Upton gives you a scraper template
|
111
|
+
that's easy to use for debugging and doesn't hammer servers by default
|
68
112
|
email: jeremy.merrill@propublica.org
|
69
113
|
executables: []
|
70
|
-
|
71
114
|
extensions: []
|
72
|
-
|
73
115
|
extra_rdoc_files: []
|
74
|
-
|
75
|
-
files:
|
116
|
+
files:
|
76
117
|
- lib/upton.rb
|
77
118
|
- test/data/discussion.html
|
119
|
+
- test/data/easttimor.html
|
78
120
|
- test/data/propublica.html
|
79
121
|
- test/data/prosecutor.html
|
80
122
|
- test/data/sixfacts.html
|
81
123
|
- test/data/webinar.html
|
82
124
|
- test/test_upton.rb
|
83
125
|
homepage: http://github.org/propublica/upton
|
84
|
-
licenses:
|
126
|
+
licenses:
|
85
127
|
- MIT
|
86
|
-
metadata: {}
|
87
|
-
|
88
128
|
post_install_message:
|
89
129
|
rdoc_options: []
|
90
|
-
|
91
|
-
require_paths:
|
130
|
+
require_paths:
|
92
131
|
- lib
|
93
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
-
|
95
|
-
|
96
|
-
|
132
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
133
|
+
none: false
|
134
|
+
requirements:
|
135
|
+
- - ! '>='
|
136
|
+
- !ruby/object:Gem::Version
|
97
137
|
version: 1.8.7
|
98
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
-
|
100
|
-
|
138
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
139
|
+
none: false
|
140
|
+
requirements:
|
141
|
+
- - ! '>='
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
101
144
|
requirements: []
|
102
|
-
|
103
145
|
rubyforge_project:
|
104
|
-
rubygems_version:
|
146
|
+
rubygems_version: 1.8.23
|
105
147
|
signing_key:
|
106
|
-
specification_version:
|
148
|
+
specification_version: 3
|
107
149
|
summary: A simple web-scraping framework
|
108
|
-
test_files:
|
150
|
+
test_files:
|
109
151
|
- test/data/discussion.html
|
152
|
+
- test/data/easttimor.html
|
110
153
|
- test/data/propublica.html
|
111
154
|
- test/data/prosecutor.html
|
112
155
|
- test/data/sixfacts.html
|
113
156
|
- test/data/webinar.html
|
114
157
|
- test/test_upton.rb
|
158
|
+
has_rdoc: true
|
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA512:
|
3
|
-
data.tar.gz: 0c50b13aca2d3f11f8ebbef72420dfc3276169fb0c1eafadd437ae7d1a83f2d9467511ddc8e529321be677e26bcbd1af02c6c2c615d14e1ea09de0cc9b6d9762
|
4
|
-
metadata.gz: e795b2a9ffe25373b419ccded0db1a7786f7b4789dbf8fc7da28cd53764232ddcad30f398b569e78ae5404690d96661577ea84340b4cf4a4e40798dbd82779e0
|
5
|
-
SHA1:
|
6
|
-
data.tar.gz: 13e674057073d5cd0fc1dff400450717a0613bc4
|
7
|
-
metadata.gz: 1629ba3d63b260994bcebf99f1f5d79a499bb707
|