statement 1.9.9 → 2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +11 -7
- data/lib/statement/scraper.rb +140 -88
- data/lib/statement/version.rb +1 -1
- data/scraper_guide.md +49 -0
- data/spec/butterfield_press.html +407 -0
- data/spec/drupal_press.html +524 -0
- data/spec/ed_perlmutter_press.html +5032 -0
- data/spec/keating_press.html +2211 -0
- data/spec/statement_spec.rb +62 -10
- metadata +11 -2
data/spec/statement_spec.rb
CHANGED
@@ -10,21 +10,21 @@ describe Statement do
|
|
10
10
|
@results = Feed.from_rss(@feed_url)
|
11
11
|
@results.first[:domain].must_equal "ruiz.house.gov"
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
it "parses House GOP press release page" do
|
15
15
|
@feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
|
16
16
|
WebMock.stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
|
17
17
|
@results = Scraper.house_gop(@feed_url)
|
18
18
|
@results.first[:source].must_equal @feed_url
|
19
19
|
end
|
20
|
-
|
20
|
+
|
21
21
|
it "does not attempt to parse dates when none are present" do
|
22
22
|
@feed_url = "http://culberson.house.gov/feed/rss/"
|
23
23
|
WebMock.stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "culberson_rss.xml")), :status => 200)
|
24
24
|
@results = Feed.from_rss(@feed_url)
|
25
25
|
@results.first[:date].must_equal nil
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
it "parses invalid RSS" do
|
29
29
|
@feed_url = "http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed"
|
30
30
|
WebMock.stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "richard_burr.xml")), :status => 200)
|
@@ -39,26 +39,78 @@ describe Statement do
|
|
39
39
|
@results = Scraper.house_gop(@feed_url)
|
40
40
|
@results.last[:url].must_equal "http://www.gop.gov/republicans/other/relative_url_test.html"
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
it "scrapes a senate cold fusion page" do
|
44
44
|
@url = "http://www.billnelson.senate.gov/news/media.cfm?year=2013"
|
45
|
-
WebMock.stub_request(:any, @url).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'bill_nelson_press.html')), :status => 200)
|
45
|
+
WebMock.stub_request(:any, @url).with(:headers => {'Accept'=>'*/*', 'User-Agent'=>'Ruby'}).to_return(:headers => {}, :body => File.new(File.join(File.dirname(__FILE__), 'bill_nelson_press.html')), :status => 200)
|
46
46
|
@results = Scraper.billnelson(year=2013)
|
47
47
|
@results.last[:url].must_equal "http://www.billnelson.senate.gov/news/details.cfm?id=338190&"
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
it "scrapes vitter pages for 2013" do
|
51
51
|
@vitter = "http://www.vitter.senate.gov/newsroom/press?year=2013"
|
52
52
|
WebMock.stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
|
53
53
|
@results = Scraper.vitter(year=2013)
|
54
54
|
@results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
it "only scrapes vitter page for 2012" do
|
58
58
|
@vitter = "http://www.vitter.senate.gov/newsroom/press?year=2012"
|
59
59
|
WebMock.stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
|
60
60
|
@results = Scraper.vitter(year=2012)
|
61
|
-
@results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
|
61
|
+
@results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
|
62
|
+
end
|
63
|
+
|
64
|
+
it "scrapes perlmutter's press page" do
|
65
|
+
@perlmutter_url = "http://perlmutter.house.gov/index.php/media-center/press-releases-86821"
|
66
|
+
@perlmutter_page = File.new(File.join(File.dirname(__FILE__), 'ed_perlmutter_press.html'))
|
67
|
+
WebMock.stub_request(:any, @perlmutter_url).to_return(:body => @perlmutter_page, :status => 200)
|
68
|
+
|
69
|
+
expected_result = {
|
70
|
+
:source => "http://perlmutter.house.gov/index.php/media-center/press-releases-86821",
|
71
|
+
:url => "http://perlmutter.house.gov/index.php/media-center/press-releases-86821/1505-polis-perlmutter-host-fed-reserve-chief-for-roundtable-with-cannabis-businesses",
|
72
|
+
:title => "Polis, Perlmutter Host Fed Reserve Chief for Roundtable with Cannabis Businesses",
|
73
|
+
:date => Date.parse("2015-04-10"),
|
74
|
+
:domain => "perlmutter.house.gov"
|
75
|
+
}
|
76
|
+
|
77
|
+
@results = Scraper.perlmutter
|
78
|
+
@results.first.must_equal expected_result
|
79
|
+
end
|
80
|
+
|
81
|
+
it "scrapes keating's press page" do
|
82
|
+
@keating_url = "http://keating.house.gov/index.php?option=com_content&view=category&id=14&Itemid=13"
|
83
|
+
@keating_page = File.new(File.join(File.dirname(__FILE__), 'keating_press.html'))
|
84
|
+
WebMock.stub_request(:any, @keating_url).to_return(:body => @keating_page, :status => 200)
|
85
|
+
|
86
|
+
expected_result = {
|
87
|
+
:source => "http://keating.house.gov/index.php?option=com_content&view=category&id=14&Itemid=13",
|
88
|
+
:url => "http://keating.house.gov/index.php?option=com_content&view=article&id=314:keating-announces-epa-grant-for-new-bedford&catid=14&Itemid=13",
|
89
|
+
:title => "Keating Announces EPA Grant for New Bedford",
|
90
|
+
:date => Date.parse("2015-03-13"),
|
91
|
+
:domain => "keating.house.gov"
|
92
|
+
}
|
93
|
+
|
94
|
+
@results = Scraper.keating
|
95
|
+
@results.first.must_equal expected_result
|
96
|
+
end
|
97
|
+
|
98
|
+
it "scrapes a drupal press page" do
|
99
|
+
@drupal_url = "http://walz.house.gov/media-center/press-releases"
|
100
|
+
@drupal_page = File.new(File.join(File.dirname(__FILE__), 'drupal_press.html'))
|
101
|
+
puts @drupal_page
|
102
|
+
WebMock.stub_request(:any, "#{@drupal_url}?page=0").to_return(:body => @drupal_page, :status => 200)
|
103
|
+
|
104
|
+
expected_result = {
|
105
|
+
:source => "http://walz.house.gov/media-center/press-releases?page=0",
|
106
|
+
:url => "http://walz.house.gov/media-center/press-releases/walz-calls-for-passage-of-the-paycheck-fairness-act-on-equal-pay-day-0",
|
107
|
+
:title => "Walz Calls for Passage of the Paycheck Fairness Act on Equal Pay Day",
|
108
|
+
:date => Date.parse("2015-04-14"),
|
109
|
+
:domain => "walz.house.gov"
|
110
|
+
}
|
111
|
+
|
112
|
+
@results = Scraper.drupal(urls=[@drupal_url])
|
113
|
+
@results.length.must_equal 10
|
114
|
+
@results.first.must_equal expected_result
|
62
115
|
end
|
63
|
-
|
64
|
-
end
|
116
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.9
|
4
|
+
version: '2.0'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -169,10 +169,15 @@ files:
|
|
169
169
|
- lib/statement/tweets.rb
|
170
170
|
- lib/statement/utils.rb
|
171
171
|
- lib/statement/version.rb
|
172
|
+
- scraper_guide.md
|
172
173
|
- spec/bill_nelson_press.html
|
174
|
+
- spec/butterfield_press.html
|
173
175
|
- spec/cowan_press.html
|
174
176
|
- spec/culberson_rss.xml
|
177
|
+
- spec/drupal_press.html
|
178
|
+
- spec/ed_perlmutter_press.html
|
175
179
|
- spec/house_gop_releases.html
|
180
|
+
- spec/keating_press.html
|
176
181
|
- spec/richard_burr.xml
|
177
182
|
- spec/ruiz_rss.xml
|
178
183
|
- spec/statement_spec.rb
|
@@ -204,9 +209,13 @@ specification_version: 4
|
|
204
209
|
summary: Given a url, Statement returns links to press releases and official statements.
|
205
210
|
test_files:
|
206
211
|
- spec/bill_nelson_press.html
|
212
|
+
- spec/butterfield_press.html
|
207
213
|
- spec/cowan_press.html
|
208
214
|
- spec/culberson_rss.xml
|
215
|
+
- spec/drupal_press.html
|
216
|
+
- spec/ed_perlmutter_press.html
|
209
217
|
- spec/house_gop_releases.html
|
218
|
+
- spec/keating_press.html
|
210
219
|
- spec/richard_burr.xml
|
211
220
|
- spec/ruiz_rss.xml
|
212
221
|
- spec/statement_spec.rb
|