statement 0.8.2 → 0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/statement/feed.rb +43 -0
- data/lib/statement/scraper.rb +521 -0
- data/lib/statement/utils.rb +12 -0
- data/lib/statement/version.rb +1 -1
- data/lib/statement.rb +5 -547
- data/spec/statement_spec.rb +9 -10
- metadata +14 -11
@@ -0,0 +1,43 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'american_date'
|
5
|
+
require 'nokogiri'
|
6
|
+
include Statement
|
7
|
+
|
8
|
+
module Statement
  # Reads RSS feeds and turns their <item> entries into press-release hashes.
  class Feed

    # Downloads and parses the feed at +url+.
    #
    # Returns the parsed Nokogiri XML document, or nil when the URL cannot be
    # parsed, fetched or read (any StandardError means "feed unavailable").
    def self.open_rss(url)
      # URI#open (open-uri) only performs a remote fetch; Kernel#open would
      # also accept local paths and "|command" strings. The block form closes
      # the handle once the document has been parsed.
      URI.parse(url).open { |io| Nokogiri::XML(io) }
    rescue
      nil
    end

    # Extracts the publication date of one feed item.
    #
    # link - a feed item node; its 'pubDate' child is preferred and the
    #        lowercase 'pubdate' child is the fallback.
    #
    # Returns a Date, or nil when neither child has text or the text is not a
    # parseable date.
    def self.date_from_rss_item(link)
      text = %w[pubDate pubdate].map { |tag| link.xpath(tag).text.strip }.find { |t| !t.empty? }
      return nil unless text
      Date.parse(text)
    rescue ArgumentError
      nil
    end

    # Builds one hash per <item> of the feed at +url+ with the keys
    # :source, :url, :title, :date and :domain.
    #
    # Returns the list filtered through Utils.remove_generic_urls!, or nil
    # when the feed could not be opened.
    def self.from_rss(url)
      doc = open_rss(url)
      return unless doc
      host = URI.parse(url).host
      results = doc.xpath('//item').map do |item|
        href = item.xpath('link').text
        abs_link = Utils.absolute_link(url, href)
        # Feed-specific overrides: the Burr feed's links are joined onto the
        # site's /public/ path, and the first 37 characters of each Johanns
        # link are dropped.
        abs_link = "http://www.burr.senate.gov/public/" + href if url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
        abs_link = href[37..-1] if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
        { :source => url, :url => abs_link, :title => item.xpath('title').text, :date => date_from_rss_item(item), :domain => host }
      end
      Utils.remove_generic_urls!(results)
    end

  end
end
|
@@ -0,0 +1,521 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'uri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'american_date'
|
5
|
+
require 'nokogiri'
|
6
|
+
|
7
|
+
module Statement
|
8
|
+
class Scraper
|
9
|
+
|
10
|
+
# Downloads +url+ and parses the response body as HTML.
#
# Returns the parsed Nokogiri HTML document, or nil when the URL cannot be
# parsed, fetched or read (any StandardError means "page unavailable").
def self.open_html(url)
  # URI#open (open-uri) only performs a remote fetch; Kernel#open would also
  # accept local paths and "|command" strings. The block form closes the
  # handle as soon as the body has been read.
  URI.parse(url).open { |io| Nokogiri::HTML(io.read) }
rescue
  nil
end
|
17
|
+
|
18
|
+
# Collects the member-news links from a House GOP page.
#
# url - page address; the value after the last '=' of its query string is
#       parsed as the date applied to every link.
#
# Returns the link hashes filtered through Utils.remove_generic_urls!, or nil
# when the page could not be opened.
def self.house_gop(url)
  page = open_html(url)
  return unless page
  published = Date.parse(URI.parse(url).query.split('=').last)
  anchors = page.xpath("//ul[@id='membernews']").search('a')
  entries = anchors.map do |anchor|
    {
      :source => url,
      :url    => Utils.absolute_link(url, anchor["href"]),
      :title  => anchor.text.strip,
      :date   => published,
      :domain => URI.parse(anchor["href"]).host
    }
  end
  Utils.remove_generic_urls!(entries)
end
|
30
|
+
|
31
|
+
# Names, as symbols, of the scraper methods for individual members.
def self.member_methods
  %w[
    capuano cold_fusion conaway susandavis faleomavaega freshman_senators
    klobuchar lujan billnelson lautenberg crapo coburn boxer mccain
    vitter_cowan donnelly inhofe levin reid palazzo document_query
  ].map(&:to_sym)
end
|
34
|
+
|
35
|
+
# Names, as symbols, of the scraper methods for committees.
def self.committee_methods
  %w[
    senate_approps_majority senate_approps_minority senate_banking
    senate_hsag_majority senate_hsag_minority senate_indian senate_aging
    senate_smallbiz_minority senate_intel house_energy_minority
    house_homeland_security_minority house_judiciary_majority
    house_rules_majority house_ways_means_majority
  ].map(&:to_sym)
end
|
38
|
+
|
39
|
+
# Runs the member scrapers for the current year and merges their output.
#
# Individual scrapers return nil when their page cannot be opened; those nils
# are dropped here so one unreachable site does not break the whole run.
#
# Returns the combined release hashes filtered through
# Utils.remove_generic_urls!.
def self.member_scrapers
  year = Date.today.year
  results = [
    freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis,
    faleomavaega, klobuchar, lujan, palazzo(1), billnelson(year),
    document_query(1), document_query(2), donnelly(year), lautenberg, crapo,
    coburn, boxer(1), mccain(year), vitter_cowan(year), inhofe(year), reid
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
|
46
|
+
|
47
|
+
# Runs the member scrapers against earlier years and result pages to pick up
# older releases.
#
# Scrapers that return nil (page could not be opened) are dropped so the
# remaining results are still returned.
#
# Returns the combined release hashes filtered through
# Utils.remove_generic_urls!.
def self.backfill_from_scrapers
  results = [
    cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0),
    billnelson(2012), document_query(3), document_query(4),
    coburn(2012), coburn(2011), coburn(2010),
    boxer(11), boxer(21), boxer(31), boxer(41),
    mccain(2012), mccain(2011), vitter_cowan(2012), vitter_cowan(2011)
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
|
54
|
+
|
55
|
+
# Runs the committee scrapers and merges their output.
#
# Year-based scrapers receive the current year; senate_intel is called with
# the fixed arguments 113, 2013 and 2014. Scrapers that return nil (page
# could not be opened) are dropped so the remaining results are returned.
#
# Returns the combined release hashes filtered through
# Utils.remove_generic_urls!.
def self.committee_scrapers
  year = Date.today.year
  results = [
    senate_approps_majority, senate_approps_minority, senate_banking(year),
    senate_hsag_majority(year), senate_hsag_minority(year), senate_indian,
    senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014),
    house_energy_minority, house_homeland_security_minority,
    house_judiciary_majority, house_rules_majority, house_ways_means_majority
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
|
62
|
+
|
63
|
+
## special cases for committees without RSS feeds
|
64
|
+
|
65
|
+
# Scrapes the Senate Appropriations news page (news.cfm) and tags every
# release with party 'majority'.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.senate_approps_majority
  page_url = "http://www.appropriations.senate.gov/news.cfm"
  page = open_html(page_url)
  return if page.nil?
  # The releases for a date heading sit in the node two siblings after it.
  page.xpath("//div[@class='newsDateUnderlined']").flat_map do |heading|
    entries = heading.next.next.children.reject { |node| node.text.strip.empty? }
    entries.map do |entry|
      {
        :source => page_url,
        :url    => page_url + entry.children[0]['href'],
        :title  => entry.text,
        :date   => Date.parse(heading.text),
        :domain => "http://www.appropriations.senate.gov/",
        :party  => 'majority'
      }
    end
  end
end
|
77
|
+
|
78
|
+
# Scrapes the Senate Appropriations Republican page (republican.cfm) and tags
# every release with party 'minority'.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.senate_approps_minority
  page_url = "http://www.appropriations.senate.gov/republican.cfm"
  page = open_html(page_url)
  return if page.nil?
  # The releases for a date heading sit in the node two siblings after it.
  page.xpath("//div[@class='newsDateUnderlined']").flat_map do |heading|
    entries = heading.next.next.children.reject { |node| node.text.strip.empty? }
    entries.map do |entry|
      {
        :source => page_url,
        :url    => page_url + entry.children[0]['href'],
        :title  => entry.text,
        :date   => Date.parse(heading.text),
        :domain => "http://www.appropriations.senate.gov/",
        :party  => 'minority'
      }
    end
  end
end
|
90
|
+
|
91
|
+
# Scrapes the Senate Banking press-release listing for +year+.
#
# The row's first cell is combined with ", <year>" before parsing, so the
# listing presumably shows dates without a year.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.senate_banking(year=Date.today.year)
  listing = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr").map do |tr|
    {
      :source => listing,
      :url    => "http://www.banking.senate.gov/public/" + tr.children[2].children[1]['href'],
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text.strip+", #{year}"),
      :domain => "http://www.banking.senate.gov/",
      :party  => 'majority'
    }
  end
end
|
101
|
+
|
102
|
+
# Scrapes the Senate Homeland Security & Governmental Affairs majority media
# listing for +year+.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.senate_hsag_majority(year=Date.today.year)
  listing = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr").each_with_object([]) do |tr, found|
    # Rows with fewer than 30 characters of text are skipped.
    next if tr.text.strip.size < 30
    found << {
      :source => listing,
      :url    => tr.children[2].children[0]['href'].strip,
      :title  => tr.children[2].children[0].text,
      :date   => Date.parse(tr.children[0].text),
      :domain => "http://www.hsgac.senate.gov/",
      :party  => 'majority'
    }
  end
end
|
113
|
+
|
114
|
+
# Scrapes the Senate Homeland Security & Governmental Affairs minority media
# listing for +year+.
#
# Returns an Array of release hashes (party 'minority'), or nil when the page
# could not be opened.
def self.senate_hsag_minority(year=Date.today.year)
  listing = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr").each_with_object([]) do |tr, found|
    # Rows with fewer than 30 characters of text are skipped.
    next if tr.text.strip.size < 30
    found << {
      :source => listing,
      :url    => tr.children[2].children[0]['href'].strip,
      :title  => tr.children[2].children[0].text,
      :date   => Date.parse(tr.children[0].text),
      :domain => "http://www.hsgac.senate.gov/",
      :party  => 'minority'
    }
  end
end
|
125
|
+
|
126
|
+
# Scrapes the Senate Indian Affairs news index.
#
# Each <h3> supplies the link and title; the date is read from the node two
# siblings before it.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.senate_indian
  listing = "http://www.indian.senate.gov/news/index.cfm"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//h3").map do |heading|
    anchor = heading.children[0]
    {
      :source => listing,
      :url    => "http://www.indian.senate.gov"+anchor['href'],
      :title  => anchor.text,
      :date   => Date.parse(heading.previous.previous.text),
      :domain => "http://www.indian.senate.gov/",
      :party  => 'majority'
    }
  end
end
|
136
|
+
|
137
|
+
# Scrapes the Senate Aging press room listing (requested with maxrows=100).
#
# Only table rows 6 through 104 of the page are read.
#
# Returns an Array of release hashes (no :party key), or nil when the page
# could not be opened.
def self.senate_aging
  listing = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr")[6..104].map do |tr|
    {
      :source => listing,
      :url    => "http://www.aging.senate.gov/"+tr.children[2].children[0]['href'],
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text),
      :domain => "http://www.aging.senate.gov/"
    }
  end
end
|
147
|
+
|
148
|
+
# Scrapes the Senate Small Business Republican press room.
#
# One hash is produced per <ul class="recordList">, read from that list's
# first child.
#
# Returns an Array of release hashes (party 'minority'), or nil when the page
# could not be opened.
def self.senate_smallbiz_minority
  listing = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//ul[@class='recordList']").map do |list|
    {
      :source => listing,
      :url    => list.children[0].children[2].children[0]['href'],
      :title  => list.children[0].children[2].children[0].text,
      :date   => Date.parse(list.children[0].children[0].text),
      :domain => "http://www.sbc.senate.gov/",
      :party  => 'minority'
    }
  end
end
|
158
|
+
|
159
|
+
# Scrapes the Senate Intelligence press-release listing.
#
# congress, start_year, end_year - interpolated into the listing's
# congress, y1 and y2 query parameters.
#
# The first seven top-aligned table rows are skipped.
#
# Returns an Array of release hashes (no :party key), or nil when the page
# could not be opened.
def self.senate_intel(congress=113, start_year=2013, end_year=2014)
  listing = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr[@valign='top']")[7..-1].map do |tr|
    {
      :source => listing,
      :url    => "http://www.intelligence.senate.gov/press/"+tr.children[2].children[0]['href'],
      :title  => tr.children[2].children[0].text.strip,
      :date   => Date.parse(tr.children[0].text),
      :domain => "http://www.intelligence.senate.gov/"
    }
  end
end
|
169
|
+
|
170
|
+
# Scrapes the House Energy & Commerce Democrats news-release listing.
#
# The date is read from the node two siblings after each title block.
#
# Returns an Array of release hashes (party 'minority'), or nil when the page
# could not be opened.
def self.house_energy_minority
  listing = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//div[@class='views-field-title']").map do |block|
    {
      :source => listing,
      :url    => "http://democrats.energycommerce.house.gov"+block.children[1].children[0]['href'],
      :title  => block.children[1].children[0].text,
      :date   => Date.parse(block.next.next.text.strip),
      :domain => "http://energycommerce.house.gov/",
      :party  => 'minority'
    }
  end
end
|
180
|
+
|
181
|
+
# Scrapes the House Homeland Security Democrats press listing.
#
# Returns an Array of release hashes (party 'minority'), or nil when the page
# could not be opened.
def self.house_homeland_security_minority
  listing = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//li[@class='article']").map do |item|
    {
      :source => listing,
      :url    => "http://chsdemocrats.house.gov"+item.children[1]['href'],
      :title  => item.children[1].text.strip,
      :date   => Date.parse(item.children[3].text),
      :domain => "http://chsdemocrats.house.gov/",
      :party  => 'minority'
    }
  end
end
|
191
|
+
|
192
|
+
# Scrapes the House Judiciary 2013 press page (press2013.html).
#
# Only paragraphs 3 through 60 are read, and paragraphs with fewer than 30
# characters of text are skipped.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.house_judiciary_majority
  listing = "http://judiciary.house.gov/news/press2013.html"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//p")[3..60].each_with_object([]) do |para, found|
    next if para.text.size < 30
    found << {
      :source => listing,
      :url    => para.children[5]['href'],
      :title  => para.children[0].text,
      :date   => Date.parse(para.children[1].text.strip),
      :domain => "http://judiciary.house.gov/",
      :party  => 'majority'
    }
  end
end
|
203
|
+
|
204
|
+
# Scrapes the House Rules news listing.
#
# The first and last table rows are excluded, as are rows with fewer than 30
# characters of text.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.house_rules_majority
  listing = "http://www.rules.house.gov/News/Default.aspx"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr")[1..-2].each_with_object([]) do |tr, found|
    next if tr.text.strip.size < 30
    found << {
      :source => listing,
      :url    => "http://www.rules.house.gov/News/"+tr.children[0].children[1].children[0]['href'],
      :title  => tr.children[0].children[1].children[0].text,
      :date   => Date.parse(tr.children[2].children[1].text.strip),
      :domain => "http://www.rules.house.gov/",
      :party  => 'majority'
    }
  end
end
|
215
|
+
|
216
|
+
# Scrapes the House Ways & Means document-query listing (DocumentTypeID 1496).
#
# Children of the news list with fewer than 10 characters of text are skipped.
#
# Returns an Array of release hashes (party 'majority'), or nil when the page
# could not be opened.
def self.house_ways_means_majority
  listing = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//ul[@class='UnorderedNewsList']").children.each_with_object([]) do |item, found|
    next if item.text.strip.size < 10
    found << {
      :source => listing,
      :url    => "http://waysandmeans.house.gov"+item.children[1].children[1]['href'],
      :title  => item.children[1].children[1].text,
      :date   => Date.parse(item.children[3].children[0].text.strip),
      :domain => "http://waysandmeans.house.gov/",
      :party  => 'majority'
    }
  end
end
|
227
|
+
|
228
|
+
## special cases for members without RSS feeds
|
229
|
+
|
230
|
+
# Scrapes Rep. Capuano's by-date news index.
#
# Every link whose href contains '/pr' becomes a release. The date is parsed
# from the link text (nil when it does not parse) and the title is the link
# text after its first space-separated token. The last four matches are
# dropped from the result.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.capuano
  site = "http://www.house.gov/capuano/news/"
  index_url = site + 'date.shtml'
  page = open_html(index_url)
  return if page.nil?
  anchors = page.xpath("//a").select { |a| a['href'] && a['href'].include?('/pr') }
  releases = anchors.map do |a|
    published = begin
      Date.parse(a.text)
    rescue
      nil
    end
    {
      :source => index_url,
      :url    => site + a['href'],
      :title  => a.text.split(' ',2).last,
      :date   => published,
      :domain => "www.house.gov/capuano/"
    }
  end
  releases[0..-5]
end
|
248
|
+
|
249
|
+
# Scrapes the press-release tables of several member sites that share the
# same "index.cfm/...?YearDisplay=&MonthDisplay=" listing layout.
#
# year  - year to request (nil falls back to the current year).
# month - month to request (nil falls back to 0).
#
# A site whose page cannot be opened, or that has no data rows, is skipped so
# results from the other sites are still returned. Rows whose first non-empty
# cell is missing, is the 'Date' header, or is longer than 8 characters are
# ignored.
#
# Returns an Array of release hashes.
def self.cold_fusion(year=Date.today.year, month=0)
  results = []
  year ||= Date.today.year
  month ||= 0
  # Listing path segment used by each site.
  listings = {
    'crenshaw.house.gov/'               => 'pressreleases',
    'www.ronjohnson.senate.gov/public/' => 'press-releases',
    'www.lee.senate.gov/public/'        => 'press-releases',
    'www.hoeven.senate.gov/public/'     => 'news-releases',
    'www.moran.senate.gov/public/'      => 'news-releases',
    'www.risch.senate.gov/public/'      => 'pressreleases'
  }
  listings.each do |domain, slug|
    url = "http://" + domain + "index.cfm/#{slug}?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
    doc = open_html(url)
    next if doc.nil?
    rows = doc.xpath("//tr")[2..-1]
    next if rows.nil?
    rows.each do |row|
      date_text, title = row.children.map { |c| c.text.strip }.reject { |c| c.empty? }
      next if date_text.nil? || date_text == 'Date' || date_text.size > 8
      results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => Date.parse(date_text), :domain => domain }
    end
  end
  results
end
|
273
|
+
|
274
|
+
# Scrapes one page of Rep. Conaway's document-query listing
# (DocumentTypeID 1279).
#
# page - value of the listing's Page parameter.
#
# The first table row is skipped.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.conaway(page=1)
  site = "http://conaway.house.gov/news/"
  listing = site + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  html = open_html(listing)
  return if html.nil?
  html.xpath("//tr")[1..-1].map do |tr|
    {
      :source => listing,
      :url    => site + tr.children.children[1]['href'],
      :title  => tr.children.children[1].text.strip,
      :date   => Date.parse(tr.children.children[4].text),
      :domain => "conaway.house.gov"
    }
  end
end
|
285
|
+
|
286
|
+
# Scrapes Rep. Susan Davis's news page.
#
# Releases are read from the children of the seventh <ul> on the page;
# children with no text are skipped.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.susandavis
  site = "http://www.house.gov/susandavis/"
  page = open_html(site+'news.shtml')
  return if page.nil?
  page.search("ul")[6].children.each_with_object([]) do |item, found|
    next if item.text.strip == ''
    found << {
      :source => site+'news.shtml',
      :url    => site + item.children[1]['href'],
      :title  => item.children[1].text.split.join(' '),
      :date   => Date.parse(item.children.first.text),
      :domain => "house.gov/susandavis"
    }
  end
end
|
297
|
+
|
298
|
+
# Scrapes Del. Faleomavaega's press page.
#
# Titles have curly double quotes (U+201C / U+201D) removed, and only the
# text after the last 'Washington, D.C.' is kept.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.faleomavaega
  results = []
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  doc = open_html(base_url)
  return if doc.nil?
  doc.xpath("//li[@type='disc']").each do |row|
    # The character class must use \u escapes; written as plain "u201c" it
    # would delete the letters u, c, d and the digits 2, 0, 1 from titles.
    title = row.children[0].text.gsub(/[\u201c\u201d]/, '').split('Washington, D.C.').last
    results << { :source => base_url, :url => "http://www.house.gov/" + row.children[0]['href'], :title => title, :date => Date.parse(row.children[1].text), :domain => "house.gov/faleomavaega" }
  end
  results
end
|
308
|
+
|
309
|
+
# Scrapes the press.cfm listing of each senator in a fixed list of freshman
# senators.
#
# A senator whose page cannot be opened, or that has no rows past the first
# three, is skipped so results from the other senators are still returned.
# Rows with no text are ignored.
#
# Returns an Array of release hashes.
def self.freshman_senators
  results = []
  ['baldwin', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz'].each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    source = base_url+'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = open_html(source)
    next if doc.nil?
    rows = doc.xpath("//tr")[3..-1]
    next if rows.nil?
    rows.each do |row|
      next if row.text.strip == ''
      cells = row.children.children
      results << { :source => source, :url => base_url + cells[1]['href'], :title => cells[1].text.strip.split.join(' '), :date => Date.parse(cells[0].text), :domain => "#{senator}.senate.gov" }
    end
  end
  results
end
|
322
|
+
|
323
|
+
# Scrapes Sen. Klobuchar's news-release listings for 2012 and 2013.
#
# Each <dt> holds the date; its next sibling holds the link and title.
# Titles have curly double quotes (U+201C / U+201D) removed and whitespace
# collapsed. A year whose page cannot be opened is skipped so the other
# year's results are still returned.
#
# Returns an Array of release hashes.
def self.klobuchar
  results = []
  base_url = "http://www.klobuchar.senate.gov/"
  [2012,2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = open_html(year_url)
    next if doc.nil?
    doc.xpath("//dt").each do |row|
      # The character class must use \u escapes; written as plain "u201c" it
      # would delete the letters u, c, d and the digits 2, 0, 1 from titles.
      title = row.next.text.strip.gsub(/[\u201c\u201d]/, '').split.join(' ')
      results << { :source => year_url, :url => base_url + row.next.children[0]['href'], :title => title, :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
    end
  end
  results
end
|
336
|
+
|
337
|
+
# Scrapes Rep. Lujan's press-release article page.
#
# Releases are read from the children of the second <ul> on the page;
# children with no text are skipped. No date is extracted, so :date is nil.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.lujan
  site = 'http://lujan.house.gov/'
  page = open_html(site+'index.php?option=com_content&view=article&id=981&Itemid=78')
  return if page.nil?
  page.xpath('//ul')[1].children.each_with_object([]) do |item, found|
    next if item.text.strip == ''
    found << {
      :source => site+'index.php?option=com_content&view=article&id=981&Itemid=78',
      :url    => site + item.children[0]['href'],
      :title  => item.children[0].text,
      :date   => nil,
      :domain => "lujan.house.gov"
    }
  end
end
|
348
|
+
|
349
|
+
# Scrapes Sen. Bill Nelson's media listing for +year+.
#
# year - value of the listing's year parameter. The default is the fixed
#        value 2013, not the current year.
#
# Every <li> on the page becomes a release; the date is read from the item's
# last child.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.billnelson(year=2013)
  site = "http://www.billnelson.senate.gov/news/"
  listing = site + "media.cfm?year=#{year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath('//li').map do |item|
    {
      :source => listing,
      :url    => site + item.children[0]['href'],
      :title  => item.children[0].text.strip,
      :date   => Date.parse(item.children.last.text),
      :domain => "billnelson.senate.gov"
    }
  end
end
|
360
|
+
|
361
|
+
# Scrapes Sen. Lautenberg's newsroom release listing.
#
# rows - value of the listing's maxrows parameter (1000 by default), i.e.
#        how many of the latest releases to request.
#
# The first four table rows and the last one are skipped.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.lautenberg(rows=1000)
  site = 'http://www.lautenberg.senate.gov/newsroom/'
  listing = site + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr")[4..-2].map do |tr|
    {
      :source => listing,
      :url    => site + tr.children[2].children[0]['href'],
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text.strip),
      :domain => "lautenberg.senate.gov"
    }
  end
end
|
373
|
+
|
374
|
+
# Scrapes Sen. Crapo's full news-release listing (release_all.cfm).
#
# Dashes in the date cell are replaced with slashes before parsing.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.crapo
  site = "http://www.crapo.senate.gov/media/newsreleases/"
  listing = site + "release_all.cfm"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr").map do |tr|
    {
      :source => listing,
      :url    => site + tr.children[2].children[0]['href'],
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text.strip.gsub('-','/')),
      :domain => "crapo.senate.gov"
    }
  end
end
|
385
|
+
|
386
|
+
# Scrapes Sen. Coburn's press-release listing for +year+.
#
# The first two table rows are skipped, as is any row whose text starts with
# "Date".
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.coburn(year=Date.today.year)
  listing = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr")[2..-1].each_with_object([]) do |tr, found|
    next if tr.text[0..3] == "Date"
    found << {
      :source => listing,
      :url    => tr.children[2].children[0]['href'],
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text.strip),
      :domain => "coburn.senate.gov"
    }
  end
end
|
397
|
+
|
398
|
+
# Scrapes one page of Sen. Boxer's press-release listing.
#
# start - value of the listing's start parameter.
#
# Each <div class="left"> after the first holds the date; the link and title
# are read from the node two siblings after it.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.boxer(start=1)
  results = []
  url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
  domain = 'www.boxer.senate.gov'
  doc = open_html(url)
  return if doc.nil?
  doc.xpath("//div[@class='left']")[1..-1].each do |row|
    link = row.next.next.children[1].children[0]
    # The scheme is prepended so :url is an absolute URL like the ones the
    # other scrapers emit.
    # NOTE(review): assumes the href is site-relative and starts with "/" - confirm.
    results << { :source => url, :url => "http://" + domain + link['href'], :title => link.text, :date => Date.parse(row.text.strip), :domain => domain }
  end
  results
end
|
409
|
+
|
410
|
+
# Scrapes Sen. McCain's press-release listing for +year+.
#
# The first seven <li> elements on the page are skipped.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.mccain(year=Date.today.year)
  results = []
  url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
  domain = 'www.mccain.senate.gov'
  doc = open_html(url)
  return if doc.nil?
  doc.xpath("//li")[7..-1].each do |row|
    entry = row.children[3].children[1]
    # The scheme is prepended so :url is an absolute URL like the ones the
    # other scrapers emit.
    # NOTE(review): assumes the href is site-relative and starts with "/" - confirm.
    results << { :source => url, :url => "http://" + domain + entry.children[4].children[0]['href'], :title => entry.children[4].text, :date => Date.parse(entry.children[0].text), :domain => domain }
  end
  results
end
|
421
|
+
|
422
|
+
# Scrapes the "press?year=" listings of Sen. Vitter and Sen. Cowan.
#
# year - year to request; the Cowan site is skipped for years before 2013.
#
# A site whose page cannot be opened, or that has no rows past the first, is
# skipped so the other site's results are still returned. Rows with fewer
# than 30 characters of text are ignored. :source is the site's base URL,
# not the year-specific page that was fetched.
#
# Returns an Array of release hashes.
def self.vitter_cowan(year=Date.today.year)
  results = []
  cowan_url = "http://www.cowan.senate.gov/"
  sites = [
    ["http://www.vitter.senate.gov/newsroom/", "www.vitter.senate.gov"],
    [cowan_url, "www.cowan.senate.gov"]
  ]
  sites.each do |url, domain|
    next if year < 2013 && url == cowan_url
    doc = open_html(url+"press?year=#{year}")
    next if doc.nil?
    rows = doc.xpath("//tr")[1..-1]
    next if rows.nil?
    rows.each do |row|
      next if row.text.strip.size < 30
      results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain }
    end
  end
  results
end
|
441
|
+
|
442
|
+
# Scrapes Sen. Donnelly's "press?year=" listing for +year+.
#
# The first table row is skipped, as are rows with fewer than 30 characters
# of text. :source is the newsroom base URL, not the year-specific page.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.donnelly(year=Date.today.year)
  newsroom = "http://www.donnelly.senate.gov/newsroom/"
  host = "www.donnelly.senate.gov"
  page = open_html(newsroom+"press?year=#{year}")
  return if page.nil?
  page.xpath("//tr")[1..-1].each_with_object([]) do |tr, found|
    next if tr.text.strip.size < 30
    found << {
      :source => newsroom,
      :url    => "http://www.donnelly.senate.gov"+tr.children[2].children[1]['href'].strip,
      :title  => tr.children[2].text.strip,
      :date   => Date.parse(tr.children[0].text),
      :domain => host
    }
  end
end
|
454
|
+
|
455
|
+
# Scrapes Sen. Inhofe's press-release listing for +year+.
#
# The first table row is skipped, as are rows with fewer than 30 characters
# of text.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.inhofe(year=Date.today.year)
  listing = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
  host = "www.inhofe.senate.gov"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//tr")[1..-1].each_with_object([]) do |tr, found|
    next if tr.text.strip.size < 30
    found << {
      :source => listing,
      :url    => tr.children[2].children[0]['href'].strip,
      :title  => tr.children[2].text,
      :date   => Date.parse(tr.children[0].text),
      :domain => host
    }
  end
end
|
467
|
+
|
468
|
+
# Scrapes one page of Sen. Levin's newsroom press listing.
#
# page - value of the listing's PageNum_rs parameter.
#
# All whitespace is removed from each link's href.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.levin(page=1)
  results = []
  # The second query parameter is "section=press"; written as "§ion" (an
  # HTML-entity-decoded "&sect") the request would lose that parameter.
  url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
  domain = "www.levin.senate.gov"
  doc = open_html(url)
  return if doc.nil?
  doc.xpath('//tr').each do |row|
    link = row.children[2].children[0]
    results << { :source => url, :url => link['href'].gsub(/\s+/, ""), :title => link.text, :date => Date.parse(row.children[0].text), :domain => domain }
  end
  results
end
|
479
|
+
|
480
|
+
# Scrapes Sen. Reid's press-release listing.
#
# Rows are taken from the table with id 'CS_PgIndex_21891_21893'; the first
# row is skipped.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.reid
  listing = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
  host = "www.reid.senate.gov"
  page = open_html(listing)
  return if page.nil?
  page.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].map do |tr|
    {
      :source => listing,
      :url    => "http://www.reid.senate.gov"+tr.children[0].children[0]['href'],
      :title  => tr.children[0].children[0].text,
      :date   => Date.parse(tr.children[0].children[2].text),
      :domain => host
    }
  end
end
|
491
|
+
|
492
|
+
# Scrapes one page of Rep. Palazzo's document-query listing
# (DocumentTypeID 2519).
#
# page - value of the listing's Page parameter.
#
# Returns an Array of release hashes, or nil when the page could not be opened.
def self.palazzo(page=1)
  host = "palazzo.house.gov"
  listing = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
  html = open_html(listing)
  return if html.nil?
  html.xpath("//div[@class='middlecopy']//li").map do |item|
    {
      :source => listing,
      :url    => "http://palazzo.house.gov/news/" + item.children[1]['href'],
      :title  => item.children[1].text.strip,
      :date   => Date.parse(item.children[3].text.strip),
      :domain => host
    }
  end
end
|
503
|
+
|
504
|
+
# Scrapes one page of the document-query listing of several House member
# sites that share the same layout.
#
# page - value of the listing's Page parameter.
#
# A site whose page cannot be opened is skipped so results from the other
# sites are still returned.
#
# Returns an Array of release hashes.
def self.document_query(page=1)
  results = []
  # Host name => DocumentTypeID of that site's press-release listing.
  sites = { "roe.house.gov" => 1532, "thornberry.house.gov" => 1776, "wenstrup.house.gov" => 2491 }
  sites.each do |host, type_id|
    source = "http://"+host+"/news/documentquery.aspx?DocumentTypeID=#{type_id}&Page=#{page}"
    doc = open_html(source)
    next if doc.nil?
    doc.xpath("//span[@class='middlecopy']").each do |row|
      results << { :source => source, :url => "http://"+host+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => host }
    end
  end
  results
end
|
516
|
+
|
517
|
+
|
518
|
+
|
519
|
+
|
520
|
+
end
|
521
|
+
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
3
|
+
# Helpers shared by the feed reader and the scrapers.
module Utils
  # Paths treated as a generic "news index" link rather than a release.
  GENERIC_PATHS = ['/news/', '/news'].freeze

  # Turns +link+ into an absolute URL on the host of +url+.
  #
  # url  - address of the page or feed the link was found on.
  # link - href taken from that page.
  #
  # Links that already start with http:// or https:// are returned unchanged.
  # Anything else is appended to "http://<host>/", with leading slashes
  # removed first so a root-relative href does not produce "//".
  def self.absolute_link(url, link)
    return link if link =~ %r{\Ahttps?://}i
    "http://" + URI.parse(url).host + "/" + link.sub(%r{\A/+}, '')
  end

  # Returns a new Array without the entries whose :url path is exactly
  # '/news/' or '/news'. nil entries are dropped as well.
  #
  # Despite the bang in its name this method does not modify +results+;
  # callers use the return value.
  def self.remove_generic_urls!(results)
    results.compact.reject { |r| GENERIC_PATHS.include?(url_path(r[:url])) }
  end

  # Path component of +url+, or nil when +url+ is nil. A URL that does not
  # parse as written (for example one containing spaces) is percent-escaped
  # and parsed again.
  def self.url_path(url)
    return nil if url.nil?
    URI.parse(url).path
  rescue URI::InvalidURIError
    # URI.escape no longer exists in Ruby 3; the RFC 2396 parser class still
    # provides the same escaping.
    parser = defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser : URI::Parser
    URI.parse(parser.new.escape(url)).path
  end
  private_class_method :url_path
end
|
data/lib/statement/version.rb
CHANGED
data/lib/statement.rb
CHANGED
@@ -1,551 +1,9 @@
|
|
1
|
-
# encoding: utf-8
|
2
1
|
require "statement/version"
|
3
|
-
require
|
4
|
-
require
|
5
|
-
require
|
6
|
-
|
2
|
+
require "statement/feed"
|
3
|
+
require "statement/scraper"
|
4
|
+
require "statement/utils"
|
5
|
+
include Statement
|
7
6
|
|
8
7
|
module Statement
|
9
|
-
|
10
|
-
class Link
|
11
|
-
def self.absolute_link(url, link)
|
12
|
-
return link if link =~ /^http:\/\//
|
13
|
-
("http://"+URI.parse(url).host + "/"+link).to_s
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.open_rss(url)
|
17
|
-
begin
|
18
|
-
Nokogiri::XML(open(url))
|
19
|
-
rescue
|
20
|
-
nil
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def self.open_html(url)
|
25
|
-
begin
|
26
|
-
Nokogiri::HTML(open(url).read)
|
27
|
-
rescue
|
28
|
-
nil
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
def self.remove_generic_urls!(results)
|
33
|
-
results.reject{|r| URI.parse(r[:url]).path == '/news/' or URI.parse(r[:url]).path == '/news'}
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.date_from_rss_item(link)
|
37
|
-
if !link.xpath('pubDate').text.empty?
|
38
|
-
Date.parse(link.xpath('pubDate').text)
|
39
|
-
elsif !link.xpath('pubdate').empty?
|
40
|
-
Date.parse(link.xpath('pubdate').text)
|
41
|
-
else
|
42
|
-
nil
|
43
|
-
end
|
44
|
-
end
|
45
|
-
|
46
|
-
def self.from_rss(url)
|
47
|
-
doc = open_rss(url)
|
48
|
-
return unless doc
|
49
|
-
links = doc.xpath('//item')
|
50
|
-
results = links.map do |link|
|
51
|
-
abs_link = absolute_link(url, link.xpath('link').text)
|
52
|
-
abs_link = "http://www.burr.senate.gov/public/"+ link.xpath('link').text if url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
|
53
|
-
abs_link = link.xpath('link').text[37..-1] if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
|
54
|
-
{ :source => url, :url => abs_link, :title => link.xpath('title').text, :date => date_from_rss_item(link), :domain => URI.parse(url).host }
|
55
|
-
end
|
56
|
-
remove_generic_urls!(results)
|
57
|
-
end
|
58
|
-
|
59
|
-
def self.house_gop(url)
|
60
|
-
doc = open_html(url)
|
61
|
-
return unless doc
|
62
|
-
uri = URI.parse(url)
|
63
|
-
date = Date.parse(uri.query.split('=').last)
|
64
|
-
links = doc.xpath("//ul[@id='membernews']").search('a')
|
65
|
-
results = links.map do |link|
|
66
|
-
abs_link = absolute_link(url, link["href"])
|
67
|
-
{ :source => url, :url => abs_link, :title => link.text.strip, :date => date, :domain => URI.parse(link["href"]).host }
|
68
|
-
end
|
69
|
-
remove_generic_urls!(results)
|
70
|
-
end
|
71
|
-
|
72
|
-
def self.from_scrapers
|
73
|
-
year = Date.today.year
|
74
|
-
results = [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, palazzo(page=1), billnelson(year=year),
|
75
|
-
document_query(page=1), document_query(page=2), donnelly(year=year), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
|
76
|
-
vitter_cowan(year=year), inhofe(year=year), reid].flatten
|
77
|
-
remove_generic_urls!(results)
|
78
|
-
end
|
79
|
-
|
80
|
-
def self.backfill_from_scrapers
|
81
|
-
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
82
|
-
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
83
|
-
boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
|
84
|
-
].flatten
|
85
|
-
remove_generic_urls!(results)
|
86
|
-
end
|
87
|
-
|
88
|
-
def self.committee_scrapers
|
89
|
-
year = Date.today.year
|
90
|
-
results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
|
91
|
-
senate_indian, senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014), house_energy_minority, house_homeland_security_minority,
|
92
|
-
house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten
|
93
|
-
remove_generic_urls!(results)
|
94
|
-
end
|
95
|
-
|
96
|
-
## special cases for committees without RSS feeds
|
97
|
-
|
98
|
-
def self.senate_approps_majority
|
99
|
-
results = []
|
100
|
-
url = "http://www.appropriations.senate.gov/news.cfm"
|
101
|
-
doc = open_html(url)
|
102
|
-
return if doc.nil?
|
103
|
-
doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
|
104
|
-
date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
|
105
|
-
results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'majority' }
|
106
|
-
end
|
107
|
-
end
|
108
|
-
results
|
109
|
-
end
|
110
|
-
|
111
|
-
def self.senate_approps_minority
|
112
|
-
results = []
|
113
|
-
url = "http://www.appropriations.senate.gov/republican.cfm"
|
114
|
-
doc = open_html(url)
|
115
|
-
return if doc.nil?
|
116
|
-
doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
|
117
|
-
date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
|
118
|
-
results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'minority' }
|
119
|
-
end
|
120
|
-
end
|
121
|
-
results
|
122
|
-
end
|
123
|
-
|
124
|
-
def self.senate_banking(year)
|
125
|
-
results = []
|
126
|
-
url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
|
127
|
-
doc = open_html(url)
|
128
|
-
return if doc.nil?
|
129
|
-
doc.xpath("//tr").each do |row|
|
130
|
-
results << { :source => url, :url => "http://www.banking.senate.gov/public/" + row.children[2].children[1]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip+", #{year}"), :domain => "http://www.banking.senate.gov/", :party => 'majority' }
|
131
|
-
end
|
132
|
-
results
|
133
|
-
end
|
134
|
-
|
135
|
-
def self.senate_hsag_majority(year)
|
136
|
-
results = []
|
137
|
-
url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
|
138
|
-
doc = open_html(url)
|
139
|
-
return if doc.nil?
|
140
|
-
doc.xpath("//tr").each do |row|
|
141
|
-
next if row.text.strip.size < 30
|
142
|
-
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'majority' }
|
143
|
-
end
|
144
|
-
results
|
145
|
-
end
|
146
|
-
|
147
|
-
def self.senate_hsag_minority(year)
|
148
|
-
results = []
|
149
|
-
url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
|
150
|
-
doc = open_html(url)
|
151
|
-
return if doc.nil?
|
152
|
-
doc.xpath("//tr").each do |row|
|
153
|
-
next if row.text.strip.size < 30
|
154
|
-
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'minority' }
|
155
|
-
end
|
156
|
-
results
|
157
|
-
end
|
158
|
-
|
159
|
-
def self.senate_indian
|
160
|
-
results = []
|
161
|
-
url = "http://www.indian.senate.gov/news/index.cfm"
|
162
|
-
doc = open_html(url)
|
163
|
-
return if doc.nil?
|
164
|
-
doc.xpath("//h3").each do |row|
|
165
|
-
results << { :source => url, :url => "http://www.indian.senate.gov"+row.children[0]['href'], :title => row.children[0].text, :date => Date.parse(row.previous.previous.text), :domain => "http://www.indian.senate.gov/", :party => 'majority' }
|
166
|
-
end
|
167
|
-
results
|
168
|
-
end
|
169
|
-
|
170
|
-
def self.senate_aging
|
171
|
-
results = []
|
172
|
-
url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
|
173
|
-
doc = open_html(url)
|
174
|
-
return if doc.nil?
|
175
|
-
doc.xpath("//tr")[6..104].each do |row|
|
176
|
-
results << { :source => url, :url => "http://www.aging.senate.gov/"+row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.aging.senate.gov/" }
|
177
|
-
end
|
178
|
-
results
|
179
|
-
end
|
180
|
-
|
181
|
-
def self.senate_smallbiz_minority
|
182
|
-
results = []
|
183
|
-
url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
|
184
|
-
doc = open_html(url)
|
185
|
-
return if doc.nil?
|
186
|
-
doc.xpath("//ul[@class='recordList']").each do |row|
|
187
|
-
results << { :source => url, :url => row.children[0].children[2].children[0]['href'], :title => row.children[0].children[2].children[0].text, :date => Date.parse(row.children[0].children[0].text), :domain => "http://www.sbc.senate.gov/", :party => 'minority' }
|
188
|
-
end
|
189
|
-
results
|
190
|
-
end
|
191
|
-
|
192
|
-
def self.senate_intel(congress, start_year, end_year)
|
193
|
-
results = []
|
194
|
-
url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
|
195
|
-
doc = open_html(url)
|
196
|
-
return if doc.nil?
|
197
|
-
doc.xpath("//tr[@valign='top']")[7..-1].each do |row|
|
198
|
-
results << { :source => url, :url => "http://www.intelligence.senate.gov/press/"+row.children[2].children[0]['href'], :title => row.children[2].children[0].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.intelligence.senate.gov/" }
|
199
|
-
end
|
200
|
-
results
|
201
|
-
end
|
202
|
-
|
203
|
-
def self.house_energy_minority
|
204
|
-
results = []
|
205
|
-
url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
|
206
|
-
doc = open_html(url)
|
207
|
-
return if doc.nil?
|
208
|
-
doc.xpath("//div[@class='views-field-title']").each do |row|
|
209
|
-
results << { :source => url, :url => "http://democrats.energycommerce.house.gov"+row.children[1].children[0]['href'], :title => row.children[1].children[0].text, :date => Date.parse(row.next.next.text.strip), :domain => "http://energycommerce.house.gov/", :party => 'minority' }
|
210
|
-
end
|
211
|
-
results
|
212
|
-
end
|
213
|
-
|
214
|
-
def self.house_homeland_security_minority
|
215
|
-
results = []
|
216
|
-
url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
|
217
|
-
doc = open_html(url)
|
218
|
-
return if doc.nil?
|
219
|
-
doc.xpath("//li[@class='article']").each do |row|
|
220
|
-
results << { :source => url, :url => "http://chsdemocrats.house.gov"+row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text), :domain => "http://chsdemocrats.house.gov/", :party => 'minority' }
|
221
|
-
end
|
222
|
-
results
|
223
|
-
end
|
224
|
-
|
225
|
-
def self.house_judiciary_majority
|
226
|
-
results = []
|
227
|
-
url = "http://judiciary.house.gov/news/press2013.html"
|
228
|
-
doc = open_html(url)
|
229
|
-
return if doc.nil?
|
230
|
-
doc.xpath("//p")[3..60].each do |row|
|
231
|
-
next if row.text.size < 30
|
232
|
-
results << { :source => url, :url => row.children[5]['href'], :title => row.children[0].text, :date => Date.parse(row.children[1].text.strip), :domain => "http://judiciary.house.gov/", :party => 'majority' }
|
233
|
-
end
|
234
|
-
results
|
235
|
-
end
|
236
|
-
|
237
|
-
def self.house_rules_majority
|
238
|
-
results = []
|
239
|
-
url = "http://www.rules.house.gov/News/Default.aspx"
|
240
|
-
doc = open_html(url)
|
241
|
-
return if doc.nil?
|
242
|
-
doc.xpath("//tr")[1..-2].each do |row|
|
243
|
-
next if row.text.strip.size < 30
|
244
|
-
results << { :source => url, :url => "http://www.rules.house.gov/News/"+row.children[0].children[1].children[0]['href'], :title => row.children[0].children[1].children[0].text, :date => Date.parse(row.children[2].children[1].text.strip), :domain => "http://www.rules.house.gov/", :party => 'majority' }
|
245
|
-
end
|
246
|
-
results
|
247
|
-
end
|
248
|
-
|
249
|
-
def self.house_ways_means_majority
|
250
|
-
results = []
|
251
|
-
url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
|
252
|
-
doc = open_html(url)
|
253
|
-
return if doc.nil?
|
254
|
-
doc.xpath("//ul[@class='UnorderedNewsList']").children.each do |row|
|
255
|
-
next if row.text.strip.size < 10
|
256
|
-
results << { :source => url, :url => "http://waysandmeans.house.gov"+row.children[1].children[1]['href'], :title => row.children[1].children[1].text, :date => Date.parse(row.children[3].children[0].text.strip), :domain => "http://waysandmeans.house.gov/", :party => 'majority' }
|
257
|
-
end
|
258
|
-
results
|
259
|
-
end
|
260
|
-
|
261
|
-
## special cases for members without RSS feeds
|
262
|
-
|
263
|
-
def self.capuano
|
264
|
-
results = []
|
265
|
-
base_url = "http://www.house.gov/capuano/news/"
|
266
|
-
list_url = base_url + 'date.shtml'
|
267
|
-
doc = open_html(list_url)
|
268
|
-
return if doc.nil?
|
269
|
-
doc.xpath("//a").each do |link|
|
270
|
-
if link['href'] and link['href'].include?('/pr')
|
271
|
-
begin
|
272
|
-
date = Date.parse(link.text)
|
273
|
-
rescue
|
274
|
-
date = nil
|
275
|
-
end
|
276
|
-
results << { :source => list_url, :url => base_url + link['href'], :title => link.text.split(' ',2).last, :date => date, :domain => "www.house.gov/capuano/" }
|
277
|
-
end
|
278
|
-
end
|
279
|
-
return results[0..-5]
|
280
|
-
end
|
281
|
-
|
282
|
-
def self.cold_fusion(year, month)
|
283
|
-
results = []
|
284
|
-
year = Date.today.year if not year
|
285
|
-
month = 0 if not month
|
286
|
-
domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
|
287
|
-
domains.each do |domain|
|
288
|
-
if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
|
289
|
-
url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
290
|
-
elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
|
291
|
-
url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
292
|
-
else
|
293
|
-
url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
294
|
-
end
|
295
|
-
doc = open_html(url)
|
296
|
-
return if doc.nil?
|
297
|
-
doc.xpath("//tr")[2..-1].each do |row|
|
298
|
-
date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
|
299
|
-
next if date_text == 'Date' or date_text.size > 8
|
300
|
-
date = Date.parse(date_text)
|
301
|
-
results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
|
302
|
-
end
|
303
|
-
end
|
304
|
-
results.flatten
|
305
|
-
end
|
306
|
-
|
307
|
-
def self.conaway(page=1)
|
308
|
-
results = []
|
309
|
-
base_url = "http://conaway.house.gov/news/"
|
310
|
-
page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
|
311
|
-
doc = open_html(page_url)
|
312
|
-
return if doc.nil?
|
313
|
-
doc.xpath("//tr")[1..-1].each do |row|
|
314
|
-
results << { :source => page_url, :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip, :date => Date.parse(row.children.children[4].text), :domain => "conaway.house.gov" }
|
315
|
-
end
|
316
|
-
results
|
317
|
-
end
|
318
|
-
|
319
|
-
def self.susandavis
|
320
|
-
results = []
|
321
|
-
base_url = "http://www.house.gov/susandavis/"
|
322
|
-
doc = open_html(base_url+'news.shtml')
|
323
|
-
return if doc.nil?
|
324
|
-
doc.search("ul")[6].children.each do |row|
|
325
|
-
next if row.text.strip == ''
|
326
|
-
results << { :source => base_url+'news.shtml', :url => base_url + row.children[1]['href'], :title => row.children[1].text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
|
327
|
-
end
|
328
|
-
results
|
329
|
-
end
|
330
|
-
|
331
|
-
def self.faleomavaega
|
332
|
-
results = []
|
333
|
-
base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
|
334
|
-
doc = open_html(base_url)
|
335
|
-
return if doc.nil?
|
336
|
-
doc.xpath("//li[@type='disc']").each do |row|
|
337
|
-
results << { :source => base_url, :url => "http://www.house.gov/" + row.children[0]['href'], :title => row.children[0].text.gsub(/[u201cu201d]/, '').split('Washington, D.C.').last, :date => Date.parse(row.children[1].text), :domain => "house.gov/faleomavaega" }
|
338
|
-
end
|
339
|
-
results
|
340
|
-
end
|
341
|
-
|
342
|
-
def self.freshman_senators
|
343
|
-
results = []
|
344
|
-
['baldwin', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz'].each do |senator|
|
345
|
-
base_url = "http://www.#{senator}.senate.gov/"
|
346
|
-
doc = open_html(base_url+'press.cfm?maxrows=200&startrow=1&&type=1')
|
347
|
-
return if doc.nil?
|
348
|
-
doc.xpath("//tr")[3..-1].each do |row|
|
349
|
-
next if row.text.strip == ''
|
350
|
-
results << { :source => base_url+'press.cfm?maxrows=200&startrow=1&&type=1', :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip.split.join(' '), :date => Date.parse(row.children.children[0].text), :domain => "#{senator}.senate.gov" }
|
351
|
-
end
|
352
|
-
end
|
353
|
-
results.flatten
|
354
|
-
end
|
355
|
-
|
356
|
-
def self.klobuchar
|
357
|
-
results = []
|
358
|
-
base_url = "http://www.klobuchar.senate.gov/"
|
359
|
-
[2012,2013].each do |year|
|
360
|
-
year_url = base_url + "newsreleases.cfm?year=#{year}"
|
361
|
-
doc = open_html(year_url)
|
362
|
-
return if doc.nil?
|
363
|
-
doc.xpath("//dt").each do |row|
|
364
|
-
results << { :source => year_url, :url => base_url + row.next.children[0]['href'], :title => row.next.text.strip.gsub(/[u201cu201d]/, '').split.join(' '), :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
|
365
|
-
end
|
366
|
-
end
|
367
|
-
results
|
368
|
-
end
|
369
|
-
|
370
|
-
def self.lujan
|
371
|
-
results = []
|
372
|
-
base_url = 'http://lujan.house.gov/'
|
373
|
-
doc = open_html(base_url+'index.php?option=com_content&view=article&id=981&Itemid=78')
|
374
|
-
return if doc.nil?
|
375
|
-
doc.xpath('//ul')[1].children.each do |row|
|
376
|
-
next if row.text.strip == ''
|
377
|
-
results << { :source => base_url+'index.php?option=com_content&view=article&id=981&Itemid=78', :url => base_url + row.children[0]['href'], :title => row.children[0].text, :date => nil, :domain => "lujan.house.gov" }
|
378
|
-
end
|
379
|
-
results
|
380
|
-
end
|
381
|
-
|
382
|
-
def self.billnelson(year=2013)
|
383
|
-
results = []
|
384
|
-
base_url = "http://www.billnelson.senate.gov/news/"
|
385
|
-
year_url = base_url + "media.cfm?year=#{year}"
|
386
|
-
doc = open_html(year_url)
|
387
|
-
return if doc.nil?
|
388
|
-
doc.xpath('//li').each do |row|
|
389
|
-
results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
|
390
|
-
end
|
391
|
-
results
|
392
|
-
end
|
393
|
-
|
394
|
-
# fetches the latest 1000 releases, can be altered
|
395
|
-
def self.lautenberg(rows=1000)
|
396
|
-
results = []
|
397
|
-
base_url = 'http://www.lautenberg.senate.gov/newsroom/'
|
398
|
-
url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
|
399
|
-
doc = open_html(url)
|
400
|
-
return if doc.nil?
|
401
|
-
doc.xpath("//tr")[4..-2].each do |row|
|
402
|
-
results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
|
403
|
-
end
|
404
|
-
results
|
405
|
-
end
|
406
|
-
|
407
|
-
def self.crapo
|
408
|
-
results = []
|
409
|
-
base_url = "http://www.crapo.senate.gov/media/newsreleases/"
|
410
|
-
url = base_url + "release_all.cfm"
|
411
|
-
doc = open_html(url)
|
412
|
-
return if doc.nil?
|
413
|
-
doc.xpath("//tr").each do |row|
|
414
|
-
results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
|
415
|
-
end
|
416
|
-
results
|
417
|
-
end
|
418
|
-
|
419
|
-
def self.coburn(year=Date.today.year)
|
420
|
-
results = []
|
421
|
-
url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
|
422
|
-
doc = open_html(url)
|
423
|
-
return if doc.nil?
|
424
|
-
doc.xpath("//tr")[2..-1].each do |row|
|
425
|
-
next if row.text[0..3] == "Date"
|
426
|
-
results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
|
427
|
-
end
|
428
|
-
results
|
429
|
-
end
|
430
|
-
|
431
|
-
def self.boxer(start=1)
|
432
|
-
results = []
|
433
|
-
url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
|
434
|
-
domain = 'www.boxer.senate.gov'
|
435
|
-
doc = open_html(url)
|
436
|
-
return if doc.nil?
|
437
|
-
doc.xpath("//div[@class='left']")[1..-1].each do |row|
|
438
|
-
results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
|
439
|
-
end
|
440
|
-
results
|
441
|
-
end
|
442
|
-
|
443
|
-
def self.mccain(year=Date.today.year)
|
444
|
-
results = []
|
445
|
-
url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
|
446
|
-
domain = 'www.mccain.senate.gov'
|
447
|
-
doc = open_html(url)
|
448
|
-
return if doc.nil?
|
449
|
-
doc.xpath("//li")[7..-1].each do |row|
|
450
|
-
results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
|
451
|
-
end
|
452
|
-
results
|
453
|
-
end
|
454
|
-
|
455
|
-
def self.vitter_cowan(year=Date.today.year)
|
456
|
-
results = []
|
457
|
-
urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
|
458
|
-
urls.each do |url|
|
459
|
-
next if year < 2013 and url == "http://www.cowan.senate.gov/"
|
460
|
-
if url == "http://www.vitter.senate.gov/newsroom/"
|
461
|
-
domain = "www.vitter.senate.gov"
|
462
|
-
elsif url == "http://www.cowan.senate.gov/"
|
463
|
-
domain = "www.cowan.senate.gov"
|
464
|
-
end
|
465
|
-
doc = open_html(url+"press?year=#{year}")
|
466
|
-
return if doc.nil?
|
467
|
-
doc.xpath("//tr")[1..-1].each do |row|
|
468
|
-
next if row.text.strip.size < 30
|
469
|
-
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
470
|
-
end
|
471
|
-
end
|
472
|
-
results.flatten
|
473
|
-
end
|
474
|
-
|
475
|
-
def self.donnelly(year=Date.today.year)
|
476
|
-
results = []
|
477
|
-
url = "http://www.donnelly.senate.gov/newsroom/"
|
478
|
-
domain = "www.donnelly.senate.gov"
|
479
|
-
doc = open_html(url+"press?year=#{year}")
|
480
|
-
return if doc.nil?
|
481
|
-
doc.xpath("//tr")[1..-1].each do |row|
|
482
|
-
next if row.text.strip.size < 30
|
483
|
-
results << { :source => url, :url => "http://www.donnelly.senate.gov"+row.children[2].children[1]['href'].strip, :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => domain}
|
484
|
-
end
|
485
|
-
results
|
486
|
-
end
|
487
|
-
|
488
|
-
def self.inhofe(year=Date.today.year)
|
489
|
-
results = []
|
490
|
-
url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
|
491
|
-
domain = "www.inhofe.senate.gov"
|
492
|
-
doc = open_html(url)
|
493
|
-
return if doc.nil?
|
494
|
-
doc.xpath("//tr")[1..-1].each do |row|
|
495
|
-
next if row.text.strip.size < 30
|
496
|
-
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
497
|
-
end
|
498
|
-
results
|
499
|
-
end
|
500
|
-
|
501
|
-
def self.levin(page=1)
|
502
|
-
results = []
|
503
|
-
url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
|
504
|
-
domain = "www.levin.senate.gov"
|
505
|
-
doc = open_html(url)
|
506
|
-
return if doc.nil?
|
507
|
-
doc.xpath('//tr').each do |row|
|
508
|
-
results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
509
|
-
end
|
510
|
-
results
|
511
|
-
end
|
512
|
-
|
513
|
-
def self.reid
|
514
|
-
results = []
|
515
|
-
url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
|
516
|
-
domain = "www.reid.senate.gov"
|
517
|
-
doc = open_html(url)
|
518
|
-
return if doc.nil?
|
519
|
-
doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
|
520
|
-
results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
|
521
|
-
end
|
522
|
-
results
|
523
|
-
end
|
524
|
-
|
525
|
-
def self.palazzo(page=1)
|
526
|
-
results = []
|
527
|
-
domain = "palazzo.house.gov"
|
528
|
-
url = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
|
529
|
-
doc = open_html(url)
|
530
|
-
return if doc.nil?
|
531
|
-
doc.xpath("//div[@class='middlecopy']//li").each do |row|
|
532
|
-
results << { :source => url, :url => "http://palazzo.house.gov/news/" + row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
|
533
|
-
end
|
534
|
-
results
|
535
|
-
end
|
536
|
-
|
537
|
-
def self.document_query(page=1)
|
538
|
-
results = []
|
539
|
-
domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
|
540
|
-
domains.each do |domain|
|
541
|
-
doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
|
542
|
-
return if doc.nil?
|
543
|
-
doc.xpath("//span[@class='middlecopy']").each do |row|
|
544
|
-
results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
|
545
|
-
end
|
546
|
-
end
|
547
|
-
results.flatten
|
548
|
-
end
|
549
|
-
|
550
|
-
end
|
8
|
+
extend Utils
|
551
9
|
end
|
data/spec/statement_spec.rb
CHANGED
@@ -1,35 +1,34 @@
|
|
1
1
|
require "minitest/autorun"
|
2
2
|
require_relative "../lib/statement"
|
3
3
|
require 'webmock/minitest'
|
4
|
+
include Statement
|
4
5
|
|
5
6
|
describe Statement do
|
6
7
|
it "parses an rss feed" do
|
7
8
|
@feed_url = "http://ruiz.house.gov/rss.xml"
|
8
9
|
stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "ruiz_rss.xml")), :status => 200)
|
9
|
-
@results =
|
10
|
+
@results = Feed.from_rss(@feed_url)
|
10
11
|
@results.first[:domain].must_equal "ruiz.house.gov"
|
11
12
|
end
|
12
13
|
|
13
14
|
it "parses House GOP press release page" do
|
14
15
|
@feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
|
15
16
|
stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
|
16
|
-
@results =
|
17
|
+
@results = Scraper.house_gop(@feed_url)
|
17
18
|
@results.first[:source].must_equal @feed_url
|
18
19
|
end
|
19
20
|
|
20
21
|
it "does not attempt to parse dates when none are present" do
|
21
22
|
@feed_url = "http://culberson.house.gov/feed/rss/"
|
22
23
|
stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "culberson_rss.xml")), :status => 200)
|
23
|
-
|
24
|
-
@results = Statement::Link.from_rss(@feed_url)
|
24
|
+
@results = Feed.from_rss(@feed_url)
|
25
25
|
@results.first[:date].must_equal nil
|
26
26
|
end
|
27
27
|
|
28
28
|
it "parses invalid RSS" do
|
29
29
|
@feed_url = "http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed"
|
30
30
|
stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "richard_burr.xml")), :status => 200)
|
31
|
-
|
32
|
-
@results = Statement::Link.from_rss(@feed_url)
|
31
|
+
@results = Feed.from_rss(@feed_url)
|
33
32
|
@results.first[:url].must_equal "http://www.burr.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&Type=Press Release&ContentRecord_id=65dbea38-d64c-6208-ef8f-2b000e899b3a"
|
34
33
|
@results.first[:date].to_s.must_equal "2013-05-02"
|
35
34
|
end
|
@@ -37,14 +36,14 @@ describe Statement do
|
|
37
36
|
it "handles relative URLs" do
|
38
37
|
@feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
|
39
38
|
stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
|
40
|
-
@results =
|
39
|
+
@results = Scraper.house_gop(@feed_url)
|
41
40
|
@results.last[:url].must_equal "http://www.gop.gov/republicans/other/relative_url_test.html"
|
42
41
|
end
|
43
42
|
|
44
43
|
it "scrapes a senate cold fusion page" do
|
45
44
|
@url = "http://www.billnelson.senate.gov/news/media.cfm?year=2013"
|
46
45
|
stub_request(:any, @url).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'bill_nelson_press.html')), :status => 200)
|
47
|
-
@results =
|
46
|
+
@results = Scraper.billnelson(year=2013)
|
48
47
|
@results.last[:url].must_equal "http://www.billnelson.senate.gov/news/details.cfm?id=338190&"
|
49
48
|
end
|
50
49
|
|
@@ -53,7 +52,7 @@ describe Statement do
|
|
53
52
|
@cowan = "http://www.cowan.senate.gov/press?year=2013"
|
54
53
|
stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
|
55
54
|
stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
|
56
|
-
@results =
|
55
|
+
@results = Scraper.vitter_cowan(year=2013)
|
57
56
|
@results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov", "www.cowan.senate.gov"]
|
58
57
|
end
|
59
58
|
|
@@ -62,7 +61,7 @@ describe Statement do
|
|
62
61
|
@cowan = "http://www.cowan.senate.gov/press?year=2012"
|
63
62
|
stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
|
64
63
|
stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
|
65
|
-
@results =
|
64
|
+
@results = Scraper.vitter_cowan(year=2012)
|
66
65
|
@results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
|
67
66
|
end
|
68
67
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.2
|
4
|
+
version: '0.9'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2013-05-17 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
16
|
-
requirement: &
|
16
|
+
requirement: &2155945560 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '1.3'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2155945560
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rake
|
27
|
-
requirement: &
|
27
|
+
requirement: &2155944480 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2155944480
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: webmock
|
38
|
-
requirement: &
|
38
|
+
requirement: &2155943560 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2155943560
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: american_date
|
49
|
-
requirement: &
|
49
|
+
requirement: &2155942580 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *2155942580
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: nokogiri
|
60
|
-
requirement: &
|
60
|
+
requirement: &2155941660 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,7 +65,7 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *2155941660
|
69
69
|
description: Crawls congressional websites for press releases.
|
70
70
|
email:
|
71
71
|
- dwillis@gmail.com
|
@@ -79,6 +79,9 @@ files:
|
|
79
79
|
- README.md
|
80
80
|
- Rakefile
|
81
81
|
- lib/statement.rb
|
82
|
+
- lib/statement/feed.rb
|
83
|
+
- lib/statement/scraper.rb
|
84
|
+
- lib/statement/utils.rb
|
82
85
|
- lib/statement/version.rb
|
83
86
|
- spec/bill_nelson_press.html
|
84
87
|
- spec/cowan_press.html
|