statement 0.5 → 0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +9 -3
- data/lib/statement/version.rb +1 -1
- data/lib/statement.rb +136 -22
- data/spec/bill_nelson_press.html +593 -0
- data/spec/cowan_press.html +1840 -0
- data/spec/statement_spec.rb +26 -0
- data/spec/vitter_press.html +2461 -0
- metadata +18 -12
data/README.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
# Statement
|
2
2
|
|
3
|
-
Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages.
|
3
|
+
Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages. It has been tested under Ruby 1.9.2 and 1.9.3.
|
4
|
+
|
5
|
+
## Coverage
|
6
|
+
|
7
|
+
Statement currently parses press releases for members of the House and Senate. For members with RSS feeds, you can pass the feed URL into Statement. For members without RSS feeds, HTML scrapers are provided, as are methods for speciality groups, such as House Republicans. Suggestions are welcomed.
|
4
8
|
|
5
9
|
## Installation
|
6
10
|
|
@@ -28,9 +32,9 @@ $ gem install statement
|
|
28
32
|
require 'rubygems'
|
29
33
|
require 'statement'
|
30
34
|
|
31
|
-
results = Statement::Link.
|
35
|
+
results = Statement::Link.from_rss('http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1')
|
32
36
|
puts results.first
|
33
|
-
{:source=>"http://
|
37
|
+
{:source=>"http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1", :url=>"http://blumenauer.house.gov/index.php?option=com_content&view=article&id=2203:blumenauer-qwe-need-a-national-system-that-speaks-to-the-transportation-challenges-of-todayq&catid=66:2013-press-releases", :title=>"Blumenauer: \"We need a national system that speaks to the transportation challenges of ...", :date=>#<Date: 2013-04-24 ((2456407j,0s,0n),+0s,2299161j)>, :domain=>"blumenauer.house.gov"}
|
34
38
|
```
|
35
39
|
|
36
40
|
## Tests
|
@@ -49,6 +53,8 @@ $ rake test
|
|
49
53
|
4. Push to the branch (`git push origin my-new-feature`)
|
50
54
|
5. Create new Pull Request
|
51
55
|
|
56
|
+
If you write a new scraper, please use Nokogiri for parsing - see some of the existing examples for guidance. The ``domain`` attribute represents the URI base domain of the source site.
|
57
|
+
|
52
58
|
## Authors
|
53
59
|
|
54
60
|
* Derek Willis
|
data/lib/statement/version.rb
CHANGED
data/lib/statement.rb
CHANGED
@@ -36,8 +36,17 @@ module Statement
|
|
36
36
|
end
|
37
37
|
|
38
38
|
def self.from_scrapers
|
39
|
-
|
40
|
-
|
39
|
+
year = Date.today.year
|
40
|
+
[freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, billnelson(year=year),
|
41
|
+
document_query(page=1), document_query(page=2), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
|
42
|
+
vitter_cowan(year=year), inhofe(year=year), reid].flatten
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.backfill_from_scrapers
|
46
|
+
[cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
47
|
+
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
48
|
+
boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
|
49
|
+
].flatten
|
41
50
|
end
|
42
51
|
|
43
52
|
## special cases for members without RSS feeds
|
@@ -60,19 +69,28 @@ module Statement
|
|
60
69
|
return results[0..-5]
|
61
70
|
end
|
62
71
|
|
63
|
-
def self.
|
72
|
+
def self.cold_fusion(year, month)
|
64
73
|
results = []
|
65
74
|
year = Date.today.year if not year
|
66
75
|
month = 0 if not month
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
76
|
+
domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
|
77
|
+
domains.each do |domain|
|
78
|
+
if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
|
79
|
+
url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
80
|
+
elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
|
81
|
+
url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
82
|
+
else
|
83
|
+
url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
84
|
+
end
|
85
|
+
doc = Nokogiri::HTML(open(url).read)
|
86
|
+
doc.xpath("//tr")[2..-1].each do |row|
|
87
|
+
date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
|
88
|
+
next if date_text == 'Date' or date_text.size > 8
|
89
|
+
date = Date.parse(date_text)
|
90
|
+
results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
|
91
|
+
end
|
74
92
|
end
|
75
|
-
results
|
93
|
+
results.flatten
|
76
94
|
end
|
77
95
|
|
78
96
|
def self.conaway(page=1)
|
@@ -150,30 +168,126 @@ module Statement
|
|
150
168
|
year_url = base_url + "media.cfm?year=#{year}"
|
151
169
|
doc = Nokogiri::HTML(open(year_url).read)
|
152
170
|
doc.xpath('//li').each do |row|
|
153
|
-
results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text)
|
171
|
+
results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
|
172
|
+
end
|
173
|
+
results
|
174
|
+
end
|
175
|
+
|
176
|
+
# Fetches the latest 1000 releases by default; pass a different `rows` value to change the limit.
|
177
|
+
def self.lautenberg(rows=1000)
|
178
|
+
results = []
|
179
|
+
base_url = 'http://www.lautenberg.senate.gov/newsroom/'
|
180
|
+
url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
|
181
|
+
doc = Nokogiri::HTML(open(url).read)
|
182
|
+
doc.xpath("//tr")[4..-2].each do |row|
|
183
|
+
results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
|
184
|
+
end
|
185
|
+
results
|
186
|
+
end
|
187
|
+
|
188
|
+
def self.crapo
|
189
|
+
results = []
|
190
|
+
base_url = "http://www.crapo.senate.gov/media/newsreleases/"
|
191
|
+
url = base_url + "release_all.cfm"
|
192
|
+
doc = Nokogiri::HTML(open(url).read)
|
193
|
+
doc.xpath("//tr").each do |row|
|
194
|
+
results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
|
195
|
+
end
|
196
|
+
results
|
197
|
+
end
|
198
|
+
|
199
|
+
def self.coburn(year=Date.today.year)
|
200
|
+
results = []
|
201
|
+
url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
|
202
|
+
doc = Nokogiri::HTML(open(url).read)
|
203
|
+
doc.xpath("//tr")[2..-1].each do |row|
|
204
|
+
next if row.text[0..3] == "Date"
|
205
|
+
results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
|
206
|
+
end
|
207
|
+
results
|
208
|
+
end
|
209
|
+
|
210
|
+
def self.boxer(start=1)
|
211
|
+
results = []
|
212
|
+
url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
|
213
|
+
domain = 'www.boxer.senate.gov'
|
214
|
+
doc = Nokogiri::HTML(open(url).read)
|
215
|
+
doc.xpath("//div[@class='left']")[1..-1].each do |row|
|
216
|
+
results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
|
217
|
+
end
|
218
|
+
results
|
219
|
+
end
|
220
|
+
|
221
|
+
def self.mccain(year=Date.today.year)
|
222
|
+
results = []
|
223
|
+
url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
|
224
|
+
domain = 'www.mccain.senate.gov'
|
225
|
+
doc = Nokogiri::HTML(open(url).read)
|
226
|
+
doc.xpath("//li")[7..-1].each do |row|
|
227
|
+
results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
|
228
|
+
end
|
229
|
+
results
|
230
|
+
end
|
231
|
+
|
232
|
+
def self.vitter_cowan(year=Date.today.year)
|
233
|
+
results = []
|
234
|
+
urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
|
235
|
+
urls.each do |url|
|
236
|
+
next if year < 2013 and url == "http://www.cowan.senate.gov/"
|
237
|
+
domain = url == "http://www.vitter.senate.gov/newsroom/" ? "www.vitter.senate.gov" : "www.cowan.senate.gov"
|
238
|
+
doc = Nokogiri::HTML(open(url+"press?year=#{year}").read)
|
239
|
+
doc.xpath("//tr")[1..-1].each do |row|
|
240
|
+
next if row.text.strip.size < 30
|
241
|
+
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
242
|
+
end
|
243
|
+
end
|
244
|
+
results.flatten
|
245
|
+
end
|
246
|
+
|
247
|
+
def self.inhofe(year=Date.today.year)
|
248
|
+
results = []
|
249
|
+
url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
|
250
|
+
domain = "www.inhofe.senate.gov"
|
251
|
+
doc = Nokogiri::HTML(open(url).read)
|
252
|
+
doc.xpath("//tr")[1..-1].each do |row|
|
253
|
+
results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
154
254
|
end
|
155
255
|
results
|
156
256
|
end
|
157
257
|
|
158
|
-
def self.
|
258
|
+
def self.levin(page=1)
|
159
259
|
results = []
|
160
|
-
|
161
|
-
|
162
|
-
doc
|
163
|
-
|
260
|
+
url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
|
261
|
+
domain = "www.levin.senate.gov"
|
262
|
+
doc = Nokogiri::HTML(open(url).read)
|
263
|
+
doc.xpath('//tr').each do |row|
|
264
|
+
results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
|
164
265
|
end
|
165
266
|
results
|
166
267
|
end
|
167
268
|
|
168
|
-
def self.
|
269
|
+
def self.reid
|
169
270
|
results = []
|
170
|
-
|
171
|
-
|
172
|
-
doc
|
173
|
-
|
271
|
+
url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
|
272
|
+
domain = "www.reid.senate.gov"
|
273
|
+
doc = Nokogiri::HTML(open(url).read)
|
274
|
+
doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
|
275
|
+
results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
|
174
276
|
end
|
175
277
|
results
|
176
278
|
end
|
177
279
|
|
280
|
+
def self.document_query(page=1)
|
281
|
+
results = []
|
282
|
+
domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
|
283
|
+
domains.each do |domain|
|
284
|
+
doc = Nokogiri::HTML(open("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}").read)
|
285
|
+
doc.xpath("//span[@class='middlecopy']").each do |row|
|
286
|
+
results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
|
287
|
+
end
|
288
|
+
end
|
289
|
+
results.flatten
|
290
|
+
end
|
291
|
+
|
178
292
|
end
|
179
293
|
end
|