statement 0.5 → 0.6

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
1
  # Statement
2
2
 
3
- Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages.
3
+ Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages. It has been tested under Ruby 1.9.2 and 1.9.3.
4
+
5
+ ## Coverage
6
+
7
+ Statement currently parses press releases for members of the House and Senate. For members with RSS feeds, you can pass the feed URL into Statement. For members without RSS feeds, HTML scrapers are provided, as are methods for specialty groups, such as House Republicans. Suggestions are welcome.
4
8
 
5
9
  ## Installation
6
10
 
@@ -28,9 +32,9 @@ $ gem install statement
28
32
  require 'rubygems'
29
33
  require 'statement'
30
34
 
31
- results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
35
+ results = Statement::Link.from_rss('http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1')
32
36
  puts results.first
33
- {:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
37
+ {:source=>"http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1", :url=>"http://blumenauer.house.gov/index.php?option=com_content&amp;view=article&amp;id=2203:blumenauer-qwe-need-a-national-system-that-speaks-to-the-transportation-challenges-of-todayq&amp;catid=66:2013-press-releases", :title=>"Blumenauer: &quot;We need a national system that speaks to the transportation challenges of ...", :date=>#<Date: 2013-04-24 ((2456407j,0s,0n),+0s,2299161j)>, :domain=>"blumenauer.house.gov"}
34
38
  ```
35
39
 
36
40
  ## Tests
@@ -49,6 +53,8 @@ $ rake test
49
53
  4. Push to the branch (`git push origin my-new-feature`)
50
54
  5. Create new Pull Request
51
55
 
56
+ If you write a new scraper, please use Nokogiri for parsing - see some of the existing examples for guidance. The ``domain`` attribute represents the URI base domain of the source site.
57
+
52
58
  ## Authors
53
59
 
54
60
  * Derek Willis
@@ -1,3 +1,3 @@
1
1
  module Statement
2
- VERSION = "0.5"
2
+ VERSION = "0.6"
3
3
  end
data/lib/statement.rb CHANGED
@@ -36,8 +36,17 @@ module Statement
36
36
  end
37
37
 
38
38
  def self.from_scrapers
39
- [freshman_senators, capuano, crenshaw(2013, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, billnelson(year=2013),
40
- billnelson(year=2012), roe(page=1), roe(page=2), roe(page=3), thornberry(page=1), thornberry(page=2), thornberry(page=3)].flatten
39
+ year = Date.today.year
40
+ [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, billnelson(year=year),
41
+ document_query(page=1), document_query(page=2), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
42
+ vitter_cowan(year=year), inhofe(year=year), reid].flatten
43
+ end
44
+
45
+ def self.backfill_from_scrapers
46
+ [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
47
+ document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
48
+ boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
49
+ ].flatten
41
50
  end
42
51
 
43
52
  ## special cases for members without RSS feeds
@@ -60,19 +69,28 @@ module Statement
60
69
  return results[0..-5]
61
70
  end
62
71
 
63
- def self.crenshaw(year, month)
72
+ def self.cold_fusion(year, month)
64
73
  results = []
65
74
  year = Date.today.year if not year
66
75
  month = 0 if not month
67
- url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
68
- doc = Nokogiri::HTML(open(url).read)
69
- doc.xpath("//tr")[2..-1].each do |row|
70
- date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
71
- next if date_text == 'Date'
72
- date = Date.parse(date_text)
73
- results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => "crenshaw.house.gov" }
76
+ domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
77
+ domains.each do |domain|
78
+ if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
79
+ url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
80
+ elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
81
+ url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
82
+ else
83
+ url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
84
+ end
85
+ doc = Nokogiri::HTML(open(url).read)
86
+ doc.xpath("//tr")[2..-1].each do |row|
87
+ date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
88
+ next if date_text == 'Date' or date_text.size > 8
89
+ date = Date.parse(date_text)
90
+ results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
91
+ end
74
92
  end
75
- results
93
+ results.flatten
76
94
  end
77
95
 
78
96
  def self.conaway(page=1)
@@ -150,30 +168,126 @@ module Statement
150
168
  year_url = base_url + "media.cfm?year=#{year}"
151
169
  doc = Nokogiri::HTML(open(year_url).read)
152
170
  doc.xpath('//li').each do |row|
153
- results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text).to_s, :domain => "billnelson.senate.gov" }
171
+ results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
172
+ end
173
+ results
174
+ end
175
+
176
+ # fetches the latest 1000 releases by default; pass a different `rows` value to change the count
177
+ def self.lautenberg(rows=1000)
178
+ results = []
179
+ base_url = 'http://www.lautenberg.senate.gov/newsroom/'
180
+ url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
181
+ doc = Nokogiri::HTML(open(url).read)
182
+ doc.xpath("//tr")[4..-2].each do |row|
183
+ results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
184
+ end
185
+ results
186
+ end
187
+
188
+ def self.crapo
189
+ results = []
190
+ base_url = "http://www.crapo.senate.gov/media/newsreleases/"
191
+ url = base_url + "release_all.cfm"
192
+ doc = Nokogiri::HTML(open(url).read)
193
+ doc.xpath("//tr").each do |row|
194
+ results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
195
+ end
196
+ results
197
+ end
198
+
199
+ def self.coburn(year=Date.today.year)
200
+ results = []
201
+ url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
202
+ doc = Nokogiri::HTML(open(url).read)
203
+ doc.xpath("//tr")[2..-1].each do |row|
204
+ next if row.text[0..3] == "Date"
205
+ results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
206
+ end
207
+ results
208
+ end
209
+
210
+ def self.boxer(start=1)
211
+ results = []
212
+ url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
213
+ domain = 'www.boxer.senate.gov'
214
+ doc = Nokogiri::HTML(open(url).read)
215
+ doc.xpath("//div[@class='left']")[1..-1].each do |row|
216
+ results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
217
+ end
218
+ results
219
+ end
220
+
221
+ def self.mccain(year=Date.today.year)
222
+ results = []
223
+ url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
224
+ domain = 'www.mccain.senate.gov'
225
+ doc = Nokogiri::HTML(open(url).read)
226
+ doc.xpath("//li")[7..-1].each do |row|
227
+ results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
228
+ end
229
+ results
230
+ end
231
+
232
+ def self.vitter_cowan(year=Date.today.year)
233
+ results = []
234
+ urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
235
+ urls.each do |url|
236
+ next if year < 2013 and url == "http://www.cowan.senate.gov/"
237
+ domain = url == "http://www.vitter.senate.gov/newsroom/" ? "www.vitter.senate.gov" : "www.cowan.senate.gov"
238
+ doc = Nokogiri::HTML(open(url+"press?year=#{year}").read)
239
+ doc.xpath("//tr")[1..-1].each do |row|
240
+ next if row.text.strip.size < 30
241
+ results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
242
+ end
243
+ end
244
+ results.flatten
245
+ end
246
+
247
+ def self.inhofe(year=Date.today.year)
248
+ results = []
249
+ url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
250
+ domain = "www.inhofe.senate.gov"
251
+ doc = Nokogiri::HTML(open(url).read)
252
+ doc.xpath("//tr")[1..-1].each do |row|
253
+ results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
154
254
  end
155
255
  results
156
256
  end
157
257
 
158
- def self.roe(page=1)
258
+ def self.levin(page=1)
159
259
  results = []
160
- base_url = "http://roe.house.gov/news/"
161
- doc = Nokogiri::HTML(open(base_url+"documentquery.aspx?DocumentTypeID=1532&Page=#{page}").read)
162
- doc.xpath("//span[@class='middlecopy']").each do |row|
163
- results << { :source => base_url+"documentquery.aspx?DocumentTypeID=1532&Page=#{page}", :url => base_url + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => "roe.house.gov" }
260
+ url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
261
+ domain = "www.levin.senate.gov"
262
+ doc = Nokogiri::HTML(open(url).read)
263
+ doc.xpath('//tr').each do |row|
264
+ results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
164
265
  end
165
266
  results
166
267
  end
167
268
 
168
- def self.thornberry(page=1)
269
+ def self.reid
169
270
  results = []
170
- base_url = "http://thornberry.house.gov/news/"
171
- doc = Nokogiri::HTML(open(base_url+"documentquery.aspx?DocumentTypeID=1776&Page=#{page}").read)
172
- doc.xpath("//span[@class='middlecopy']").each do |row|
173
- results << { :source => base_url+"documentquery.aspx?DocumentTypeID=1776&Page=#{page}", :url => base_url + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => "thornberry.house.gov" }
271
+ url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
272
+ domain = "www.reid.senate.gov"
273
+ doc = Nokogiri::HTML(open(url).read)
274
+ doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
275
+ results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
174
276
  end
175
277
  results
176
278
  end
177
279
 
280
+ def self.document_query(page=1)
281
+ results = []
282
+ domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
283
+ domains.each do |domain|
284
+ doc = Nokogiri::HTML(open("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}").read)
285
+ doc.xpath("//span[@class='middlecopy']").each do |row|
286
+ results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
287
+ end
288
+ end
289
+ results.flatten
290
+ end
291
+
178
292
  end
179
293
  end