statement 0.8.2 → 0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ require 'uri'
3
+ require 'open-uri'
4
+ require 'american_date'
5
+ require 'nokogiri'
6
+ include Statement
7
+
8
module Statement
  # Builds result hashes from RSS feeds.
  class Feed

    # Fetches +url+ and parses the response as XML.
    #
    # Returns the parsed document, or nil when fetching or parsing raises a
    # StandardError (failures are deliberately swallowed).
    def self.open_rss(url)
      # Kernel#open stopped accepting URLs in Ruby 3.0; use URI.open when
      # open-uri provides it and fall back to the old call otherwise.
      io = URI.respond_to?(:open) ? URI.open(url) : open(url)
      Nokogiri::XML(io)
    rescue
      nil
    ensure
      # Release the handle returned by open-uri once parsing is done.
      io.close if io.respond_to?(:close)
    end

    # Extracts the publication date from an RSS item node.
    #
    # Looks at the 'pubDate' child first, then 'pubdate'. A tag that is
    # missing or has blank text is skipped, so an empty element no longer
    # reaches Date.parse.
    #
    # Returns a Date, or nil when neither tag carries text.
    # Raises whatever Date.parse raises for text it cannot parse.
    def self.date_from_rss_item(link)
      %w[pubDate pubdate].each do |tag|
        text = link.xpath(tag).text
        return Date.parse(text) unless text.strip.empty?
      end
      nil
    end

    # Loads the feed at +url+ and converts every //item into a result hash
    # with :source, :url, :title, :date and :domain keys.
    #
    # Returns the list filtered by Utils.remove_generic_urls!, or nil when
    # the feed could not be loaded.
    def self.from_rss(url)
      doc = open_rss(url)
      return unless doc
      results = doc.xpath('//item').map do |item|
        raw_link = item.xpath('link').text
        abs_link = Utils.absolute_link(url, raw_link)
        # Two feeds get their own link handling instead of the generic rule.
        abs_link = "http://www.burr.senate.gov/public/" + raw_link if url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
        abs_link = raw_link[37..-1] if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
        {
          :source => url,
          :url    => abs_link,
          :title  => item.xpath('title').text,
          :date   => date_from_rss_item(item),
          :domain => URI.parse(url).host
        }
      end
      Utils.remove_generic_urls!(results)
    end

  end
end
@@ -0,0 +1,521 @@
1
+ # encoding: utf-8
2
+ require 'uri'
3
+ require 'open-uri'
4
+ require 'american_date'
5
+ require 'nokogiri'
6
+
7
+ module Statement
8
+ class Scraper
9
+
10
# Fetches +url+ and parses the response body as HTML.
#
# Returns the parsed document, or nil when fetching or parsing raises a
# StandardError (failures are deliberately swallowed so one broken site does
# not abort a scraping run).
def self.open_html(url)
  # Kernel#open stopped accepting URLs in Ruby 3.0; use URI.open when
  # open-uri provides it and fall back to the old call otherwise.
  io = URI.respond_to?(:open) ? URI.open(url) : open(url)
  Nokogiri::HTML(io.read)
rescue
  nil
ensure
  # Release the handle returned by open-uri once the body has been read.
  io.close if io.respond_to?(:close)
end
17
+
18
# Collects the links inside the ul#membernews list of the page at +url+.
#
# Every result shares one date, parsed from the text after the last '=' in
# the URL's query string.
#
# Returns the list filtered by Utils.remove_generic_urls!, or nil when the
# page cannot be loaded.
def self.house_gop(url)
  page = open_html(url)
  return unless page
  query = URI.parse(url).query
  date = Date.parse(query.split('=').last)
  anchors = page.xpath("//ul[@id='membernews']").search('a')
  results = anchors.map do |anchor|
    {
      :source => url,
      :url    => Utils.absolute_link(url, anchor["href"]),
      :title  => anchor.text.strip,
      :date   => date,
      :domain => URI.parse(anchor["href"]).host
    }
  end
  Utils.remove_generic_urls!(results)
end
30
+
31
# Names (as symbols) of the member scraper methods.
def self.member_methods
  %w[
    capuano cold_fusion conaway susandavis faleomavaega freshman_senators
    klobuchar lujan billnelson lautenberg crapo coburn boxer mccain
    vitter_cowan donnelly inhofe levin reid palazzo document_query
  ].map(&:to_sym)
end
34
+
35
# Names (as symbols) of the committee scraper methods.
def self.committee_methods
  %w[
    senate_approps_majority senate_approps_minority senate_banking
    senate_hsag_majority senate_hsag_minority senate_indian senate_aging
    senate_smallbiz_minority senate_intel house_energy_minority
    house_homeland_security_minority house_judiciary_majority
    house_rules_majority house_ways_means_majority
  ].map(&:to_sym)
end
38
+
39
# Runs the member scrapers for the current year / first pages and merges
# their results into one list.
#
# A scraper returns nil when its page cannot be loaded; those nils are
# dropped so that one unreachable site does not make the URL filter crash.
#
# Returns the merged list filtered by Utils.remove_generic_urls!.
def self.member_scrapers
  year = Date.today.year
  results = [
    freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis,
    faleomavaega, klobuchar, lujan, palazzo(1), billnelson(year),
    document_query(1), document_query(2), donnelly(year), lautenberg, crapo,
    coburn, boxer(1), mccain(year), vitter_cowan(year), inhofe(year), reid
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
46
+
47
# Runs the scrapers that accept a year, page or start offset against older
# years and later pages, and merges their results into one list.
#
# A scraper returns nil when its page cannot be loaded; those nils are
# dropped so that one unreachable site does not make the URL filter crash.
#
# Returns the merged list filtered by Utils.remove_generic_urls!.
def self.backfill_from_scrapers
  results = [
    cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0),
    billnelson(2012), document_query(3), document_query(4),
    coburn(2012), coburn(2011), coburn(2010),
    boxer(11), boxer(21), boxer(31), boxer(41),
    mccain(2012), mccain(2011), vitter_cowan(2012), vitter_cowan(2011)
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
54
+
55
# Runs every committee scraper (the current year is passed to those that
# take one) and merges their results into one list.
#
# A scraper returns nil when its page cannot be loaded; those nils are
# dropped so that one unreachable site does not make the URL filter crash.
#
# Returns the merged list filtered by Utils.remove_generic_urls!.
def self.committee_scrapers
  year = Date.today.year
  results = [
    senate_approps_majority, senate_approps_minority, senate_banking(year),
    senate_hsag_majority(year), senate_hsag_minority(year), senate_indian,
    senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014),
    house_energy_minority, house_homeland_security_minority,
    house_judiciary_majority, house_rules_majority, house_ways_means_majority
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
62
+
63
+ ## special cases for committees without RSS feeds
64
+
65
# Collects entries from the Senate Appropriations news page, tagged
# 'majority'.
#
# Each div.newsDateUnderlined supplies the date for the non-blank children of
# the node two siblings after it.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_approps_majority
  url = "http://www.appropriations.senate.gov/news.cfm"
  doc = open_html(url)
  return unless doc
  doc.xpath("//div[@class='newsDateUnderlined']").flat_map do |date_node|
    entries = date_node.next.next.children.reject { |child| child.text.strip.empty? }
    entries.map do |entry|
      {
        :source => url,
        :url    => url + entry.children[0]['href'],
        :title  => entry.text,
        :date   => Date.parse(date_node.text),
        :domain => "http://www.appropriations.senate.gov/",
        :party  => 'majority'
      }
    end
  end
end
77
+
78
# Collects entries from the Senate Appropriations republican page, tagged
# 'minority'.
#
# Each div.newsDateUnderlined supplies the date for the non-blank children of
# the node two siblings after it.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_approps_minority
  url = "http://www.appropriations.senate.gov/republican.cfm"
  doc = open_html(url)
  return unless doc
  doc.xpath("//div[@class='newsDateUnderlined']").flat_map do |date_node|
    entries = date_node.next.next.children.reject { |child| child.text.strip.empty? }
    entries.map do |entry|
      {
        :source => url,
        :url    => url + entry.children[0]['href'],
        :title  => entry.text,
        :date   => Date.parse(date_node.text),
        :domain => "http://www.appropriations.senate.gov/",
        :party  => 'minority'
      }
    end
  end
end
90
+
91
# Collects one entry per table row from the Senate Banking press release
# listing for +year+. The row's date text has ", <year>" appended before it
# is parsed.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_banking(year=Date.today.year)
  url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
  doc = open_html(url)
  return unless doc
  doc.xpath("//tr").map do |row|
    date_cell = row.children[0]
    title_cell = row.children[2]
    {
      :source => url,
      :url    => "http://www.banking.senate.gov/public/" + title_cell.children[1]['href'],
      :title  => title_cell.text.strip,
      :date   => Date.parse(date_cell.text.strip + ", #{year}"),
      :domain => "http://www.banking.senate.gov/",
      :party  => 'majority'
    }
  end
end
101
+
102
# Collects entries from the hsgac.senate.gov majority media listing for
# +year+. Rows whose stripped text is shorter than 30 characters are skipped.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_hsag_majority(year=Date.today.year)
  url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
  doc = open_html(url)
  return unless doc
  rows = doc.xpath("//tr").reject { |row| row.text.strip.size < 30 }
  rows.map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => title_cell.children[0]['href'].strip,
      :title  => title_cell.children[0].text,
      :date   => Date.parse(row.children[0].text),
      :domain => "http://www.hsgac.senate.gov/",
      :party  => 'majority'
    }
  end
end
113
+
114
# Collects entries from the hsgac.senate.gov minority media listing for
# +year+. Rows whose stripped text is shorter than 30 characters are skipped.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_hsag_minority(year=Date.today.year)
  url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
  doc = open_html(url)
  return unless doc
  rows = doc.xpath("//tr").reject { |row| row.text.strip.size < 30 }
  rows.map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => title_cell.children[0]['href'].strip,
      :title  => title_cell.children[0].text,
      :date   => Date.parse(row.children[0].text),
      :domain => "http://www.hsgac.senate.gov/",
      :party  => 'minority'
    }
  end
end
125
+
126
# Collects one entry per h3 on the Senate Indian Affairs news page. The date
# is read from the node two siblings before the heading.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_indian
  url = "http://www.indian.senate.gov/news/index.cfm"
  doc = open_html(url)
  return unless doc
  doc.xpath("//h3").map do |heading|
    anchor = heading.children[0]
    {
      :source => url,
      :url    => "http://www.indian.senate.gov" + anchor['href'],
      :title  => anchor.text,
      :date   => Date.parse(heading.previous.previous.text),
      :domain => "http://www.indian.senate.gov/",
      :party  => 'majority'
    }
  end
end
136
+
137
# Collects entries from table rows 6..104 of the Senate Aging pressroom
# listing. No :party key is set.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_aging
  url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
  doc = open_html(url)
  return unless doc
  doc.xpath("//tr")[6..104].map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => "http://www.aging.senate.gov/" + title_cell.children[0]['href'],
      :title  => title_cell.text.strip,
      :date   => Date.parse(row.children[0].text),
      :domain => "http://www.aging.senate.gov/"
    }
  end
end
147
+
148
# Collects one entry per ul.recordList on the sbc.senate.gov
# RepublicanPressRoom page, reading date and link from the list's first
# child.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_smallbiz_minority
  url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
  doc = open_html(url)
  return unless doc
  doc.xpath("//ul[@class='recordList']").map do |list|
    first_item = list.children[0]
    {
      :source => url,
      :url    => first_item.children[2].children[0]['href'],
      :title  => first_item.children[2].children[0].text,
      :date   => Date.parse(first_item.children[0].text),
      :domain => "http://www.sbc.senate.gov/",
      :party  => 'minority'
    }
  end
end
158
+
159
# Collects entries from the Senate Intelligence releases listing for the given
# congress and year range, starting at the eighth tr[valign=top] row. No
# :party key is set.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.senate_intel(congress=113, start_year=2013, end_year=2014)
  url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
  doc = open_html(url)
  return unless doc
  doc.xpath("//tr[@valign='top']")[7..-1].map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => "http://www.intelligence.senate.gov/press/" + title_cell.children[0]['href'],
      :title  => title_cell.children[0].text.strip,
      :date   => Date.parse(row.children[0].text),
      :domain => "http://www.intelligence.senate.gov/"
    }
  end
end
169
+
170
# Collects one entry per div.views-field-title on the
# democrats.energycommerce.house.gov news-releases page. The date is read
# from the node two siblings after the title div.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.house_energy_minority
  url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
  doc = open_html(url)
  return unless doc
  doc.xpath("//div[@class='views-field-title']").map do |title_div|
    holder = title_div.children[1]
    {
      :source => url,
      :url    => "http://democrats.energycommerce.house.gov" + holder.children[0]['href'],
      :title  => holder.children[0].text,
      :date   => Date.parse(title_div.next.next.text.strip),
      :domain => "http://energycommerce.house.gov/",
      :party  => 'minority'
    }
  end
end
180
+
181
# Collects one entry per li.article on the chsdemocrats.house.gov press
# index.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.house_homeland_security_minority
  url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
  doc = open_html(url)
  return unless doc
  doc.xpath("//li[@class='article']").map do |item|
    anchor = item.children[1]
    {
      :source => url,
      :url    => "http://chsdemocrats.house.gov" + anchor['href'],
      :title  => anchor.text.strip,
      :date   => Date.parse(item.children[3].text),
      :domain => "http://chsdemocrats.house.gov/",
      :party  => 'minority'
    }
  end
end
191
+
192
# Collects entries from paragraphs 3..60 of the judiciary.house.gov
# press2013.html page (the year is fixed in the URL). Paragraphs with fewer
# than 30 characters of text are skipped.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.house_judiciary_majority
  url = "http://judiciary.house.gov/news/press2013.html"
  doc = open_html(url)
  return unless doc
  paragraphs = doc.xpath("//p")[3..60].reject { |para| para.text.size < 30 }
  paragraphs.map do |para|
    parts = para.children
    {
      :source => url,
      :url    => parts[5]['href'],
      :title  => parts[0].text,
      :date   => Date.parse(parts[1].text.strip),
      :domain => "http://judiciary.house.gov/",
      :party  => 'majority'
    }
  end
end
203
+
204
# Collects entries from the rules.house.gov News listing, ignoring the first
# and last table rows and any row whose stripped text is shorter than 30
# characters.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.house_rules_majority
  url = "http://www.rules.house.gov/News/Default.aspx"
  doc = open_html(url)
  return unless doc
  rows = doc.xpath("//tr")[1..-2].reject { |row| row.text.strip.size < 30 }
  rows.map do |row|
    title_cell = row.children[0]
    {
      :source => url,
      :url    => "http://www.rules.house.gov/News/" + title_cell.children[1].children[0]['href'],
      :title  => title_cell.children[1].children[0].text,
      :date   => Date.parse(row.children[2].children[1].text.strip),
      :domain => "http://www.rules.house.gov/",
      :party  => 'majority'
    }
  end
end
215
+
216
# Collects entries from the children of ul.UnorderedNewsList on the
# waysandmeans.house.gov document query page. Children whose stripped text is
# shorter than 10 characters are skipped.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.house_ways_means_majority
  url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
  doc = open_html(url)
  return unless doc
  items = doc.xpath("//ul[@class='UnorderedNewsList']").children.reject { |item| item.text.strip.size < 10 }
  items.map do |item|
    title_part = item.children[1]
    {
      :source => url,
      :url    => "http://waysandmeans.house.gov" + title_part.children[1]['href'],
      :title  => title_part.children[1].text,
      :date   => Date.parse(item.children[3].children[0].text.strip),
      :domain => "http://waysandmeans.house.gov/",
      :party  => 'majority'
    }
  end
end
227
+
228
+ ## special cases for members without RSS feeds
229
+
230
# Collects every anchor on the capuano date.shtml page whose href contains
# '/pr'. The date is parsed from the anchor text (nil when that fails) and the
# title is the anchor text after its first space.
#
# Returns the collected entries minus the last four, or nil when the page
# cannot be loaded.
def self.capuano
  base_url = "http://www.house.gov/capuano/news/"
  list_url = base_url + 'date.shtml'
  doc = open_html(list_url)
  return unless doc
  release_links = doc.xpath("//a").select { |anchor| anchor['href'] && anchor['href'].include?('/pr') }
  results = release_links.map do |anchor|
    date = begin
      Date.parse(anchor.text)
    rescue
      nil
    end
    {
      :source => list_url,
      :url    => base_url + anchor['href'],
      :title  => anchor.text.split(' ', 2).last,
      :date   => date,
      :domain => "www.house.gov/capuano/"
    }
  end
  results[0..-5]
end
248
+
249
# Collects entries from six sites that share one listing layout, for the given
# +year+ and +month+ (nil falls back to the current year / 0).
#
# The listing path differs per site ('pressreleases', 'news-releases' or
# 'press-releases'). Rows are read from the third table row onward; header
# rows ('Date'), rows with no non-blank cells, and rows whose first non-blank
# cell is longer than 8 characters are skipped.
#
# A site that cannot be loaded is skipped, so the entries from the other
# sites are still returned.
#
# Returns an Array of result hashes.
def self.cold_fusion(year=Date.today.year, month=0)
  results = []
  year ||= Date.today.year
  month ||= 0
  domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/', 'www.lee.senate.gov/public/', 'www.hoeven.senate.gov/public/', 'www.moran.senate.gov/public/', 'www.risch.senate.gov/public/']
  domains.each do |domain|
    section = case domain
              when 'crenshaw.house.gov/', 'www.risch.senate.gov/public/' then 'pressreleases'
              when 'www.hoeven.senate.gov/public/', 'www.moran.senate.gov/public/' then 'news-releases'
              else 'press-releases'
              end
    url = "http://" + domain + "index.cfm/#{section}?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
    doc = open_html(url)
    # Skip an unreachable site instead of discarding everything gathered so far.
    next if doc.nil?
    doc.xpath("//tr")[2..-1].each do |row|
      date_text, title = row.children.map { |cell| cell.text.strip }.reject { |text| text.empty? }
      next if date_text.nil? || date_text == 'Date' || date_text.size > 8
      results << {
        :source => url,
        :url    => row.children[2].children.first['href'],
        :title  => title,
        :date   => Date.parse(date_text),
        :domain => domain
      }
    end
  end
  results
end
273
+
274
# Collects entries from the conaway.house.gov document query listing at
# +page+, skipping the first table row. Link and date are read from the
# row's grandchildren (indexes 1 and 4).
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.conaway(page=1)
  base_url = "http://conaway.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  doc = open_html(page_url)
  return unless doc
  doc.xpath("//tr")[1..-1].map do |row|
    parts = row.children.children
    {
      :source => page_url,
      :url    => base_url + parts[1]['href'],
      :title  => parts[1].text.strip,
      :date   => Date.parse(parts[4].text),
      :domain => "conaway.house.gov"
    }
  end
end
285
+
286
# Collects entries from the children of the seventh <ul> on the susandavis
# news.shtml page, skipping blank children. Whitespace runs in the title are
# collapsed to single spaces.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.susandavis
  base_url = "http://www.house.gov/susandavis/"
  doc = open_html(base_url + 'news.shtml')
  return unless doc
  items = doc.search("ul")[6].children.reject { |item| item.text.strip == '' }
  items.map do |item|
    anchor = item.children[1]
    {
      :source => base_url + 'news.shtml',
      :url    => base_url + anchor['href'],
      :title  => anchor.text.split.join(' '),
      :date   => Date.parse(item.children.first.text),
      :domain => "house.gov/susandavis"
    }
  end
end
297
+
298
# Collects one entry per li[type=disc] on the faleomavaega news-press page.
#
# The title has curly double quotes (U+201C / U+201D) removed and keeps only
# the text after the last 'Washington, D.C.' marker. The previous pattern was
# a garbled character class that removed the characters u, 2, 0, 1, c and d
# from titles instead.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.faleomavaega
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  doc = open_html(base_url)
  return if doc.nil?
  doc.xpath("//li[@type='disc']").map do |item|
    anchor = item.children[0]
    {
      :source => base_url,
      :url    => "http://www.house.gov/" + anchor['href'],
      :title  => anchor.text.gsub(/[\u201c\u201d]/, '').split('Washington, D.C.').last,
      :date   => Date.parse(item.children[1].text),
      :domain => "house.gov/faleomavaega"
    }
  end
end
308
+
309
# Collects entries from the press.cfm listing of nine senators whose sites
# share one layout. Rows are read from the fourth table row onward and blank
# rows are skipped; whitespace runs in titles are collapsed.
#
# A site that cannot be loaded is skipped, so the entries from the other
# sites are still returned.
#
# Returns an Array of result hashes.
def self.freshman_senators
  results = []
  ['baldwin', 'flake', 'hirono', 'heinrich', 'murphy', 'scott', 'king', 'heitkamp', 'cruz'].each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    source = base_url + 'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = open_html(source)
    # Skip an unreachable site instead of discarding everything gathered so far.
    next if doc.nil?
    doc.xpath("//tr")[3..-1].each do |row|
      next if row.text.strip == ''
      parts = row.children.children
      results << {
        :source => source,
        :url    => base_url + parts[1]['href'],
        :title  => parts[1].text.strip.split.join(' '),
        :date   => Date.parse(parts[0].text),
        :domain => "#{senator}.senate.gov"
      }
    end
  end
  results
end
322
+
323
# Collects entries from the klobuchar.senate.gov newsreleases listings for
# 2012 and 2013. Each <dt> holds the date; the node right after it holds the
# link and title.
#
# Titles have curly double quotes (U+201C / U+201D) removed and whitespace
# runs collapsed. The previous pattern was a garbled character class that
# removed the characters u, 2, 0, 1, c and d from titles instead.
#
# A year page that cannot be loaded is skipped, so the entries from the other
# year are still returned.
#
# Returns an Array of result hashes.
def self.klobuchar
  results = []
  base_url = "http://www.klobuchar.senate.gov/"
  [2012, 2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = open_html(year_url)
    # Skip an unreachable page instead of discarding everything gathered so far.
    next if doc.nil?
    doc.xpath("//dt").each do |date_node|
      entry = date_node.next
      results << {
        :source => year_url,
        :url    => base_url + entry.children[0]['href'],
        :title  => entry.text.strip.gsub(/[\u201c\u201d]/, '').split.join(' '),
        :date   => Date.parse(date_node.text),
        :domain => "klobuchar.senate.gov"
      }
    end
  end
  results
end
336
+
337
# Collects entries from the children of the second <ul> on a fixed
# lujan.house.gov article page, skipping blank children. No date is
# extracted; :date is always nil.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.lujan
  base_url = 'http://lujan.house.gov/'
  source = base_url + 'index.php?option=com_content&view=article&id=981&Itemid=78'
  doc = open_html(source)
  return unless doc
  items = doc.xpath('//ul')[1].children.reject { |item| item.text.strip == '' }
  items.map do |item|
    anchor = item.children[0]
    {
      :source => source,
      :url    => base_url + anchor['href'],
      :title  => anchor.text,
      :date   => nil,
      :domain => "lujan.house.gov"
    }
  end
end
348
+
349
# Collects one entry per <li> on the billnelson.senate.gov media listing for
# +year+. The link is the item's first child and the date is its last child.
#
# NOTE(review): the default year is the literal 2013, while the other yearly
# scrapers default to the current year — confirm whether that is intended.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.billnelson(year=2013)
  base_url = "http://www.billnelson.senate.gov/news/"
  year_url = base_url + "media.cfm?year=#{year}"
  doc = open_html(year_url)
  return unless doc
  doc.xpath('//li').map do |item|
    anchor = item.children[0]
    {
      :source => year_url,
      :url    => base_url + anchor['href'],
      :title  => anchor.text.strip,
      :date   => Date.parse(item.children.last.text),
      :domain => "billnelson.senate.gov"
    }
  end
end
360
+
361
+ # fetches the latest 1000 releases, can be altered
362
# Collects entries from the lautenberg.senate.gov releases listing, asking
# the site for +rows+ entries via the maxrows query parameter. The first four
# table rows and the last one are ignored.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.lautenberg(rows=1000)
  base_url = 'http://www.lautenberg.senate.gov/newsroom/'
  url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
  doc = open_html(url)
  return unless doc
  doc.xpath("//tr")[4..-2].map do |tr|
    title_cell = tr.children[2]
    {
      :source => url,
      :url    => base_url + title_cell.children[0]['href'],
      :title  => title_cell.text.strip,
      :date   => Date.parse(tr.children[0].text.strip),
      :domain => "lautenberg.senate.gov"
    }
  end
end
373
+
374
# Collects one entry per table row on the crapo.senate.gov release_all page.
# Dashes in the date text are replaced with slashes before parsing.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.crapo
  base_url = "http://www.crapo.senate.gov/media/newsreleases/"
  url = base_url + "release_all.cfm"
  doc = open_html(url)
  return unless doc
  doc.xpath("//tr").map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => base_url + title_cell.children[0]['href'],
      :title  => title_cell.text.strip,
      :date   => Date.parse(row.children[0].text.strip.gsub('-', '/')),
      :domain => "crapo.senate.gov"
    }
  end
end
385
+
386
# Collects entries from the coburn.senate.gov press release listing for
# +year+, reading from the third table row onward and skipping rows whose
# text starts with "Date".
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.coburn(year=Date.today.year)
  url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
  doc = open_html(url)
  return unless doc
  rows = doc.xpath("//tr")[2..-1].reject { |row| row.text[0..3] == "Date" }
  rows.map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => title_cell.children[0]['href'],
      :title  => title_cell.text.strip,
      :date   => Date.parse(row.children[0].text.strip),
      :domain => "coburn.senate.gov"
    }
  end
end
397
+
398
# Collects entries from the boxer.senate.gov releases listing starting at
# offset +start+. Every div.left after the first holds a date; the link and
# title come from the node two siblings after it.
#
# The entry URL is now prefixed with "http://"; it used to be the bare host
# plus the href, which is not a usable absolute URL.
# NOTE(review): assumes the hrefs on this page are root-relative — confirm
# against the live markup.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.boxer(start=1)
  url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
  domain = 'www.boxer.senate.gov'
  doc = open_html(url)
  return if doc.nil?
  doc.xpath("//div[@class='left']")[1..-1].map do |date_div|
    anchor_holder = date_div.next.next.children[1]
    {
      :source => url,
      :url    => "http://" + domain + anchor_holder.children[0]['href'],
      :title  => anchor_holder.children[0].text,
      :date   => Date.parse(date_div.text.strip),
      :domain => domain
    }
  end
end
409
+
410
# Collects entries from the mccain.senate.gov press release listing for
# +year+, starting at the eighth <li> on the page.
#
# The entry URL is now prefixed with "http://"; it used to be the bare host
# plus the href, which is not a usable absolute URL.
# NOTE(review): assumes the hrefs on this page are root-relative — confirm
# against the live markup.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.mccain(year=Date.today.year)
  url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
  domain = 'www.mccain.senate.gov'
  doc = open_html(url)
  return if doc.nil?
  doc.xpath("//li")[7..-1].map do |item|
    # Date (index 0) and title (index 4) are siblings inside this holder node.
    holder = item.children[3].children[1]
    title_node = holder.children[4]
    {
      :source => url,
      :url    => "http://" + domain + title_node.children[0]['href'],
      :title  => title_node.text,
      :date   => Date.parse(holder.children[0].text),
      :domain => domain
    }
  end
end
421
+
422
# Collects entries from the "press?year=" listings of two sites that share
# one layout. The cowan site is only queried for 2013 and later. The first
# table row and rows whose stripped text is shorter than 30 characters are
# skipped.
#
# A site that cannot be loaded is skipped, so the entries from the other site
# are still returned.
#
# Returns an Array of result hashes.
def self.vitter_cowan(year=Date.today.year)
  results = []
  sites = [
    ["http://www.vitter.senate.gov/newsroom/", "www.vitter.senate.gov"],
    ["http://www.cowan.senate.gov/", "www.cowan.senate.gov"]
  ]
  sites.each do |url, domain|
    next if year < 2013 && url == "http://www.cowan.senate.gov/"
    doc = open_html(url + "press?year=#{year}")
    # Skip an unreachable site instead of discarding everything gathered so far.
    next if doc.nil?
    doc.xpath("//tr")[1..-1].each do |row|
      next if row.text.strip.size < 30
      title_cell = row.children[2]
      results << {
        :source => url,
        :url    => title_cell.children[0]['href'].strip,
        :title  => title_cell.text,
        :date   => Date.parse(row.children[0].text),
        :domain => domain
      }
    end
  end
  results
end
441
+
442
# Collects entries from the donnelly.senate.gov "press?year=" listing for
# +year+, skipping the first table row and rows whose stripped text is
# shorter than 30 characters. :source is the newsroom base URL, not the
# listing URL.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.donnelly(year=Date.today.year)
  url = "http://www.donnelly.senate.gov/newsroom/"
  domain = "www.donnelly.senate.gov"
  doc = open_html(url + "press?year=#{year}")
  return unless doc
  rows = doc.xpath("//tr")[1..-1].reject { |row| row.text.strip.size < 30 }
  rows.map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => "http://www.donnelly.senate.gov" + title_cell.children[1]['href'].strip,
      :title  => title_cell.text.strip,
      :date   => Date.parse(row.children[0].text),
      :domain => domain
    }
  end
end
454
+
455
# Collects entries from the inhofe.senate.gov press-releases listing for
# +year+, skipping the first table row and rows whose stripped text is
# shorter than 30 characters.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.inhofe(year=Date.today.year)
  url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
  domain = "www.inhofe.senate.gov"
  doc = open_html(url)
  return unless doc
  rows = doc.xpath("//tr")[1..-1].reject { |row| row.text.strip.size < 30 }
  rows.map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => title_cell.children[0]['href'].strip,
      :title  => title_cell.text,
      :date   => Date.parse(row.children[0].text),
      :domain => domain
    }
  end
end
467
+
468
# Collects one entry per table row on the levin.senate.gov newsroom listing
# at +page+. All whitespace is removed from the link's href.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.levin(page=1)
  url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
  domain = "www.levin.senate.gov"
  doc = open_html(url)
  return unless doc
  doc.xpath('//tr').map do |row|
    title_cell = row.children[2]
    {
      :source => url,
      :url    => title_cell.children[0]['href'].gsub(/\s+/, ""),
      :title  => title_cell.children[0].text,
      :date   => Date.parse(row.children[0].text),
      :domain => domain
    }
  end
end
479
+
480
# Collects entries from the rows of table#CS_PgIndex_21891_21893 on the
# reid.senate.gov press releases page, skipping the first row. Link and date
# both live in the row's first cell.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.reid
  url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
  domain = "www.reid.senate.gov"
  doc = open_html(url)
  return unless doc
  doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].map do |row|
    cell = row.children[0]
    {
      :source => url,
      :url    => "http://www.reid.senate.gov" + cell.children[0]['href'],
      :title  => cell.children[0].text,
      :date   => Date.parse(cell.children[2].text),
      :domain => domain
    }
  end
end
491
+
492
# Collects one entry per <li> inside div.middlecopy on the palazzo.house.gov
# document query listing at +page+.
#
# Returns an Array of result hashes, or nil when the page cannot be loaded.
def self.palazzo(page=1)
  domain = "palazzo.house.gov"
  url = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
  doc = open_html(url)
  return unless doc
  doc.xpath("//div[@class='middlecopy']//li").map do |item|
    anchor = item.children[1]
    {
      :source => url,
      :url    => "http://palazzo.house.gov/news/" + anchor['href'],
      :title  => anchor.text.strip,
      :date   => Date.parse(item.children[3].text.strip),
      :domain => domain
    }
  end
end
503
+
504
# Collects entries from the documentquery.aspx listing at +page+ on three
# house.gov sites that share one layout; each site has its own
# DocumentTypeID. One entry is built per span.middlecopy.
#
# A site that cannot be loaded is skipped, so the entries from the other
# sites are still returned.
#
# Returns an Array of result hashes.
def self.document_query(page=1)
  results = []
  sites = [["roe.house.gov", 1532], ["thornberry.house.gov", 1776], ["wenstrup.house.gov", 2491]]
  sites.each do |host, type_id|
    news_root = "http://" + host + "/news/"
    source = news_root + "documentquery.aspx?DocumentTypeID=#{type_id}&Page=#{page}"
    doc = open_html(source)
    # Skip an unreachable site instead of discarding everything gathered so far.
    next if doc.nil?
    doc.xpath("//span[@class='middlecopy']").each do |entry|
      parts = entry.children
      results << {
        :source => source,
        :url    => news_root + parts[6]['href'],
        :title  => parts[1].text.strip,
        :date   => Date.parse(parts[4].text.strip),
        :domain => host
      }
    end
  end
  results
end
516
+
517
+
518
+
519
+
520
+ end
521
+ end
@@ -0,0 +1,12 @@
1
+ require 'uri'
2
+
3
# Helpers shared by the feed and scraper code.
module Utils
  # Turns +link+ into an absolute URL relative to the host of +url+.
  #
  # Links that already start with http:// or https:// are returned unchanged
  # (https links used to be treated as relative and got the host prepended).
  # Anything else is appended to "http://<host of url>/".
  def self.absolute_link(url, link)
    return link if link =~ /^https?:\/\//
    "http://" + URI.parse(url).host + "/" + link
  end

  # Returns a copy of +results+ without the entries whose :url path is exactly
  # '/news/' or '/news'.
  #
  # Despite the bang in the name, +results+ itself is not modified; callers
  # must use the return value.
  def self.remove_generic_urls!(results)
    results.reject do |result|
      # URI.escape was removed in Ruby 3.0; the parser object offers the same
      # escaping on old and new Rubies.
      path = URI.parse(URI::DEFAULT_PARSER.escape(result[:url])).path
      path == '/news/' || path == '/news'
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Statement
2
- VERSION = "0.8.2"
2
+ VERSION = "0.9"
3
3
  end
data/lib/statement.rb CHANGED
@@ -1,551 +1,9 @@
1
- # encoding: utf-8
2
1
  require "statement/version"
3
- require 'uri'
4
- require 'open-uri'
5
- require 'american_date'
6
- require 'nokogiri'
2
+ require "statement/feed"
3
+ require "statement/scraper"
4
+ require "statement/utils"
5
+ include Statement
7
6
 
8
7
  module Statement
9
-
10
- class Link
11
- def self.absolute_link(url, link)
12
- return link if link =~ /^http:\/\//
13
- ("http://"+URI.parse(url).host + "/"+link).to_s
14
- end
15
-
16
- def self.open_rss(url)
17
- begin
18
- Nokogiri::XML(open(url))
19
- rescue
20
- nil
21
- end
22
- end
23
-
24
- def self.open_html(url)
25
- begin
26
- Nokogiri::HTML(open(url).read)
27
- rescue
28
- nil
29
- end
30
- end
31
-
32
- def self.remove_generic_urls!(results)
33
- results.reject{|r| URI.parse(r[:url]).path == '/news/' or URI.parse(r[:url]).path == '/news'}
34
- end
35
-
36
- def self.date_from_rss_item(link)
37
- if !link.xpath('pubDate').text.empty?
38
- Date.parse(link.xpath('pubDate').text)
39
- elsif !link.xpath('pubdate').empty?
40
- Date.parse(link.xpath('pubdate').text)
41
- else
42
- nil
43
- end
44
- end
45
-
46
- def self.from_rss(url)
47
- doc = open_rss(url)
48
- return unless doc
49
- links = doc.xpath('//item')
50
- results = links.map do |link|
51
- abs_link = absolute_link(url, link.xpath('link').text)
52
- abs_link = "http://www.burr.senate.gov/public/"+ link.xpath('link').text if url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
53
- abs_link = link.xpath('link').text[37..-1] if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
54
- { :source => url, :url => abs_link, :title => link.xpath('title').text, :date => date_from_rss_item(link), :domain => URI.parse(url).host }
55
- end
56
- remove_generic_urls!(results)
57
- end
58
-
59
- def self.house_gop(url)
60
- doc = open_html(url)
61
- return unless doc
62
- uri = URI.parse(url)
63
- date = Date.parse(uri.query.split('=').last)
64
- links = doc.xpath("//ul[@id='membernews']").search('a')
65
- results = links.map do |link|
66
- abs_link = absolute_link(url, link["href"])
67
- { :source => url, :url => abs_link, :title => link.text.strip, :date => date, :domain => URI.parse(link["href"]).host }
68
- end
69
- remove_generic_urls!(results)
70
- end
71
-
72
- def self.from_scrapers
73
- year = Date.today.year
74
- results = [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, palazzo(page=1), billnelson(year=year),
75
- document_query(page=1), document_query(page=2), donnelly(year=year), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
76
- vitter_cowan(year=year), inhofe(year=year), reid].flatten
77
- remove_generic_urls!(results)
78
- end
79
-
80
- def self.backfill_from_scrapers
81
- results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
82
- document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
83
- boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
84
- ].flatten
85
- remove_generic_urls!(results)
86
- end
87
-
88
- def self.committee_scrapers
89
- year = Date.today.year
90
- results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
91
- senate_indian, senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014), house_energy_minority, house_homeland_security_minority,
92
- house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten
93
- remove_generic_urls!(results)
94
- end
95
-
96
- ## special cases for committees without RSS feeds
97
-
98
- def self.senate_approps_majority
99
- results = []
100
- url = "http://www.appropriations.senate.gov/news.cfm"
101
- doc = open_html(url)
102
- return if doc.nil?
103
- doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
104
- date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
105
- results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'majority' }
106
- end
107
- end
108
- results
109
- end
110
-
111
- def self.senate_approps_minority
112
- results = []
113
- url = "http://www.appropriations.senate.gov/republican.cfm"
114
- doc = open_html(url)
115
- return if doc.nil?
116
- doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
117
- date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
118
- results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'minority' }
119
- end
120
- end
121
- results
122
- end
123
-
124
- def self.senate_banking(year)
125
- results = []
126
- url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
127
- doc = open_html(url)
128
- return if doc.nil?
129
- doc.xpath("//tr").each do |row|
130
- results << { :source => url, :url => "http://www.banking.senate.gov/public/" + row.children[2].children[1]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip+", #{year}"), :domain => "http://www.banking.senate.gov/", :party => 'majority' }
131
- end
132
- results
133
- end
134
-
135
- def self.senate_hsag_majority(year)
136
- results = []
137
- url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
138
- doc = open_html(url)
139
- return if doc.nil?
140
- doc.xpath("//tr").each do |row|
141
- next if row.text.strip.size < 30
142
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'majority' }
143
- end
144
- results
145
- end
146
-
147
- def self.senate_hsag_minority(year)
148
- results = []
149
- url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
150
- doc = open_html(url)
151
- return if doc.nil?
152
- doc.xpath("//tr").each do |row|
153
- next if row.text.strip.size < 30
154
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'minority' }
155
- end
156
- results
157
- end
158
-
159
- def self.senate_indian
160
- results = []
161
- url = "http://www.indian.senate.gov/news/index.cfm"
162
- doc = open_html(url)
163
- return if doc.nil?
164
- doc.xpath("//h3").each do |row|
165
- results << { :source => url, :url => "http://www.indian.senate.gov"+row.children[0]['href'], :title => row.children[0].text, :date => Date.parse(row.previous.previous.text), :domain => "http://www.indian.senate.gov/", :party => 'majority' }
166
- end
167
- results
168
- end
169
-
170
- def self.senate_aging
171
- results = []
172
- url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
173
- doc = open_html(url)
174
- return if doc.nil?
175
- doc.xpath("//tr")[6..104].each do |row|
176
- results << { :source => url, :url => "http://www.aging.senate.gov/"+row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.aging.senate.gov/" }
177
- end
178
- results
179
- end
180
-
181
- def self.senate_smallbiz_minority
182
- results = []
183
- url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
184
- doc = open_html(url)
185
- return if doc.nil?
186
- doc.xpath("//ul[@class='recordList']").each do |row|
187
- results << { :source => url, :url => row.children[0].children[2].children[0]['href'], :title => row.children[0].children[2].children[0].text, :date => Date.parse(row.children[0].children[0].text), :domain => "http://www.sbc.senate.gov/", :party => 'minority' }
188
- end
189
- results
190
- end
191
-
192
- def self.senate_intel(congress, start_year, end_year)
193
- results = []
194
- url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
195
- doc = open_html(url)
196
- return if doc.nil?
197
- doc.xpath("//tr[@valign='top']")[7..-1].each do |row|
198
- results << { :source => url, :url => "http://www.intelligence.senate.gov/press/"+row.children[2].children[0]['href'], :title => row.children[2].children[0].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.intelligence.senate.gov/" }
199
- end
200
- results
201
- end
202
-
203
- def self.house_energy_minority
204
- results = []
205
- url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
206
- doc = open_html(url)
207
- return if doc.nil?
208
- doc.xpath("//div[@class='views-field-title']").each do |row|
209
- results << { :source => url, :url => "http://democrats.energycommerce.house.gov"+row.children[1].children[0]['href'], :title => row.children[1].children[0].text, :date => Date.parse(row.next.next.text.strip), :domain => "http://energycommerce.house.gov/", :party => 'minority' }
210
- end
211
- results
212
- end
213
-
214
- def self.house_homeland_security_minority
215
- results = []
216
- url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
217
- doc = open_html(url)
218
- return if doc.nil?
219
- doc.xpath("//li[@class='article']").each do |row|
220
- results << { :source => url, :url => "http://chsdemocrats.house.gov"+row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text), :domain => "http://chsdemocrats.house.gov/", :party => 'minority' }
221
- end
222
- results
223
- end
224
-
225
- def self.house_judiciary_majority
226
- results = []
227
- url = "http://judiciary.house.gov/news/press2013.html"
228
- doc = open_html(url)
229
- return if doc.nil?
230
- doc.xpath("//p")[3..60].each do |row|
231
- next if row.text.size < 30
232
- results << { :source => url, :url => row.children[5]['href'], :title => row.children[0].text, :date => Date.parse(row.children[1].text.strip), :domain => "http://judiciary.house.gov/", :party => 'majority' }
233
- end
234
- results
235
- end
236
-
237
- def self.house_rules_majority
238
- results = []
239
- url = "http://www.rules.house.gov/News/Default.aspx"
240
- doc = open_html(url)
241
- return if doc.nil?
242
- doc.xpath("//tr")[1..-2].each do |row|
243
- next if row.text.strip.size < 30
244
- results << { :source => url, :url => "http://www.rules.house.gov/News/"+row.children[0].children[1].children[0]['href'], :title => row.children[0].children[1].children[0].text, :date => Date.parse(row.children[2].children[1].text.strip), :domain => "http://www.rules.house.gov/", :party => 'majority' }
245
- end
246
- results
247
- end
248
-
249
- def self.house_ways_means_majority
250
- results = []
251
- url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
252
- doc = open_html(url)
253
- return if doc.nil?
254
- doc.xpath("//ul[@class='UnorderedNewsList']").children.each do |row|
255
- next if row.text.strip.size < 10
256
- results << { :source => url, :url => "http://waysandmeans.house.gov"+row.children[1].children[1]['href'], :title => row.children[1].children[1].text, :date => Date.parse(row.children[3].children[0].text.strip), :domain => "http://waysandmeans.house.gov/", :party => 'majority' }
257
- end
258
- results
259
- end
260
-
261
- ## special cases for members without RSS feeds
262
-
263
- def self.capuano
264
- results = []
265
- base_url = "http://www.house.gov/capuano/news/"
266
- list_url = base_url + 'date.shtml'
267
- doc = open_html(list_url)
268
- return if doc.nil?
269
- doc.xpath("//a").each do |link|
270
- if link['href'] and link['href'].include?('/pr')
271
- begin
272
- date = Date.parse(link.text)
273
- rescue
274
- date = nil
275
- end
276
- results << { :source => list_url, :url => base_url + link['href'], :title => link.text.split(' ',2).last, :date => date, :domain => "www.house.gov/capuano/" }
277
- end
278
- end
279
- return results[0..-5]
280
- end
281
-
282
- def self.cold_fusion(year, month)
283
- results = []
284
- year = Date.today.year if not year
285
- month = 0 if not month
286
- domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
287
- domains.each do |domain|
288
- if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
289
- url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
290
- elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
291
- url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
292
- else
293
- url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
294
- end
295
- doc = open_html(url)
296
- return if doc.nil?
297
- doc.xpath("//tr")[2..-1].each do |row|
298
- date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
299
- next if date_text == 'Date' or date_text.size > 8
300
- date = Date.parse(date_text)
301
- results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
302
- end
303
- end
304
- results.flatten
305
- end
306
-
307
- def self.conaway(page=1)
308
- results = []
309
- base_url = "http://conaway.house.gov/news/"
310
- page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
311
- doc = open_html(page_url)
312
- return if doc.nil?
313
- doc.xpath("//tr")[1..-1].each do |row|
314
- results << { :source => page_url, :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip, :date => Date.parse(row.children.children[4].text), :domain => "conaway.house.gov" }
315
- end
316
- results
317
- end
318
-
319
- def self.susandavis
320
- results = []
321
- base_url = "http://www.house.gov/susandavis/"
322
- doc = open_html(base_url+'news.shtml')
323
- return if doc.nil?
324
- doc.search("ul")[6].children.each do |row|
325
- next if row.text.strip == ''
326
- results << { :source => base_url+'news.shtml', :url => base_url + row.children[1]['href'], :title => row.children[1].text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
327
- end
328
- results
329
- end
330
-
331
- def self.faleomavaega
332
- results = []
333
- base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
334
- doc = open_html(base_url)
335
- return if doc.nil?
336
- doc.xpath("//li[@type='disc']").each do |row|
337
- results << { :source => base_url, :url => "http://www.house.gov/" + row.children[0]['href'], :title => row.children[0].text.gsub(/[u201cu201d]/, '').split('Washington, D.C.').last, :date => Date.parse(row.children[1].text), :domain => "house.gov/faleomavaega" }
338
- end
339
- results
340
- end
341
-
342
- def self.freshman_senators
343
- results = []
344
- ['baldwin', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz'].each do |senator|
345
- base_url = "http://www.#{senator}.senate.gov/"
346
- doc = open_html(base_url+'press.cfm?maxrows=200&startrow=1&&type=1')
347
- return if doc.nil?
348
- doc.xpath("//tr")[3..-1].each do |row|
349
- next if row.text.strip == ''
350
- results << { :source => base_url+'press.cfm?maxrows=200&startrow=1&&type=1', :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip.split.join(' '), :date => Date.parse(row.children.children[0].text), :domain => "#{senator}.senate.gov" }
351
- end
352
- end
353
- results.flatten
354
- end
355
-
356
- def self.klobuchar
357
- results = []
358
- base_url = "http://www.klobuchar.senate.gov/"
359
- [2012,2013].each do |year|
360
- year_url = base_url + "newsreleases.cfm?year=#{year}"
361
- doc = open_html(year_url)
362
- return if doc.nil?
363
- doc.xpath("//dt").each do |row|
364
- results << { :source => year_url, :url => base_url + row.next.children[0]['href'], :title => row.next.text.strip.gsub(/[u201cu201d]/, '').split.join(' '), :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
365
- end
366
- end
367
- results
368
- end
369
-
370
- def self.lujan
371
- results = []
372
- base_url = 'http://lujan.house.gov/'
373
- doc = open_html(base_url+'index.php?option=com_content&view=article&id=981&Itemid=78')
374
- return if doc.nil?
375
- doc.xpath('//ul')[1].children.each do |row|
376
- next if row.text.strip == ''
377
- results << { :source => base_url+'index.php?option=com_content&view=article&id=981&Itemid=78', :url => base_url + row.children[0]['href'], :title => row.children[0].text, :date => nil, :domain => "lujan.house.gov" }
378
- end
379
- results
380
- end
381
-
382
- def self.billnelson(year=2013)
383
- results = []
384
- base_url = "http://www.billnelson.senate.gov/news/"
385
- year_url = base_url + "media.cfm?year=#{year}"
386
- doc = open_html(year_url)
387
- return if doc.nil?
388
- doc.xpath('//li').each do |row|
389
- results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
390
- end
391
- results
392
- end
393
-
394
- # fetches the latest 1000 releases, can be altered
395
- def self.lautenberg(rows=1000)
396
- results = []
397
- base_url = 'http://www.lautenberg.senate.gov/newsroom/'
398
- url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
399
- doc = open_html(url)
400
- return if doc.nil?
401
- doc.xpath("//tr")[4..-2].each do |row|
402
- results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
403
- end
404
- results
405
- end
406
-
407
- def self.crapo
408
- results = []
409
- base_url = "http://www.crapo.senate.gov/media/newsreleases/"
410
- url = base_url + "release_all.cfm"
411
- doc = open_html(url)
412
- return if doc.nil?
413
- doc.xpath("//tr").each do |row|
414
- results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
415
- end
416
- results
417
- end
418
-
419
- def self.coburn(year=Date.today.year)
420
- results = []
421
- url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
422
- doc = open_html(url)
423
- return if doc.nil?
424
- doc.xpath("//tr")[2..-1].each do |row|
425
- next if row.text[0..3] == "Date"
426
- results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
427
- end
428
- results
429
- end
430
-
431
- def self.boxer(start=1)
432
- results = []
433
- url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
434
- domain = 'www.boxer.senate.gov'
435
- doc = open_html(url)
436
- return if doc.nil?
437
- doc.xpath("//div[@class='left']")[1..-1].each do |row|
438
- results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
439
- end
440
- results
441
- end
442
-
443
- def self.mccain(year=Date.today.year)
444
- results = []
445
- url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
446
- domain = 'www.mccain.senate.gov'
447
- doc = open_html(url)
448
- return if doc.nil?
449
- doc.xpath("//li")[7..-1].each do |row|
450
- results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
451
- end
452
- results
453
- end
454
-
455
- def self.vitter_cowan(year=Date.today.year)
456
- results = []
457
- urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
458
- urls.each do |url|
459
- next if year < 2013 and url == "http://www.cowan.senate.gov/"
460
- if url == "http://www.vitter.senate.gov/newsroom/"
461
- domain = "www.vitter.senate.gov"
462
- elsif url == "http://www.cowan.senate.gov/"
463
- domain = "www.cowan.senate.gov"
464
- end
465
- doc = open_html(url+"press?year=#{year}")
466
- return if doc.nil?
467
- doc.xpath("//tr")[1..-1].each do |row|
468
- next if row.text.strip.size < 30
469
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
470
- end
471
- end
472
- results.flatten
473
- end
474
-
475
- def self.donnelly(year=Date.today.year)
476
- results = []
477
- url = "http://www.donnelly.senate.gov/newsroom/"
478
- domain = "www.donnelly.senate.gov"
479
- doc = open_html(url+"press?year=#{year}")
480
- return if doc.nil?
481
- doc.xpath("//tr")[1..-1].each do |row|
482
- next if row.text.strip.size < 30
483
- results << { :source => url, :url => "http://www.donnelly.senate.gov"+row.children[2].children[1]['href'].strip, :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => domain}
484
- end
485
- results
486
- end
487
-
488
- def self.inhofe(year=Date.today.year)
489
- results = []
490
- url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
491
- domain = "www.inhofe.senate.gov"
492
- doc = open_html(url)
493
- return if doc.nil?
494
- doc.xpath("//tr")[1..-1].each do |row|
495
- next if row.text.strip.size < 30
496
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
497
- end
498
- results
499
- end
500
-
501
- def self.levin(page=1)
502
- results = []
503
- url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
504
- domain = "www.levin.senate.gov"
505
- doc = open_html(url)
506
- return if doc.nil?
507
- doc.xpath('//tr').each do |row|
508
- results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
509
- end
510
- results
511
- end
512
-
513
- def self.reid
514
- results = []
515
- url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
516
- domain = "www.reid.senate.gov"
517
- doc = open_html(url)
518
- return if doc.nil?
519
- doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
520
- results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
521
- end
522
- results
523
- end
524
-
525
- def self.palazzo(page=1)
526
- results = []
527
- domain = "palazzo.house.gov"
528
- url = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
529
- doc = open_html(url)
530
- return if doc.nil?
531
- doc.xpath("//div[@class='middlecopy']//li").each do |row|
532
- results << { :source => url, :url => "http://palazzo.house.gov/news/" + row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
533
- end
534
- results
535
- end
536
-
537
- def self.document_query(page=1)
538
- results = []
539
- domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
540
- domains.each do |domain|
541
- doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
542
- return if doc.nil?
543
- doc.xpath("//span[@class='middlecopy']").each do |row|
544
- results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
545
- end
546
- end
547
- results.flatten
548
- end
549
-
550
- end
8
+ extend Utils
551
9
  end
@@ -1,35 +1,34 @@
1
1
  require "minitest/autorun"
2
2
  require_relative "../lib/statement"
3
3
  require 'webmock/minitest'
4
+ include Statement
4
5
 
5
6
  describe Statement do
6
7
  it "parses an rss feed" do
7
8
  @feed_url = "http://ruiz.house.gov/rss.xml"
8
9
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "ruiz_rss.xml")), :status => 200)
9
- @results = Statement::Link.from_rss(@feed_url)
10
+ @results = Feed.from_rss(@feed_url)
10
11
  @results.first[:domain].must_equal "ruiz.house.gov"
11
12
  end
12
13
 
13
14
  it "parses House GOP press release page" do
14
15
  @feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
15
16
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
16
- @results = Statement::Link.house_gop(@feed_url)
17
+ @results = Scraper.house_gop(@feed_url)
17
18
  @results.first[:source].must_equal @feed_url
18
19
  end
19
20
 
20
21
  it "does not attempt to parse dates when none are present" do
21
22
  @feed_url = "http://culberson.house.gov/feed/rss/"
22
23
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "culberson_rss.xml")), :status => 200)
23
-
24
- @results = Statement::Link.from_rss(@feed_url)
24
+ @results = Feed.from_rss(@feed_url)
25
25
  @results.first[:date].must_equal nil
26
26
  end
27
27
 
28
28
  it "parses invalid RSS" do
29
29
  @feed_url = "http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed"
30
30
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "richard_burr.xml")), :status => 200)
31
-
32
- @results = Statement::Link.from_rss(@feed_url)
31
+ @results = Feed.from_rss(@feed_url)
33
32
  @results.first[:url].must_equal "http://www.burr.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&Type=Press Release&ContentRecord_id=65dbea38-d64c-6208-ef8f-2b000e899b3a"
34
33
  @results.first[:date].to_s.must_equal "2013-05-02"
35
34
  end
@@ -37,14 +36,14 @@ describe Statement do
37
36
  it "handles relative URLs" do
38
37
  @feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
39
38
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
40
- @results = Statement::Link.house_gop(@feed_url)
39
+ @results = Scraper.house_gop(@feed_url)
41
40
  @results.last[:url].must_equal "http://www.gop.gov/republicans/other/relative_url_test.html"
42
41
  end
43
42
 
44
43
  it "scrapes a senate cold fusion page" do
45
44
  @url = "http://www.billnelson.senate.gov/news/media.cfm?year=2013"
46
45
  stub_request(:any, @url).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'bill_nelson_press.html')), :status => 200)
47
- @results = Statement::Link.billnelson(year=2013)
46
+ @results = Scraper.billnelson(year=2013)
48
47
  @results.last[:url].must_equal "http://www.billnelson.senate.gov/news/details.cfm?id=338190&"
49
48
  end
50
49
 
@@ -53,7 +52,7 @@ describe Statement do
53
52
  @cowan = "http://www.cowan.senate.gov/press?year=2013"
54
53
  stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
55
54
  stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
56
- @results = Statement::Link.vitter_cowan(year=2013)
55
+ @results = Scraper.vitter_cowan(year=2013)
57
56
  @results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov", "www.cowan.senate.gov"]
58
57
  end
59
58
 
@@ -62,7 +61,7 @@ describe Statement do
62
61
  @cowan = "http://www.cowan.senate.gov/press?year=2012"
63
62
  stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
64
63
  stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
65
- @results = Statement::Link.vitter_cowan(year=2012)
64
+ @results = Scraper.vitter_cowan(year=2012)
66
65
  @results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
67
66
  end
68
67
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statement
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: '0.9'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-05-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
16
- requirement: &2151755980 !ruby/object:Gem::Requirement
16
+ requirement: &2155945560 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '1.3'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2151755980
24
+ version_requirements: *2155945560
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake
27
- requirement: &2151755400 !ruby/object:Gem::Requirement
27
+ requirement: &2155944480 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2151755400
35
+ version_requirements: *2155944480
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: webmock
38
- requirement: &2151754820 !ruby/object:Gem::Requirement
38
+ requirement: &2155943560 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2151754820
46
+ version_requirements: *2155943560
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: american_date
49
- requirement: &2151754360 !ruby/object:Gem::Requirement
49
+ requirement: &2155942580 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *2151754360
57
+ version_requirements: *2155942580
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: nokogiri
60
- requirement: &2151753880 !ruby/object:Gem::Requirement
60
+ requirement: &2155941660 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *2151753880
68
+ version_requirements: *2155941660
69
69
  description: Crawls congressional websites for press releases.
70
70
  email:
71
71
  - dwillis@gmail.com
@@ -79,6 +79,9 @@ files:
79
79
  - README.md
80
80
  - Rakefile
81
81
  - lib/statement.rb
82
+ - lib/statement/feed.rb
83
+ - lib/statement/scraper.rb
84
+ - lib/statement/utils.rb
82
85
  - lib/statement/version.rb
83
86
  - spec/bill_nelson_press.html
84
87
  - spec/cowan_press.html