statement 0.8.2 → 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ # encoding: utf-8
2
+ require 'uri'
3
+ require 'open-uri'
4
+ require 'american_date'
5
+ require 'nokogiri'
6
+ include Statement
7
+
8
module Statement
  # Builds press-release records from RSS feeds.
  #
  # Each record is a Hash with the keys :source, :url, :title, :date and :domain.
  class Feed

    # Fetches +url+ and parses the response body as XML.
    #
    # Returns the parsed document, or nil when fetching or parsing raises
    # (deliberately best-effort: the error is swallowed).
    def self.open_rss(url)
      # Kernel#open no longer opens URLs on Ruby 3.0+, so use URI.open whenever
      # open-uri provides it and fall back to Kernel.open on older Rubies.
      opener = URI.respond_to?(:open) ? URI : Kernel
      # The block form closes the handle as soon as parsing is finished.
      opener.open(url) { |io| Nokogiri::XML(io) }
    rescue
      nil
    end

    # Extracts the publication date of an RSS item.
    #
    # link - an item node responding to xpath; <pubDate> is tried first, then
    #        the lower-case <pubdate> variant.
    #
    # Returns a Date, or nil when neither element has text or the text cannot
    # be parsed as a date.
    def self.date_from_rss_item(link)
      %w[pubDate pubdate].each do |tag|
        text = link.xpath(tag).text
        # An element that is present but blank is treated like a missing one.
        next if text.strip.empty?
        begin
          return Date.parse(text)
        rescue ArgumentError
          # One malformed date should not abort the whole feed.
          return nil
        end
      end
      nil
    end

    # Reads the RSS feed at +url+ and converts every <item> into a record.
    #
    # Returns nil when the feed cannot be opened; otherwise the Array of
    # records after Utils.remove_generic_urls! has filtered it.
    def self.from_rss(url)
      doc = open_rss(url)
      return unless doc
      host = URI.parse(url).host # identical for every item, so parsed once
      results = doc.xpath('//item').map do |item|
        raw_link = item.xpath('link').text
        # Two feeds need their item links rewritten by hand.
        abs_link =
          if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
            raw_link[37..-1]
          elsif url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
            "http://www.burr.senate.gov/public/" + raw_link
          else
            Utils.absolute_link(url, raw_link)
          end
        { :source => url, :url => abs_link, :title => item.xpath('title').text, :date => date_from_rss_item(item), :domain => host }
      end
      Utils.remove_generic_urls!(results)
    end

  end
end
@@ -0,0 +1,521 @@
1
+ # encoding: utf-8
2
+ require 'uri'
3
+ require 'open-uri'
4
+ require 'american_date'
5
+ require 'nokogiri'
6
+
7
module Statement
  # Screen scrapers for congressional press-release listings that are read
  # from HTML pages rather than RSS.
  #
  # Every scraper yields Hashes with :source, :url, :title, :date and :domain
  # keys; some committee scrapers add :party. A single-page scraper returns nil
  # when its page cannot be fetched. Multi-site scrapers skip the failing site
  # and return whatever the other sites produced.
  #
  # The scrapers address nodes by fixed child positions, so they depend on the
  # exact markup of each page.
  class Scraper

    # Fetches +url+ and parses the body as HTML.
    #
    # Returns the parsed document, or nil when fetching or parsing raises
    # (deliberately best-effort: the error is swallowed).
    def self.open_html(url)
      # Kernel#open no longer opens URLs on Ruby 3.0+, so use URI.open whenever
      # open-uri provides it and fall back to Kernel.open on older Rubies.
      opener = URI.respond_to?(:open) ? URI : Kernel
      # The block form closes the handle once the body has been read.
      opener.open(url) { |io| Nokogiri::HTML(io.read) }
    rescue
      nil
    end

    # Scrapes the links inside the <ul id="membernews"> list of +url+.
    # The date shared by all records is parsed from the value after the last
    # '=' in the URL's query string.
    #
    # Returns nil when the page cannot be fetched.
    def self.house_gop(url)
      doc = open_html(url)
      return unless doc
      uri = URI.parse(url)
      date = Date.parse(uri.query.split('=').last)
      links = doc.xpath("//ul[@id='membernews']").search('a')
      results = links.map do |link|
        abs_link = Utils.absolute_link(url, link["href"])
        { :source => url, :url => abs_link, :title => link.text.strip, :date => date, :domain => URI.parse(link["href"]).host }
      end
      Utils.remove_generic_urls!(results)
    end

    # Names of the member scraper methods defined on this class.
    def self.member_methods
      [:capuano, :cold_fusion, :conaway, :susandavis, :faleomavaega, :freshman_senators, :klobuchar, :lujan, :billnelson, :lautenberg, :crapo, :coburn, :boxer, :mccain, :vitter_cowan, :donnelly, :inhofe, :levin, :reid, :palazzo, :document_query]
    end

    # Names of the committee scraper methods defined on this class.
    def self.committee_methods
      [:senate_approps_majority, :senate_approps_minority, :senate_banking, :senate_hsag_majority, :senate_hsag_minority, :senate_indian, :senate_aging, :senate_smallbiz_minority, :senate_intel, :house_energy_minority, :house_homeland_security_minority, :house_judiciary_majority, :house_rules_majority, :house_ways_means_majority]
    end

    # Runs the member scrapers for the current year and merges their records.
    #
    # NOTE(review): :levin is listed in member_methods but is not called here;
    # confirm whether that is intentional.
    def self.member_scrapers
      year = Date.today.year
      results = [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, palazzo(1), billnelson(year),
        document_query(1), document_query(2), donnelly(year), lautenberg, crapo, coburn, boxer(1), mccain(year),
        vitter_cowan(year), inhofe(year), reid].flatten.compact # compact: a scraper whose page failed returns nil
      Utils.remove_generic_urls!(results)
    end

    # Runs the scrapers against earlier years and later result pages.
    def self.backfill_from_scrapers
      results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(2012), document_query(3),
        document_query(4), coburn(2012), coburn(2011), coburn(2010), boxer(11), boxer(21),
        boxer(31), boxer(41), mccain(2012), mccain(2011), vitter_cowan(2012), vitter_cowan(2011)
      ].flatten.compact # compact: a scraper whose page failed returns nil
      Utils.remove_generic_urls!(results)
    end

    # Runs every committee scraper and merges their records.
    def self.committee_scrapers
      year = Date.today.year
      results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
        senate_indian, senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014), house_energy_minority, house_homeland_security_minority,
        house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten.compact # compact: a scraper whose page failed returns nil
      Utils.remove_generic_urls!(results)
    end

    ## special cases for committees without RSS feeds

    # Senate Appropriations, majority page.
    def self.senate_approps_majority
      senate_approps("http://www.appropriations.senate.gov/news.cfm", 'majority')
    end

    # Senate Appropriations, minority page.
    def self.senate_approps_minority
      senate_approps("http://www.appropriations.senate.gov/republican.cfm", 'minority')
    end

    # Shared implementation of the two Senate Appropriations scrapers: each
    # 'newsDateUnderlined' div supplies the date for the non-blank children of
    # the node two siblings after it.
    def self.senate_approps(url, party)
      doc = open_html(url)
      return if doc.nil?
      results = []
      doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
        date.next.next.children.reject { |c| c.text.strip.empty? }.each do |row|
          results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => party }
        end
      end
      results
    end
    private_class_method :senate_approps

    # Senate Banking press releases for +year+; the row's date text has
    # ", <year>" appended before it is parsed.
    def self.senate_banking(year=Date.today.year)
      results = []
      url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr").each do |row|
        results << { :source => url, :url => "http://www.banking.senate.gov/public/" + row.children[2].children[1]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip+", #{year}"), :domain => "http://www.banking.senate.gov/", :party => 'majority' }
      end
      results
    end

    # Senate Homeland Security & Governmental Affairs, majority media for +year+.
    def self.senate_hsag_majority(year=Date.today.year)
      senate_hsag("http://www.hsgac.senate.gov/media/majority-media?year=#{year}", 'majority')
    end

    # Senate Homeland Security & Governmental Affairs, minority media for +year+.
    def self.senate_hsag_minority(year=Date.today.year)
      senate_hsag("http://www.hsgac.senate.gov/media/minority-media?year=#{year}", 'minority')
    end

    # Shared implementation of the two HSGAC scrapers; rows whose stripped text
    # is shorter than 30 characters are skipped.
    def self.senate_hsag(url, party)
      doc = open_html(url)
      return if doc.nil?
      results = []
      doc.xpath("//tr").each do |row|
        next if row.text.strip.size < 30
        results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => party }
      end
      results
    end
    private_class_method :senate_hsag

    # Senate Indian Affairs: one record per <h3>, dated from the node two
    # siblings before it.
    def self.senate_indian
      results = []
      url = "http://www.indian.senate.gov/news/index.cfm"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//h3").each do |row|
        results << { :source => url, :url => "http://www.indian.senate.gov"+row.children[0]['href'], :title => row.children[0].text, :date => Date.parse(row.previous.previous.text), :domain => "http://www.indian.senate.gov/", :party => 'majority' }
      end
      results
    end

    # Senate Aging: table rows 6 through 104 of the press room page.
    def self.senate_aging
      results = []
      url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr")[6..104].each do |row|
        results << { :source => url, :url => "http://www.aging.senate.gov/"+row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.aging.senate.gov/" }
      end
      results
    end

    # Senate Small Business, Republican press room: one record per
    # 'recordList' <ul>, taken from its first child.
    def self.senate_smallbiz_minority
      results = []
      url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//ul[@class='recordList']").each do |row|
        entry = row.children[0]
        results << { :source => url, :url => entry.children[2].children[0]['href'], :title => entry.children[2].children[0].text, :date => Date.parse(entry.children[0].text), :domain => "http://www.sbc.senate.gov/", :party => 'minority' }
      end
      results
    end

    # Senate Intelligence releases for the given congress and year range;
    # the first seven top-aligned rows are skipped.
    def self.senate_intel(congress=113, start_year=2013, end_year=2014)
      results = []
      url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr[@valign='top']")[7..-1].each do |row|
        results << { :source => url, :url => "http://www.intelligence.senate.gov/press/"+row.children[2].children[0]['href'], :title => row.children[2].children[0].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.intelligence.senate.gov/" }
      end
      results
    end

    # House Energy & Commerce Democrats: title divs, each dated from the node
    # two siblings after it.
    def self.house_energy_minority
      results = []
      url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//div[@class='views-field-title']").each do |row|
        results << { :source => url, :url => "http://democrats.energycommerce.house.gov"+row.children[1].children[0]['href'], :title => row.children[1].children[0].text, :date => Date.parse(row.next.next.text.strip), :domain => "http://energycommerce.house.gov/", :party => 'minority' }
      end
      results
    end

    # House Homeland Security Democrats: one record per <li class="article">.
    def self.house_homeland_security_minority
      results = []
      url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//li[@class='article']").each do |row|
        results << { :source => url, :url => "http://chsdemocrats.house.gov"+row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text), :domain => "http://chsdemocrats.house.gov/", :party => 'minority' }
      end
      results
    end

    # House Judiciary 2013 press page: paragraphs 3 through 60, skipping those
    # with fewer than 30 characters of text.
    def self.house_judiciary_majority
      results = []
      url = "http://judiciary.house.gov/news/press2013.html"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//p")[3..60].each do |row|
        next if row.text.size < 30
        results << { :source => url, :url => row.children[5]['href'], :title => row.children[0].text, :date => Date.parse(row.children[1].text.strip), :domain => "http://judiciary.house.gov/", :party => 'majority' }
      end
      results
    end

    # House Rules news table: every row but the first and last, skipping rows
    # with fewer than 30 characters of stripped text.
    def self.house_rules_majority
      results = []
      url = "http://www.rules.house.gov/News/Default.aspx"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr")[1..-2].each do |row|
        next if row.text.strip.size < 30
        anchor = row.children[0].children[1].children[0]
        results << { :source => url, :url => "http://www.rules.house.gov/News/"+anchor['href'], :title => anchor.text, :date => Date.parse(row.children[2].children[1].text.strip), :domain => "http://www.rules.house.gov/", :party => 'majority' }
      end
      results
    end

    # House Ways & Means: children of the 'UnorderedNewsList' list, skipping
    # children with fewer than 10 characters of stripped text.
    def self.house_ways_means_majority
      results = []
      url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//ul[@class='UnorderedNewsList']").children.each do |row|
        next if row.text.strip.size < 10
        anchor = row.children[1].children[1]
        results << { :source => url, :url => "http://waysandmeans.house.gov"+anchor['href'], :title => anchor.text, :date => Date.parse(row.children[3].children[0].text.strip), :domain => "http://waysandmeans.house.gov/", :party => 'majority' }
      end
      results
    end

    ## special cases for members without RSS feeds

    # Rep. Capuano: every link whose href contains '/pr'. The date is parsed
    # from the link text (nil when that fails) and the title is the link text
    # after its first space. The last four collected records are dropped.
    def self.capuano
      results = []
      base_url = "http://www.house.gov/capuano/news/"
      list_url = base_url + 'date.shtml'
      doc = open_html(list_url)
      return if doc.nil?
      doc.xpath("//a").each do |link|
        next unless link['href'] && link['href'].include?('/pr')
        begin
          date = Date.parse(link.text)
        rescue
          date = nil
        end
        results << { :source => list_url, :url => base_url + link['href'], :title => link.text.split(' ',2).last, :date => date, :domain => "www.house.gov/capuano/" }
      end
      results[0..-5]
    end

    # Scrapes several sites that share the same index.cfm listing layout.
    #
    # year  - year to request (nil/false means the current year).
    # month - month to request (nil/false means 0).
    #
    # A site that cannot be fetched is skipped rather than discarding the
    # records already collected from the other sites.
    def self.cold_fusion(year=Date.today.year, month=0)
      results = []
      year ||= Date.today.year
      month ||= 0
      domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
      domains.each do |domain|
        # The listing page has a different name depending on the site.
        section =
          case domain
          when 'crenshaw.house.gov/', 'www.risch.senate.gov/public/' then 'pressreleases'
          when 'www.hoeven.senate.gov/public/', 'www.moran.senate.gov/public/' then 'news-releases'
          else 'press-releases'
          end
        url = "http://#{domain}index.cfm/#{section}?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
        doc = open_html(url)
        next if doc.nil?
        doc.xpath("//tr")[2..-1].each do |row|
          date_text, title = row.children.map { |c| c.text.strip }.reject { |c| c.empty? }
          # Skip rows without text cells, header rows, and rows whose first
          # cell is longer than 8 characters.
          next if date_text.nil? || date_text == 'Date' || date_text.size > 8
          date = Date.parse(date_text)
          results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
        end
      end
      results
    end

    # Rep. Conaway: document query listing, page +page+, skipping the first row.
    def self.conaway(page=1)
      results = []
      base_url = "http://conaway.house.gov/news/"
      page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
      doc = open_html(page_url)
      return if doc.nil?
      doc.xpath("//tr")[1..-1].each do |row|
        cells = row.children.children
        results << { :source => page_url, :url => base_url + cells[1]['href'], :title => cells[1].text.strip, :date => Date.parse(cells[4].text), :domain => "conaway.house.gov" }
      end
      results
    end

    # Rep. Susan Davis: non-blank children of the seventh <ul> on the news page.
    def self.susandavis
      results = []
      base_url = "http://www.house.gov/susandavis/"
      doc = open_html(base_url+'news.shtml')
      return if doc.nil?
      doc.search("ul")[6].children.each do |row|
        next if row.text.strip == ''
        results << { :source => base_url+'news.shtml', :url => base_url + row.children[1]['href'], :title => row.children[1].text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
      end
      results
    end

    # Del. Faleomavaega: one record per <li type="disc">. Curly quotes are
    # stripped from the title and only the text after the last
    # 'Washington, D.C.' is kept.
    def self.faleomavaega
      results = []
      base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
      doc = open_html(base_url)
      return if doc.nil?
      doc.xpath("//li[@type='disc']").each do |row|
        # \u escapes are required: a bare [u201cu201d] class would delete the
        # letters u, c, d and the digits 2, 0, 1 from the title.
        title = row.children[0].text.gsub(/[\u201c\u201d]/, '').split('Washington, D.C.').last
        results << { :source => base_url, :url => "http://www.house.gov/" + row.children[0]['href'], :title => title, :date => Date.parse(row.children[1].text), :domain => "house.gov/faleomavaega" }
      end
      results
    end

    # Scrapes the press.cfm listing of each listed senator, skipping the first
    # three rows and blank rows. A site that cannot be fetched is skipped.
    def self.freshman_senators
      results = []
      ['baldwin', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz'].each do |senator|
        base_url = "http://www.#{senator}.senate.gov/"
        source = base_url+'press.cfm?maxrows=200&startrow=1&&type=1'
        doc = open_html(source)
        next if doc.nil?
        doc.xpath("//tr")[3..-1].each do |row|
          next if row.text.strip == ''
          cells = row.children.children
          results << { :source => source, :url => base_url + cells[1]['href'], :title => cells[1].text.strip.split.join(' '), :date => Date.parse(cells[0].text), :domain => "#{senator}.senate.gov" }
        end
      end
      results
    end

    # Sen. Klobuchar: the 2012 and 2013 listings. Each <dt> holds the date and
    # its next sibling holds the link. A year that cannot be fetched is skipped.
    def self.klobuchar
      results = []
      base_url = "http://www.klobuchar.senate.gov/"
      [2012,2013].each do |year|
        year_url = base_url + "newsreleases.cfm?year=#{year}"
        doc = open_html(year_url)
        next if doc.nil?
        doc.xpath("//dt").each do |row|
          # \u escapes are required to strip curly quotes (see faleomavaega).
          title = row.next.text.strip.gsub(/[\u201c\u201d]/, '').split.join(' ')
          results << { :source => year_url, :url => base_url + row.next.children[0]['href'], :title => title, :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
        end
      end
      results
    end

    # Rep. Lujan: non-blank children of the second <ul>; records carry no date.
    def self.lujan
      results = []
      base_url = 'http://lujan.house.gov/'
      source = base_url+'index.php?option=com_content&view=article&id=981&Itemid=78'
      doc = open_html(source)
      return if doc.nil?
      doc.xpath('//ul')[1].children.each do |row|
        next if row.text.strip == ''
        results << { :source => source, :url => base_url + row.children[0]['href'], :title => row.children[0].text, :date => nil, :domain => "lujan.house.gov" }
      end
      results
    end

    # Sen. Bill Nelson: media listing for +year+, one record per <li>.
    def self.billnelson(year=2013)
      results = []
      base_url = "http://www.billnelson.senate.gov/news/"
      year_url = base_url + "media.cfm?year=#{year}"
      doc = open_html(year_url)
      return if doc.nil?
      doc.xpath('//li').each do |row|
        results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
      end
      results
    end

    # fetches the latest 1000 releases, can be altered
    # (+rows+ is sent as the maxrows query parameter; the first four rows and
    # the last row of the table are skipped)
    def self.lautenberg(rows=1000)
      results = []
      base_url = 'http://www.lautenberg.senate.gov/newsroom/'
      url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr")[4..-2].each do |row|
        results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
      end
      results
    end

    # Sen. Crapo: all releases; '-' in the date text is replaced with '/'
    # before parsing.
    def self.crapo
      results = []
      base_url = "http://www.crapo.senate.gov/media/newsreleases/"
      url = base_url + "release_all.cfm"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr").each do |row|
        results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
      end
      results
    end

    # Sen. Coburn: releases for +year+, skipping the first two rows and any row
    # whose text starts with "Date".
    def self.coburn(year=Date.today.year)
      results = []
      url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr")[2..-1].each do |row|
        next if row.text[0..3] == "Date"
        results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
      end
      results
    end

    # Sen. Boxer: releases listing beginning at offset +start+.
    #
    # NOTE(review): :url is the bare host plus the href, without a scheme;
    # confirm whether "http://" should be prepended.
    def self.boxer(start=1)
      results = []
      url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
      domain = 'www.boxer.senate.gov'
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//div[@class='left']")[1..-1].each do |row|
        anchor = row.next.next.children[1].children[0]
        results << { :source => url, :url => domain + anchor['href'], :title => anchor.text, :date => Date.parse(row.text.strip), :domain => domain}
      end
      results
    end

    # Sen. McCain: releases for +year+, skipping the first seven <li> elements.
    #
    # NOTE(review): :url is the bare host plus the href, without a scheme;
    # confirm whether "http://" should be prepended.
    def self.mccain(year=Date.today.year)
      results = []
      url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
      domain = 'www.mccain.senate.gov'
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//li")[7..-1].each do |row|
        entry = row.children[3].children[1]
        results << { :source => url, :url => domain + entry.children[4].children[0]['href'], :title => entry.children[4].text, :date => Date.parse(entry.children[0].text), :domain => domain}
      end
      results
    end

    # Sens. Vitter and Cowan: press listings for +year+. The Cowan site is
    # only queried for 2013 and later. A site that cannot be fetched is skipped.
    def self.vitter_cowan(year=Date.today.year)
      results = []
      sites = [
        ["http://www.vitter.senate.gov/newsroom/", "www.vitter.senate.gov"],
        ["http://www.cowan.senate.gov/", "www.cowan.senate.gov"]
      ]
      sites.each do |url, domain|
        next if year < 2013 && url == "http://www.cowan.senate.gov/"
        doc = open_html(url+"press?year=#{year}")
        next if doc.nil?
        doc.xpath("//tr")[1..-1].each do |row|
          next if row.text.strip.size < 30
          results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
        end
      end
      results
    end

    # Sen. Donnelly: press listing for +year+, skipping the first row and rows
    # with fewer than 30 characters of stripped text.
    def self.donnelly(year=Date.today.year)
      results = []
      url = "http://www.donnelly.senate.gov/newsroom/"
      domain = "www.donnelly.senate.gov"
      doc = open_html(url+"press?year=#{year}")
      return if doc.nil?
      doc.xpath("//tr")[1..-1].each do |row|
        next if row.text.strip.size < 30
        results << { :source => url, :url => "http://www.donnelly.senate.gov"+row.children[2].children[1]['href'].strip, :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => domain}
      end
      results
    end

    # Sen. Inhofe: press releases for +year+, skipping the first row and rows
    # with fewer than 30 characters of stripped text.
    def self.inhofe(year=Date.today.year)
      results = []
      url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
      domain = "www.inhofe.senate.gov"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//tr")[1..-1].each do |row|
        next if row.text.strip.size < 30
        results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
      end
      results
    end

    # Sen. Levin: press listing page +page+; all whitespace is removed from
    # each href.
    def self.levin(page=1)
      results = []
      url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
      domain = "www.levin.senate.gov"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath('//tr').each do |row|
        anchor = row.children[2].children[0]
        results << { :source => url, :url => anchor['href'].gsub(/\s+/, ""), :title => anchor.text, :date => Date.parse(row.children[0].text), :domain => domain}
      end
      results
    end

    # Sen. Reid: rows of the table with id 'CS_PgIndex_21891_21893', skipping
    # the first row.
    def self.reid
      results = []
      url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
      domain = "www.reid.senate.gov"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
        cell = row.children[0]
        results << { :source => url, :url => "http://www.reid.senate.gov"+cell.children[0]['href'], :title => cell.children[0].text, :date => Date.parse(cell.children[2].text), :domain => domain}
      end
      results
    end

    # Rep. Palazzo: document query listing, page +page+.
    def self.palazzo(page=1)
      results = []
      domain = "palazzo.house.gov"
      url = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
      doc = open_html(url)
      return if doc.nil?
      doc.xpath("//div[@class='middlecopy']//li").each do |row|
        results << { :source => url, :url => "http://palazzo.house.gov/news/" + row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
      end
      results
    end

    # Scrapes page +page+ of the documentquery.aspx listing on several House
    # sites; each host is paired with the DocumentTypeID sent in the query.
    # A site that cannot be fetched is skipped.
    def self.document_query(page=1)
      results = []
      sites = [["roe.house.gov", 1532], ["thornberry.house.gov", 1776], ["wenstrup.house.gov", 2491]]
      sites.each do |host, type_id|
        news_url = "http://"+host+"/news/"
        source = news_url+"documentquery.aspx?DocumentTypeID=#{type_id}&Page=#{page}"
        doc = open_html(source)
        next if doc.nil?
        doc.xpath("//span[@class='middlecopy']").each do |row|
          results << { :source => source, :url => news_url + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => host }
        end
      end
      results
    end

  end
end
@@ -0,0 +1,12 @@
1
+ require 'uri'
2
+
3
# Helpers shared by the feed reader and the scrapers.
module Utils
  # Paths that point at a generic news landing page rather than a release.
  GENERIC_PATHS = ['/news/', '/news'].freeze

  # URI.escape was removed in Ruby 3.0. The RFC 2396 parser still provides the
  # same escaping; on Rubies that predate URI::RFC2396_Parser it is URI::Parser.
  ESCAPER = (defined?(URI::RFC2396_Parser) ? URI::RFC2396_Parser : URI::Parser).new

  # Turns +link+ into an absolute URL.
  #
  # url  - the page the link was found on; only its host is used.
  # link - the link text or href.
  #
  # Returns +link+ unchanged when it already starts with http:// or https://,
  # otherwise "http://<host of url>/<link>".
  def self.absolute_link(url, link)
    # https links are absolute too; treating them as relative produced
    # "http://host/https://..." URLs.
    return link if link =~ %r{\Ahttps?://}i
    ("http://"+URI.parse(url).host + "/"+link).to_s
  end

  # Filters out records whose :url path is exactly '/news' or '/news/'.
  #
  # results - Array of record Hashes; nil entries are dropped.
  #
  # Returns a new Array. Despite the bang in the name, +results+ itself is
  # not modified. Records whose URL is missing or cannot be parsed are kept.
  def self.remove_generic_urls!(results)
    results.compact.reject { |record| GENERIC_PATHS.include?(path_of(record[:url])) }
  end

  # Path component of +url+ after escaping, or nil when it cannot be parsed.
  def self.path_of(url)
    URI.parse(ESCAPER.escape(url.to_s)).path
  rescue URI::InvalidURIError
    nil
  end
  private_class_method :path_of
end
@@ -1,3 +1,3 @@
1
1
  module Statement
2
- VERSION = "0.8.2"
2
+ VERSION = "0.9"
3
3
  end
data/lib/statement.rb CHANGED
@@ -1,551 +1,9 @@
1
- # encoding: utf-8
2
1
  require "statement/version"
3
- require 'uri'
4
- require 'open-uri'
5
- require 'american_date'
6
- require 'nokogiri'
2
+ require "statement/feed"
3
+ require "statement/scraper"
4
+ require "statement/utils"
5
+ include Statement
7
6
 
8
7
  module Statement
9
-
10
- class Link
11
- def self.absolute_link(url, link)
12
- return link if link =~ /^http:\/\//
13
- ("http://"+URI.parse(url).host + "/"+link).to_s
14
- end
15
-
16
- def self.open_rss(url)
17
- begin
18
- Nokogiri::XML(open(url))
19
- rescue
20
- nil
21
- end
22
- end
23
-
24
- def self.open_html(url)
25
- begin
26
- Nokogiri::HTML(open(url).read)
27
- rescue
28
- nil
29
- end
30
- end
31
-
32
- def self.remove_generic_urls!(results)
33
- results.reject{|r| URI.parse(r[:url]).path == '/news/' or URI.parse(r[:url]).path == '/news'}
34
- end
35
-
36
- def self.date_from_rss_item(link)
37
- if !link.xpath('pubDate').text.empty?
38
- Date.parse(link.xpath('pubDate').text)
39
- elsif !link.xpath('pubdate').empty?
40
- Date.parse(link.xpath('pubdate').text)
41
- else
42
- nil
43
- end
44
- end
45
-
46
- def self.from_rss(url)
47
- doc = open_rss(url)
48
- return unless doc
49
- links = doc.xpath('//item')
50
- results = links.map do |link|
51
- abs_link = absolute_link(url, link.xpath('link').text)
52
- abs_link = "http://www.burr.senate.gov/public/"+ link.xpath('link').text if url == 'http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed'
53
- abs_link = link.xpath('link').text[37..-1] if url == "http://www.johanns.senate.gov/public/?a=RSS.Feed"
54
- { :source => url, :url => abs_link, :title => link.xpath('title').text, :date => date_from_rss_item(link), :domain => URI.parse(url).host }
55
- end
56
- remove_generic_urls!(results)
57
- end
58
-
59
- def self.house_gop(url)
60
- doc = open_html(url)
61
- return unless doc
62
- uri = URI.parse(url)
63
- date = Date.parse(uri.query.split('=').last)
64
- links = doc.xpath("//ul[@id='membernews']").search('a')
65
- results = links.map do |link|
66
- abs_link = absolute_link(url, link["href"])
67
- { :source => url, :url => abs_link, :title => link.text.strip, :date => date, :domain => URI.parse(link["href"]).host }
68
- end
69
- remove_generic_urls!(results)
70
- end
71
-
72
- def self.from_scrapers
73
- year = Date.today.year
74
- results = [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, palazzo(page=1), billnelson(year=year),
75
- document_query(page=1), document_query(page=2), donnelly(year=year), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
76
- vitter_cowan(year=year), inhofe(year=year), reid].flatten
77
- remove_generic_urls!(results)
78
- end
79
-
80
- def self.backfill_from_scrapers
81
- results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
82
- document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
83
- boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
84
- ].flatten
85
- remove_generic_urls!(results)
86
- end
87
-
88
- def self.committee_scrapers
89
- year = Date.today.year
90
- results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
91
- senate_indian, senate_aging, senate_smallbiz_minority, senate_intel(113, 2013, 2014), house_energy_minority, house_homeland_security_minority,
92
- house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten
93
- remove_generic_urls!(results)
94
- end
95
-
96
- ## special cases for committees without RSS feeds
97
-
98
- def self.senate_approps_majority
99
- results = []
100
- url = "http://www.appropriations.senate.gov/news.cfm"
101
- doc = open_html(url)
102
- return if doc.nil?
103
- doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
104
- date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
105
- results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'majority' }
106
- end
107
- end
108
- results
109
- end
110
-
111
- def self.senate_approps_minority
112
- results = []
113
- url = "http://www.appropriations.senate.gov/republican.cfm"
114
- doc = open_html(url)
115
- return if doc.nil?
116
- doc.xpath("//div[@class='newsDateUnderlined']").each do |date|
117
- date.next.next.children.reject{|c| c.text.strip.empty?}.each do |row|
118
- results << { :source => url, :url => url + row.children[0]['href'], :title => row.text, :date => Date.parse(date.text), :domain => "http://www.appropriations.senate.gov/", :party => 'minority' }
119
- end
120
- end
121
- results
122
- end
123
-
124
- def self.senate_banking(year)
125
- results = []
126
- url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
127
- doc = open_html(url)
128
- return if doc.nil?
129
- doc.xpath("//tr").each do |row|
130
- results << { :source => url, :url => "http://www.banking.senate.gov/public/" + row.children[2].children[1]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip+", #{year}"), :domain => "http://www.banking.senate.gov/", :party => 'majority' }
131
- end
132
- results
133
- end
134
-
135
- def self.senate_hsag_majority(year)
136
- results = []
137
- url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
138
- doc = open_html(url)
139
- return if doc.nil?
140
- doc.xpath("//tr").each do |row|
141
- next if row.text.strip.size < 30
142
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'majority' }
143
- end
144
- results
145
- end
146
-
147
- def self.senate_hsag_minority(year)
148
- results = []
149
- url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
150
- doc = open_html(url)
151
- return if doc.nil?
152
- doc.xpath("//tr").each do |row|
153
- next if row.text.strip.size < 30
154
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => "http://www.hsgac.senate.gov/", :party => 'minority' }
155
- end
156
- results
157
- end
158
-
159
- def self.senate_indian
160
- results = []
161
- url = "http://www.indian.senate.gov/news/index.cfm"
162
- doc = open_html(url)
163
- return if doc.nil?
164
- doc.xpath("//h3").each do |row|
165
- results << { :source => url, :url => "http://www.indian.senate.gov"+row.children[0]['href'], :title => row.children[0].text, :date => Date.parse(row.previous.previous.text), :domain => "http://www.indian.senate.gov/", :party => 'majority' }
166
- end
167
- results
168
- end
169
-
170
- def self.senate_aging
171
- results = []
172
- url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
173
- doc = open_html(url)
174
- return if doc.nil?
175
- doc.xpath("//tr")[6..104].each do |row|
176
- results << { :source => url, :url => "http://www.aging.senate.gov/"+row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.aging.senate.gov/" }
177
- end
178
- results
179
- end
180
-
181
- def self.senate_smallbiz_minority
182
- results = []
183
- url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
184
- doc = open_html(url)
185
- return if doc.nil?
186
- doc.xpath("//ul[@class='recordList']").each do |row|
187
- results << { :source => url, :url => row.children[0].children[2].children[0]['href'], :title => row.children[0].children[2].children[0].text, :date => Date.parse(row.children[0].children[0].text), :domain => "http://www.sbc.senate.gov/", :party => 'minority' }
188
- end
189
- results
190
- end
191
-
192
- def self.senate_intel(congress, start_year, end_year)
193
- results = []
194
- url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
195
- doc = open_html(url)
196
- return if doc.nil?
197
- doc.xpath("//tr[@valign='top']")[7..-1].each do |row|
198
- results << { :source => url, :url => "http://www.intelligence.senate.gov/press/"+row.children[2].children[0]['href'], :title => row.children[2].children[0].text.strip, :date => Date.parse(row.children[0].text), :domain => "http://www.intelligence.senate.gov/" }
199
- end
200
- results
201
- end
202
-
203
- def self.house_energy_minority
204
- results = []
205
- url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
206
- doc = open_html(url)
207
- return if doc.nil?
208
- doc.xpath("//div[@class='views-field-title']").each do |row|
209
- results << { :source => url, :url => "http://democrats.energycommerce.house.gov"+row.children[1].children[0]['href'], :title => row.children[1].children[0].text, :date => Date.parse(row.next.next.text.strip), :domain => "http://energycommerce.house.gov/", :party => 'minority' }
210
- end
211
- results
212
- end
213
-
214
- def self.house_homeland_security_minority
215
- results = []
216
- url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
217
- doc = open_html(url)
218
- return if doc.nil?
219
- doc.xpath("//li[@class='article']").each do |row|
220
- results << { :source => url, :url => "http://chsdemocrats.house.gov"+row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text), :domain => "http://chsdemocrats.house.gov/", :party => 'minority' }
221
- end
222
- results
223
- end
224
-
225
- def self.house_judiciary_majority
226
- results = []
227
- url = "http://judiciary.house.gov/news/press2013.html"
228
- doc = open_html(url)
229
- return if doc.nil?
230
- doc.xpath("//p")[3..60].each do |row|
231
- next if row.text.size < 30
232
- results << { :source => url, :url => row.children[5]['href'], :title => row.children[0].text, :date => Date.parse(row.children[1].text.strip), :domain => "http://judiciary.house.gov/", :party => 'majority' }
233
- end
234
- results
235
- end
236
-
237
- def self.house_rules_majority
238
- results = []
239
- url = "http://www.rules.house.gov/News/Default.aspx"
240
- doc = open_html(url)
241
- return if doc.nil?
242
- doc.xpath("//tr")[1..-2].each do |row|
243
- next if row.text.strip.size < 30
244
- results << { :source => url, :url => "http://www.rules.house.gov/News/"+row.children[0].children[1].children[0]['href'], :title => row.children[0].children[1].children[0].text, :date => Date.parse(row.children[2].children[1].text.strip), :domain => "http://www.rules.house.gov/", :party => 'majority' }
245
- end
246
- results
247
- end
248
-
249
- def self.house_ways_means_majority
250
- results = []
251
- url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
252
- doc = open_html(url)
253
- return if doc.nil?
254
- doc.xpath("//ul[@class='UnorderedNewsList']").children.each do |row|
255
- next if row.text.strip.size < 10
256
- results << { :source => url, :url => "http://waysandmeans.house.gov"+row.children[1].children[1]['href'], :title => row.children[1].children[1].text, :date => Date.parse(row.children[3].children[0].text.strip), :domain => "http://waysandmeans.house.gov/", :party => 'majority' }
257
- end
258
- results
259
- end
260
-
261
- ## special cases for members without RSS feeds
262
-
263
- def self.capuano
264
- results = []
265
- base_url = "http://www.house.gov/capuano/news/"
266
- list_url = base_url + 'date.shtml'
267
- doc = open_html(list_url)
268
- return if doc.nil?
269
- doc.xpath("//a").each do |link|
270
- if link['href'] and link['href'].include?('/pr')
271
- begin
272
- date = Date.parse(link.text)
273
- rescue
274
- date = nil
275
- end
276
- results << { :source => list_url, :url => base_url + link['href'], :title => link.text.split(' ',2).last, :date => date, :domain => "www.house.gov/capuano/" }
277
- end
278
- end
279
- return results[0..-5]
280
- end
281
-
282
- def self.cold_fusion(year, month)
283
- results = []
284
- year = Date.today.year if not year
285
- month = 0 if not month
286
- domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
287
- domains.each do |domain|
288
- if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
289
- url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
290
- elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
291
- url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
292
- else
293
- url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
294
- end
295
- doc = open_html(url)
296
- return if doc.nil?
297
- doc.xpath("//tr")[2..-1].each do |row|
298
- date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
299
- next if date_text == 'Date' or date_text.size > 8
300
- date = Date.parse(date_text)
301
- results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
302
- end
303
- end
304
- results.flatten
305
- end
306
-
307
- def self.conaway(page=1)
308
- results = []
309
- base_url = "http://conaway.house.gov/news/"
310
- page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
311
- doc = open_html(page_url)
312
- return if doc.nil?
313
- doc.xpath("//tr")[1..-1].each do |row|
314
- results << { :source => page_url, :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip, :date => Date.parse(row.children.children[4].text), :domain => "conaway.house.gov" }
315
- end
316
- results
317
- end
318
-
319
- def self.susandavis
320
- results = []
321
- base_url = "http://www.house.gov/susandavis/"
322
- doc = open_html(base_url+'news.shtml')
323
- return if doc.nil?
324
- doc.search("ul")[6].children.each do |row|
325
- next if row.text.strip == ''
326
- results << { :source => base_url+'news.shtml', :url => base_url + row.children[1]['href'], :title => row.children[1].text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
327
- end
328
- results
329
- end
330
-
331
- def self.faleomavaega
332
- results = []
333
- base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
334
- doc = open_html(base_url)
335
- return if doc.nil?
336
- doc.xpath("//li[@type='disc']").each do |row|
337
- results << { :source => base_url, :url => "http://www.house.gov/" + row.children[0]['href'], :title => row.children[0].text.gsub(/[u201cu201d]/, '').split('Washington, D.C.').last, :date => Date.parse(row.children[1].text), :domain => "house.gov/faleomavaega" }
338
- end
339
- results
340
- end
341
-
342
- def self.freshman_senators
343
- results = []
344
- ['baldwin', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz'].each do |senator|
345
- base_url = "http://www.#{senator}.senate.gov/"
346
- doc = open_html(base_url+'press.cfm?maxrows=200&startrow=1&&type=1')
347
- return if doc.nil?
348
- doc.xpath("//tr")[3..-1].each do |row|
349
- next if row.text.strip == ''
350
- results << { :source => base_url+'press.cfm?maxrows=200&startrow=1&&type=1', :url => base_url + row.children.children[1]['href'], :title => row.children.children[1].text.strip.split.join(' '), :date => Date.parse(row.children.children[0].text), :domain => "#{senator}.senate.gov" }
351
- end
352
- end
353
- results.flatten
354
- end
355
-
356
- def self.klobuchar
357
- results = []
358
- base_url = "http://www.klobuchar.senate.gov/"
359
- [2012,2013].each do |year|
360
- year_url = base_url + "newsreleases.cfm?year=#{year}"
361
- doc = open_html(year_url)
362
- return if doc.nil?
363
- doc.xpath("//dt").each do |row|
364
- results << { :source => year_url, :url => base_url + row.next.children[0]['href'], :title => row.next.text.strip.gsub(/[u201cu201d]/, '').split.join(' '), :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
365
- end
366
- end
367
- results
368
- end
369
-
370
- def self.lujan
371
- results = []
372
- base_url = 'http://lujan.house.gov/'
373
- doc = open_html(base_url+'index.php?option=com_content&view=article&id=981&Itemid=78')
374
- return if doc.nil?
375
- doc.xpath('//ul')[1].children.each do |row|
376
- next if row.text.strip == ''
377
- results << { :source => base_url+'index.php?option=com_content&view=article&id=981&Itemid=78', :url => base_url + row.children[0]['href'], :title => row.children[0].text, :date => nil, :domain => "lujan.house.gov" }
378
- end
379
- results
380
- end
381
-
382
- def self.billnelson(year=2013)
383
- results = []
384
- base_url = "http://www.billnelson.senate.gov/news/"
385
- year_url = base_url + "media.cfm?year=#{year}"
386
- doc = open_html(year_url)
387
- return if doc.nil?
388
- doc.xpath('//li').each do |row|
389
- results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
390
- end
391
- results
392
- end
393
-
394
- # fetches the latest 1000 releases, can be altered
395
- def self.lautenberg(rows=1000)
396
- results = []
397
- base_url = 'http://www.lautenberg.senate.gov/newsroom/'
398
- url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
399
- doc = open_html(url)
400
- return if doc.nil?
401
- doc.xpath("//tr")[4..-2].each do |row|
402
- results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
403
- end
404
- results
405
- end
406
-
407
- def self.crapo
408
- results = []
409
- base_url = "http://www.crapo.senate.gov/media/newsreleases/"
410
- url = base_url + "release_all.cfm"
411
- doc = open_html(url)
412
- return if doc.nil?
413
- doc.xpath("//tr").each do |row|
414
- results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
415
- end
416
- results
417
- end
418
-
419
- def self.coburn(year=Date.today.year)
420
- results = []
421
- url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
422
- doc = open_html(url)
423
- return if doc.nil?
424
- doc.xpath("//tr")[2..-1].each do |row|
425
- next if row.text[0..3] == "Date"
426
- results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
427
- end
428
- results
429
- end
430
-
431
- def self.boxer(start=1)
432
- results = []
433
- url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
434
- domain = 'www.boxer.senate.gov'
435
- doc = open_html(url)
436
- return if doc.nil?
437
- doc.xpath("//div[@class='left']")[1..-1].each do |row|
438
- results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
439
- end
440
- results
441
- end
442
-
443
- def self.mccain(year=Date.today.year)
444
- results = []
445
- url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
446
- domain = 'www.mccain.senate.gov'
447
- doc = open_html(url)
448
- return if doc.nil?
449
- doc.xpath("//li")[7..-1].each do |row|
450
- results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
451
- end
452
- results
453
- end
454
-
455
- def self.vitter_cowan(year=Date.today.year)
456
- results = []
457
- urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
458
- urls.each do |url|
459
- next if year < 2013 and url == "http://www.cowan.senate.gov/"
460
- if url == "http://www.vitter.senate.gov/newsroom/"
461
- domain = "www.vitter.senate.gov"
462
- elsif url == "http://www.cowan.senate.gov/"
463
- domain = "www.cowan.senate.gov"
464
- end
465
- doc = open_html(url+"press?year=#{year}")
466
- return if doc.nil?
467
- doc.xpath("//tr")[1..-1].each do |row|
468
- next if row.text.strip.size < 30
469
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
470
- end
471
- end
472
- results.flatten
473
- end
474
-
475
- def self.donnelly(year=Date.today.year)
476
- results = []
477
- url = "http://www.donnelly.senate.gov/newsroom/"
478
- domain = "www.donnelly.senate.gov"
479
- doc = open_html(url+"press?year=#{year}")
480
- return if doc.nil?
481
- doc.xpath("//tr")[1..-1].each do |row|
482
- next if row.text.strip.size < 30
483
- results << { :source => url, :url => "http://www.donnelly.senate.gov"+row.children[2].children[1]['href'].strip, :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text), :domain => domain}
484
- end
485
- results
486
- end
487
-
488
- def self.inhofe(year=Date.today.year)
489
- results = []
490
- url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
491
- domain = "www.inhofe.senate.gov"
492
- doc = open_html(url)
493
- return if doc.nil?
494
- doc.xpath("//tr")[1..-1].each do |row|
495
- next if row.text.strip.size < 30
496
- results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
497
- end
498
- results
499
- end
500
-
501
- def self.levin(page=1)
502
- results = []
503
- url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
504
- domain = "www.levin.senate.gov"
505
- doc = open_html(url)
506
- return if doc.nil?
507
- doc.xpath('//tr').each do |row|
508
- results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
509
- end
510
- results
511
- end
512
-
513
- def self.reid
514
- results = []
515
- url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
516
- domain = "www.reid.senate.gov"
517
- doc = open_html(url)
518
- return if doc.nil?
519
- doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
520
- results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
521
- end
522
- results
523
- end
524
-
525
- def self.palazzo(page=1)
526
- results = []
527
- domain = "palazzo.house.gov"
528
- url = "http://palazzo.house.gov/news/documentquery.aspx?DocumentTypeID=2519&Page=#{page}"
529
- doc = open_html(url)
530
- return if doc.nil?
531
- doc.xpath("//div[@class='middlecopy']//li").each do |row|
532
- results << { :source => url, :url => "http://palazzo.house.gov/news/" + row.children[1]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
533
- end
534
- results
535
- end
536
-
537
- def self.document_query(page=1)
538
- results = []
539
- domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
540
- domains.each do |domain|
541
- doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
542
- return if doc.nil?
543
- doc.xpath("//span[@class='middlecopy']").each do |row|
544
- results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
545
- end
546
- end
547
- results.flatten
548
- end
549
-
550
- end
8
+ extend Utils
551
9
  end
@@ -1,35 +1,34 @@
1
1
  require "minitest/autorun"
2
2
  require_relative "../lib/statement"
3
3
  require 'webmock/minitest'
4
+ include Statement
4
5
 
5
6
  describe Statement do
6
7
  it "parses an rss feed" do
7
8
  @feed_url = "http://ruiz.house.gov/rss.xml"
8
9
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "ruiz_rss.xml")), :status => 200)
9
- @results = Statement::Link.from_rss(@feed_url)
10
+ @results = Feed.from_rss(@feed_url)
10
11
  @results.first[:domain].must_equal "ruiz.house.gov"
11
12
  end
12
13
 
13
14
  it "parses House GOP press release page" do
14
15
  @feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
15
16
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
16
- @results = Statement::Link.house_gop(@feed_url)
17
+ @results = Scraper.house_gop(@feed_url)
17
18
  @results.first[:source].must_equal @feed_url
18
19
  end
19
20
 
20
21
  it "does not attempt to parse dates when none are present" do
21
22
  @feed_url = "http://culberson.house.gov/feed/rss/"
22
23
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "culberson_rss.xml")), :status => 200)
23
-
24
- @results = Statement::Link.from_rss(@feed_url)
24
+ @results = Feed.from_rss(@feed_url)
25
25
  @results.first[:date].must_equal nil
26
26
  end
27
27
 
28
28
  it "parses invalid RSS" do
29
29
  @feed_url = "http://www.burr.senate.gov/public/index.cfm?FuseAction=RSS.Feed"
30
30
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "richard_burr.xml")), :status => 200)
31
-
32
- @results = Statement::Link.from_rss(@feed_url)
31
+ @results = Feed.from_rss(@feed_url)
33
32
  @results.first[:url].must_equal "http://www.burr.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&Type=Press Release&ContentRecord_id=65dbea38-d64c-6208-ef8f-2b000e899b3a"
34
33
  @results.first[:date].to_s.must_equal "2013-05-02"
35
34
  end
@@ -37,14 +36,14 @@ describe Statement do
37
36
  it "handles relative URLs" do
38
37
  @feed_url = "http://www.gop.gov/republicans/news?offset=03/29/13"
39
38
  stub_request(:any, @feed_url).to_return(:body => File.new(File.join(File.dirname(__FILE__), "house_gop_releases.html")), :status => 200)
40
- @results = Statement::Link.house_gop(@feed_url)
39
+ @results = Scraper.house_gop(@feed_url)
41
40
  @results.last[:url].must_equal "http://www.gop.gov/republicans/other/relative_url_test.html"
42
41
  end
43
42
 
44
43
  it "scrapes a senate cold fusion page" do
45
44
  @url = "http://www.billnelson.senate.gov/news/media.cfm?year=2013"
46
45
  stub_request(:any, @url).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'bill_nelson_press.html')), :status => 200)
47
- @results = Statement::Link.billnelson(year=2013)
46
+ @results = Scraper.billnelson(year=2013)
48
47
  @results.last[:url].must_equal "http://www.billnelson.senate.gov/news/details.cfm?id=338190&"
49
48
  end
50
49
 
@@ -53,7 +52,7 @@ describe Statement do
53
52
  @cowan = "http://www.cowan.senate.gov/press?year=2013"
54
53
  stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
55
54
  stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
56
- @results = Statement::Link.vitter_cowan(year=2013)
55
+ @results = Scraper.vitter_cowan(year=2013)
57
56
  @results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov", "www.cowan.senate.gov"]
58
57
  end
59
58
 
@@ -62,7 +61,7 @@ describe Statement do
62
61
  @cowan = "http://www.cowan.senate.gov/press?year=2012"
63
62
  stub_request(:any, @vitter).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'vitter_press.html')), :status => 200)
64
63
  stub_request(:any, @cowan).to_return(:body => File.new(File.join(File.dirname(__FILE__), 'cowan_press.html')), :status => 200)
65
- @results = Statement::Link.vitter_cowan(year=2012)
64
+ @results = Scraper.vitter_cowan(year=2012)
66
65
  @results.map{|r| r[:domain]}.uniq.must_equal ["www.vitter.senate.gov"]
67
66
  end
68
67
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statement
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.2
4
+ version: '0.9'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2013-05-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler
16
- requirement: &2151755980 !ruby/object:Gem::Requirement
16
+ requirement: &2155945560 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '1.3'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2151755980
24
+ version_requirements: *2155945560
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rake
27
- requirement: &2151755400 !ruby/object:Gem::Requirement
27
+ requirement: &2155944480 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2151755400
35
+ version_requirements: *2155944480
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: webmock
38
- requirement: &2151754820 !ruby/object:Gem::Requirement
38
+ requirement: &2155943560 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *2151754820
46
+ version_requirements: *2155943560
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: american_date
49
- requirement: &2151754360 !ruby/object:Gem::Requirement
49
+ requirement: &2155942580 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *2151754360
57
+ version_requirements: *2155942580
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: nokogiri
60
- requirement: &2151753880 !ruby/object:Gem::Requirement
60
+ requirement: &2155941660 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,7 +65,7 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *2151753880
68
+ version_requirements: *2155941660
69
69
  description: Crawls congressional websites for press releases.
70
70
  email:
71
71
  - dwillis@gmail.com
@@ -79,6 +79,9 @@ files:
79
79
  - README.md
80
80
  - Rakefile
81
81
  - lib/statement.rb
82
+ - lib/statement/feed.rb
83
+ - lib/statement/scraper.rb
84
+ - lib/statement/utils.rb
82
85
  - lib/statement/version.rb
83
86
  - spec/bill_nelson_press.html
84
87
  - spec/cowan_press.html