statement 1.8.10 → 1.8.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +104 -57
- data/lib/statement/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d7870c8335be4f10c96f6b9ca858ace626fd77a
|
4
|
+
data.tar.gz: c4742a7f24a2ab59291d4a4ba331c5f356f84889
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac2b0fef268e2991a80fa4b7a5df1c51c7b30869a0dd966d7c673bf3f7b39879b16c66df0437e7aeda57fa3786428947236e6a6931f7fc67c9eb0e30e1f956e9
|
7
|
+
data.tar.gz: ef3c6a1d6fdc12ac29b862de24ffac76c697c5ee0e9aa486c0adafb79d9e8c00c1e932aa90fd4580c3b3c1310a1e8ed8a755d6745e5e81988efe59b435f24ef9
|
data/lib/statement/scraper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'nokogiri'
|
|
6
6
|
|
7
7
|
module Statement
|
8
8
|
class Scraper
|
9
|
-
|
9
|
+
|
10
10
|
def self.open_html(url)
|
11
11
|
begin
|
12
12
|
Nokogiri::HTML(open(url).read)
|
@@ -14,47 +14,47 @@ module Statement
|
|
14
14
|
nil
|
15
15
|
end
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
# Collects the links inside the <ul id="membernews"> list of the page at +url+.
#
# url - listing page address; the text after the last '=' in its query string
#       is parsed as the date applied to every result.
#
# Returns whatever Utils.remove_generic_urls! returns for the collected
# hashes (keys :source, :url, :title, :date, :domain), or nil when the page
# could not be loaded.
def self.house_gop(url)
  page = open_html(url)
  return if !page
  release_date = Date.parse(URI.parse(url).query.split('=').last)
  anchors = page.xpath("//ul[@id='membernews']").search('a')
  entries = anchors.map do |anchor|
    href = anchor["href"]
    {
      :source => url,
      :url => Utils.absolute_link(url, href),
      :title => anchor.text.strip,
      :date => release_date,
      # host of the raw href, not of the absolutized link
      :domain => URI.parse(href).host
    }
  end
  Utils.remove_generic_urls!(entries)
end
|
30
|
-
|
30
|
+
|
31
31
|
# Returns a new Array of Symbols naming member scraper methods.
# NOTE(review): member_scrapers also calls `pryor`, which is not listed here,
# and does not call :freshman_senators or :lautenberg - confirm whether the
# two lists are meant to match.
def self.member_methods
  names = []
  names.concat([:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :susandavis])
  names.concat([:freshman_senators, :klobuchar, :billnelson, :lautenberg, :crapo, :coburn])
  names.concat([:boxer, :vitter, :donnelly, :inhofe, :palazzo, :roe])
  names.concat([:document_query, :swalwell, :fischer, :clark, :edwards])
  names.concat([:culberson_chabot_grisham, :barton, :wolf_sherman_mccaul, :welch, :sessions])
  names.concat([:gabbard, :ellison, :costa, :farr, :mcclintock, :mcnerney, :olson])
  names
end
|
34
|
-
|
34
|
+
|
35
35
|
# Returns a new Array of Symbols naming committee scraper methods,
# Senate entries first, then House entries.
def self.committee_methods
  senate = [
    :senate_approps_majority, :senate_approps_minority, :senate_banking,
    :senate_hsag_majority, :senate_hsag_minority, :senate_indian,
    :senate_aging, :senate_smallbiz_minority, :senate_intel
  ]
  house = [
    :house_energy_minority, :house_homeland_security_minority,
    :house_judiciary_majority, :house_rules_majority, :house_ways_means_majority
  ]
  senate + house
end
|
38
|
-
|
38
|
+
|
39
39
|
# Runs every member scraper for the current year / first page and merges
# the results.
#
# Arguments are passed positionally: the former `palazzo(page=1)` /
# `billnelson(year=year)` spelling was a local-variable assignment, not a
# keyword argument, and only worked because the assigned value was then
# passed along.
#
# Scrapers return nil when their page cannot be loaded, so nils are dropped
# before the list is handed to Utils.remove_generic_urls!.
#
# Returns whatever Utils.remove_generic_urls! returns for the merged list.
def self.member_scrapers
  year = Date.today.year
  results = [
    crenshaw, capuano, cold_fusion(year, 0), conaway, chabot, susandavis, klobuchar(year),
    palazzo(1), roe(1), billnelson(year),
    document_query(1), document_query(2), swalwell(1), donnelly(year), crapo, coburn, boxer(1),
    vitter(year), inhofe(year), fischer, clark(year), edwards, culberson_chabot_grisham(1),
    barton, wolf_sherman_mccaul, welch,
    sessions(year), gabbard, pryor, ellison(0), costa, farr, mcclintock, olson, mcnerney
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
|
48
|
-
|
48
|
+
|
49
49
|
# Runs the member scrapers against older years / later pages to backfill
# releases that the regular member_scrapers run does not reach.
#
# Arguments are passed positionally: the former `coburn(year=2012)` spelling
# was a local-variable assignment, not a keyword argument.
#
# Scrapers return nil when their page cannot be loaded, so nils are dropped
# (as member_scrapers already does) before calling Utils.remove_generic_urls!.
#
# Returns whatever Utils.remove_generic_urls! returns for the merged list.
def self.backfill_from_scrapers
  results = [
    cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(2012),
    document_query(3), document_query(4),
    coburn(2012), coburn(2011), coburn(2010),
    boxer(11), boxer(21), boxer(31), boxer(41),
    vitter(2012), vitter(2011), swalwell(2), swalwell(3), clark(2013),
    culberson_chabot_grisham(2), wolf_sherman_mccaul(1), sessions(2013), pryor(1),
    ellison(1), ellison(2), ellison(3),
    farr(2013), farr(2012), farr(2011),
    mcnerney(2), mcnerney(3), mcnerney(4), mcnerney(5), mcnerney(6), olson(2013)
  ].flatten.compact
  Utils.remove_generic_urls!(results)
end
|
57
|
-
|
57
|
+
|
58
58
|
def self.committee_scrapers
|
59
59
|
year = Date.today.year
|
60
60
|
results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
|
@@ -62,9 +62,9 @@ module Statement
|
|
62
62
|
house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten
|
63
63
|
Utils.remove_generic_urls!(results)
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
## special cases for committees without RSS feeds
|
67
|
-
|
67
|
+
|
68
68
|
def self.senate_approps_majority
|
69
69
|
results = []
|
70
70
|
url = "http://www.appropriations.senate.gov/news.cfm"
|
@@ -77,7 +77,7 @@ module Statement
|
|
77
77
|
end
|
78
78
|
results
|
79
79
|
end
|
80
|
-
|
80
|
+
|
81
81
|
def self.senate_approps_minority
|
82
82
|
results = []
|
83
83
|
url = "http://www.appropriations.senate.gov/republican.cfm"
|
@@ -90,7 +90,7 @@ module Statement
|
|
90
90
|
end
|
91
91
|
results
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
def self.senate_banking(year=Date.today.year)
|
95
95
|
results = []
|
96
96
|
url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
|
@@ -101,7 +101,7 @@ module Statement
|
|
101
101
|
end
|
102
102
|
results
|
103
103
|
end
|
104
|
-
|
104
|
+
|
105
105
|
def self.senate_hsag_majority(year=Date.today.year)
|
106
106
|
results = []
|
107
107
|
url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
|
@@ -113,7 +113,7 @@ module Statement
|
|
113
113
|
end
|
114
114
|
results
|
115
115
|
end
|
116
|
-
|
116
|
+
|
117
117
|
def self.senate_hsag_minority(year=Date.today.year)
|
118
118
|
results = []
|
119
119
|
url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
|
@@ -125,7 +125,7 @@ module Statement
|
|
125
125
|
end
|
126
126
|
results
|
127
127
|
end
|
128
|
-
|
128
|
+
|
129
129
|
def self.senate_indian
|
130
130
|
results = []
|
131
131
|
url = "http://www.indian.senate.gov/news/index.cfm"
|
@@ -136,7 +136,7 @@ module Statement
|
|
136
136
|
end
|
137
137
|
results
|
138
138
|
end
|
139
|
-
|
139
|
+
|
140
140
|
def self.senate_aging
|
141
141
|
results = []
|
142
142
|
url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
|
@@ -147,18 +147,18 @@ module Statement
|
|
147
147
|
end
|
148
148
|
results
|
149
149
|
end
|
150
|
-
|
150
|
+
|
151
151
|
# Scrapes the RepublicanPressRoom page of www.sbc.senate.gov.
#
# Each <ul class="recordList"> contributes one hash: the link and title come
# from the first child of the third child of the list's first child, and the
# date from that first child's own first child.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain, :party),
# or nil when the page could not be loaded.
def self.senate_smallbiz_minority
  url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
  page = open_html(url)
  return if page.nil?
  releases = []
  page.xpath("//ul[@class='recordList']").each do |list|
    entry = list.children[0]
    anchor = entry.children[2].children[0]
    releases << {
      :source => url,
      :url => anchor['href'],
      :title => anchor.text,
      :date => Date.parse(entry.children[0].text),
      :domain => "http://www.sbc.senate.gov/",
      :party => 'minority'
    }
  end
  releases
end
|
161
|
-
|
161
|
+
|
162
162
|
def self.senate_intel(congress=113, start_year=2013, end_year=2014)
|
163
163
|
results = []
|
164
164
|
url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
|
@@ -169,7 +169,7 @@ module Statement
|
|
169
169
|
end
|
170
170
|
results
|
171
171
|
end
|
172
|
-
|
172
|
+
|
173
173
|
def self.house_energy_minority
|
174
174
|
results = []
|
175
175
|
url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
|
@@ -180,7 +180,7 @@ module Statement
|
|
180
180
|
end
|
181
181
|
results
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
def self.house_homeland_security_minority
|
185
185
|
results = []
|
186
186
|
url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
|
@@ -191,7 +191,7 @@ module Statement
|
|
191
191
|
end
|
192
192
|
results
|
193
193
|
end
|
194
|
-
|
194
|
+
|
195
195
|
def self.house_judiciary_majority
|
196
196
|
results = []
|
197
197
|
url = "http://judiciary.house.gov/news/press2013.html"
|
@@ -203,7 +203,7 @@ module Statement
|
|
203
203
|
end
|
204
204
|
results
|
205
205
|
end
|
206
|
-
|
206
|
+
|
207
207
|
def self.house_rules_majority
|
208
208
|
results = []
|
209
209
|
url = "http://www.rules.house.gov/News/Default.aspx"
|
@@ -215,7 +215,7 @@ module Statement
|
|
215
215
|
end
|
216
216
|
results
|
217
217
|
end
|
218
|
-
|
218
|
+
|
219
219
|
def self.house_ways_means_majority
|
220
220
|
results = []
|
221
221
|
url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
|
@@ -227,9 +227,9 @@ module Statement
|
|
227
227
|
end
|
228
228
|
results
|
229
229
|
end
|
230
|
-
|
230
|
+
|
231
231
|
## special cases for members without RSS feeds
|
232
|
-
|
232
|
+
|
233
233
|
def self.swalwell(page=1)
|
234
234
|
results = []
|
235
235
|
url = "http://swalwell.house.gov/category/press-releases/page/#{page}/"
|
@@ -250,7 +250,7 @@ module Statement
|
|
250
250
|
doc.xpath("//a").select{|l| !l['href'].nil? and l['href'].include?('/pr')}[1..-5].each do |link|
|
251
251
|
begin
|
252
252
|
year = link['href'].split('/').first
|
253
|
-
date = Date.parse(link.text.split(' ').first+'/'+year)
|
253
|
+
date = Date.parse(link.text.split(' ').first+'/'+year)
|
254
254
|
rescue
|
255
255
|
date = nil
|
256
256
|
end
|
@@ -258,17 +258,44 @@ module Statement
|
|
258
258
|
end
|
259
259
|
return results[0..-5]
|
260
260
|
end
|
261
|
-
|
262
|
-
def self.
|
261
|
+
|
262
|
+
# Scrapes the press release table on crenshaw.house.gov.
#
# year  - year used in the query string; falls back to the current year when
#         nil/false. Only used when +month+ is given.
# month - when given, the year/month-filtered listing (page 1) is requested;
#         otherwise the default listing is requested.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain), or nil
# when the page could not be loaded.
def self.crenshaw(year=Date.today.year, month=nil)
  results = []
  year = Date.today.year if not year
  domain = 'crenshaw.house.gov'
  url = "http://crenshaw.house.gov/index.cfm/pressreleases"
  url += "?YearDisplay=#{year}&MonthDisplay=#{month}&page=1" if month
  doc = Statement::Scraper.open_html(url)
  return if doc.nil?
  # The first two <tr> elements are skipped. Slicing past the end of the row
  # list yields nil, so fall back to an empty list instead of raising.
  rows = doc.xpath("//tr")[2..-1] || []
  rows.each do |row|
    date_text, title = row.children.map { |c| c.text.strip }.reject { |c| c.empty? }
    # Skip rows without any text, the 'Date' header row, and rows whose first
    # cell is too long to be a date.
    next if date_text.nil? || date_text == 'Date' || date_text.size > 10
    date = Date.parse(date_text)
    results << { :source => url, :url => row.children[3].children.first['href'], :title => title, :date => date, :domain => domain }
  end
  results
end
|
281
|
+
|
282
|
+
def self.cold_fusion(year=Date.today.year, month=nil)
|
283
|
+
results = []
|
284
|
+
year = Date.today.year if not year
|
285
|
+
domains = ['www.ronjohnson.senate.gov/public/','www.risch.senate.gov/public/']
|
267
286
|
domains.each do |domain|
|
268
|
-
if domain == '
|
269
|
-
|
287
|
+
if domain == 'www.risch.senate.gov/public/'
|
288
|
+
if not month
|
289
|
+
url = "http://www.risch.senate.gov/public/index.cfm/pressreleases"
|
290
|
+
else
|
291
|
+
url = "http://www.risch.senate.gov/public/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
292
|
+
end
|
270
293
|
else
|
271
|
-
|
294
|
+
if not month
|
295
|
+
url = "http://www.ronjohnson.senate.gov/public/index.cfm/press-releases"
|
296
|
+
else
|
297
|
+
url = "http://www.ronjohnson.senate.gov/public/index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
298
|
+
end
|
272
299
|
end
|
273
300
|
doc = Statement::Scraper.open_html(url)
|
274
301
|
return if doc.nil?
|
@@ -281,7 +308,7 @@ module Statement
|
|
281
308
|
end
|
282
309
|
results.flatten
|
283
310
|
end
|
284
|
-
|
311
|
+
|
285
312
|
def self.conaway(page=1)
|
286
313
|
results = []
|
287
314
|
base_url = "http://conaway.house.gov/news/"
|
@@ -293,7 +320,7 @@ module Statement
|
|
293
320
|
end
|
294
321
|
results
|
295
322
|
end
|
296
|
-
|
323
|
+
|
297
324
|
def self.chabot(year=Date.today.year)
|
298
325
|
results = []
|
299
326
|
base_url = "http://chabot.house.gov/news/"
|
@@ -306,7 +333,7 @@ module Statement
|
|
306
333
|
end
|
307
334
|
results
|
308
335
|
end
|
309
|
-
|
336
|
+
|
310
337
|
def self.susandavis
|
311
338
|
results = []
|
312
339
|
base_url = "http://www.house.gov/susandavis/"
|
@@ -318,7 +345,7 @@ module Statement
|
|
318
345
|
end
|
319
346
|
results
|
320
347
|
end
|
321
|
-
|
348
|
+
|
322
349
|
def self.klobuchar(year)
|
323
350
|
results = []
|
324
351
|
base_url = "http://www.klobuchar.senate.gov/"
|
@@ -333,7 +360,7 @@ module Statement
|
|
333
360
|
end
|
334
361
|
results
|
335
362
|
end
|
336
|
-
|
363
|
+
|
337
364
|
def self.lujan
|
338
365
|
results = []
|
339
366
|
base_url = 'http://lujan.house.gov/'
|
@@ -345,7 +372,7 @@ module Statement
|
|
345
372
|
end
|
346
373
|
results
|
347
374
|
end
|
348
|
-
|
375
|
+
|
349
376
|
def self.billnelson(year=2013)
|
350
377
|
results = []
|
351
378
|
base_url = "http://www.billnelson.senate.gov/news/"
|
@@ -357,7 +384,7 @@ module Statement
|
|
357
384
|
end
|
358
385
|
results
|
359
386
|
end
|
360
|
-
|
387
|
+
|
361
388
|
# fetches the latest 1000 releases, can be altered
|
362
389
|
def self.lautenberg(rows=1000)
|
363
390
|
results = []
|
@@ -370,7 +397,7 @@ module Statement
|
|
370
397
|
end
|
371
398
|
results
|
372
399
|
end
|
373
|
-
|
400
|
+
|
374
401
|
def self.crapo
|
375
402
|
results = []
|
376
403
|
base_url = "http://www.crapo.senate.gov/media/newsreleases/"
|
@@ -394,7 +421,7 @@ module Statement
|
|
394
421
|
end
|
395
422
|
results
|
396
423
|
end
|
397
|
-
|
424
|
+
|
398
425
|
def self.coburn(year=Date.today.year)
|
399
426
|
results = []
|
400
427
|
url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
|
@@ -406,7 +433,7 @@ module Statement
|
|
406
433
|
end
|
407
434
|
results
|
408
435
|
end
|
409
|
-
|
436
|
+
|
410
437
|
def self.boxer(start=1)
|
411
438
|
results = []
|
412
439
|
url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
|
@@ -418,7 +445,7 @@ module Statement
|
|
418
445
|
end
|
419
446
|
results
|
420
447
|
end
|
421
|
-
|
448
|
+
|
422
449
|
def self.vitter(year=Date.today.year)
|
423
450
|
results = []
|
424
451
|
url = "http://www.vitter.senate.gov/newsroom/"
|
@@ -431,7 +458,7 @@ module Statement
|
|
431
458
|
end
|
432
459
|
results
|
433
460
|
end
|
434
|
-
|
461
|
+
|
435
462
|
def self.donnelly(year=Date.today.year)
|
436
463
|
results = []
|
437
464
|
url = "http://www.donnelly.senate.gov/newsroom/"
|
@@ -444,7 +471,7 @@ module Statement
|
|
444
471
|
end
|
445
472
|
results
|
446
473
|
end
|
447
|
-
|
474
|
+
|
448
475
|
def self.inhofe(year=Date.today.year)
|
449
476
|
results = []
|
450
477
|
url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
|
@@ -457,7 +484,7 @@ module Statement
|
|
457
484
|
end
|
458
485
|
results
|
459
486
|
end
|
460
|
-
|
487
|
+
|
461
488
|
def self.palazzo(page=1)
|
462
489
|
results = []
|
463
490
|
domain = "palazzo.house.gov"
|
@@ -598,7 +625,7 @@ module Statement
|
|
598
625
|
doc = open_html(url)
|
599
626
|
return if doc.nil?
|
600
627
|
doc.css('ul.fc_leading li').each do |row|
|
601
|
-
results << {:source => url, :url => "http://gabbard.house.gov"+row.children[0].children[1]['href'], :title => row.children[0].children[1].text.strip, :date => Date.parse(row.children[2].text), :domain => domain}
|
628
|
+
results << {:source => url, :url => "http://gabbard.house.gov"+row.children[0].children[1]['href'], :title => row.children[0].children[1].text.strip, :date => Date.parse(row.children[2].text), :domain => domain}
|
602
629
|
end
|
603
630
|
results
|
604
631
|
end
|
@@ -692,6 +719,26 @@ module Statement
|
|
692
719
|
end
|
693
720
|
results.flatten
|
694
721
|
end
|
695
|
-
|
722
|
+
|
723
|
+
# Scrapes the press release listing on bilirakis.house.gov.
#
# For every <li class="article articleright"> under a <ul>, the link and
# title are read from the row's fourth child and the date from its sixth.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain), or nil
# when the page could not be loaded.
def self.backfill_bilirakis
  results = []
  domain = 'bilirakis.house.gov'
  url = 'http://bilirakis.house.gov/press-releases/'
  doc = open_html(url)
  return if doc.nil?
  doc.css("ul li[@class='article articleright']").each do |row|
    results << {:source => url, :url => 'http://bilirakis.house.gov' + row.children[3].children[1]['href'], :title => row.children[3].text.strip, :date => Date.parse(row.children[5].text), :domain => domain }
  end
  # Return the collected rows; the method previously ended on the `each`
  # call and so handed back the iterated node list instead.
  results
end
|
733
|
+
|
734
|
+
# Placeholder scraper for the boustany.house.gov "show all items" page.
#
# TODO: row parsing is not implemented yet; the page is fetched but nothing
# is extracted from it.
#
# Returns the (currently always empty) results Array, or nil when the page
# could not be loaded.
def self.backfill_boustany
  results = []
  domain = 'boustany.house.gov'
  url = 'http://boustany.house.gov/113th-congress/showallitems/'
  doc = open_html(url)
  return if doc.nil?
  # Hand back the accumulator so callers get an Array once the page loads,
  # instead of the nil the unfinished body used to fall through to.
  results
end
|
742
|
+
|
696
743
|
end
|
697
|
-
end
|
744
|
+
end
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.10
|
4
|
+
version: 1.8.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|