statement 1.8.10 → 1.8.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +104 -57
- data/lib/statement/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d7870c8335be4f10c96f6b9ca858ace626fd77a
|
4
|
+
data.tar.gz: c4742a7f24a2ab59291d4a4ba331c5f356f84889
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ac2b0fef268e2991a80fa4b7a5df1c51c7b30869a0dd966d7c673bf3f7b39879b16c66df0437e7aeda57fa3786428947236e6a6931f7fc67c9eb0e30e1f956e9
|
7
|
+
data.tar.gz: ef3c6a1d6fdc12ac29b862de24ffac76c697c5ee0e9aa486c0adafb79d9e8c00c1e932aa90fd4580c3b3c1310a1e8ed8a755d6745e5e81988efe59b435f24ef9
|
data/lib/statement/scraper.rb
CHANGED
@@ -6,7 +6,7 @@ require 'nokogiri'
|
|
6
6
|
|
7
7
|
module Statement
|
8
8
|
class Scraper
|
9
|
-
|
9
|
+
|
10
10
|
def self.open_html(url)
|
11
11
|
begin
|
12
12
|
Nokogiri::HTML(open(url).read)
|
@@ -14,47 +14,47 @@ module Statement
|
|
14
14
|
nil
|
15
15
|
end
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
def self.house_gop(url)
|
19
19
|
doc = open_html(url)
|
20
20
|
return unless doc
|
21
21
|
uri = URI.parse(url)
|
22
22
|
date = Date.parse(uri.query.split('=').last)
|
23
23
|
links = doc.xpath("//ul[@id='membernews']").search('a')
|
24
|
-
results = links.map do |link|
|
24
|
+
results = links.map do |link|
|
25
25
|
abs_link = Utils.absolute_link(url, link["href"])
|
26
26
|
{ :source => url, :url => abs_link, :title => link.text.strip, :date => date, :domain => URI.parse(link["href"]).host }
|
27
27
|
end
|
28
28
|
Utils.remove_generic_urls!(results)
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
def self.member_methods
|
32
|
-
[:capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :billnelson, :lautenberg, :crapo, :coburn, :boxer, :vitter, :donnelly, :inhofe, :palazzo, :roe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton, :wolf_sherman_mccaul, :welch, :sessions, :gabbard, :ellison, :costa, :farr, :mcclintock, :mcnerney, :olson]
|
32
|
+
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :billnelson, :lautenberg, :crapo, :coburn, :boxer, :vitter, :donnelly, :inhofe, :palazzo, :roe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton, :wolf_sherman_mccaul, :welch, :sessions, :gabbard, :ellison, :costa, :farr, :mcclintock, :mcnerney, :olson]
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
def self.committee_methods
|
36
36
|
[:senate_approps_majority, :senate_approps_minority, :senate_banking, :senate_hsag_majority, :senate_hsag_minority, :senate_indian, :senate_aging, :senate_smallbiz_minority, :senate_intel, :house_energy_minority, :house_homeland_security_minority, :house_judiciary_majority, :house_rules_majority, :house_ways_means_majority]
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
def self.member_scrapers
|
40
40
|
year = Date.today.year
|
41
|
-
results = [capuano, cold_fusion(year, 0), conaway, chabot, susandavis, klobuchar(year), palazzo(page=1), roe(page=1), billnelson(year=year),
|
41
|
+
results = [crenshaw, capuano, cold_fusion(year, 0), conaway, chabot, susandavis, klobuchar(year), palazzo(page=1), roe(page=1), billnelson(year=year),
|
42
42
|
document_query(page=1), document_query(page=2), swalwell(page=1), donnelly(year=year), crapo, coburn, boxer(start=1),
|
43
43
|
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, wolf_sherman_mccaul, welch,
|
44
44
|
sessions(year=year), gabbard, pryor, ellison(page=0), costa, farr, mcclintock, olson, mcnerney].flatten
|
45
45
|
results = results.compact
|
46
46
|
Utils.remove_generic_urls!(results)
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
def self.backfill_from_scrapers
|
50
|
-
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
51
|
-
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
50
|
+
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
51
|
+
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
52
52
|
boxer(start=31), boxer(start=41), vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3), clark(year=2013), culberson_chabot_grisham(page=2),
|
53
|
-
wolf_sherman_mccaul(page=1), sessions(year=2013), pryor(page=1), ellison(page=1), ellison(page=2), ellison(page=3), farr(year=2013), farr(year=2012), farr(year=2011),
|
54
|
-
mcnerney(page
|
53
|
+
wolf_sherman_mccaul(page=1), sessions(year=2013), pryor(page=1), ellison(page=1), ellison(page=2), ellison(page=3), farr(year=2013), farr(year=2012), farr(year=2011),
|
54
|
+
mcnerney(page=2), mcnerney(page=3), mcnerney(page=4), mcnerney(page=5), mcnerney(page=6), olson(year=2013)].flatten
|
55
55
|
Utils.remove_generic_urls!(results)
|
56
56
|
end
|
57
|
-
|
57
|
+
|
58
58
|
def self.committee_scrapers
|
59
59
|
year = Date.today.year
|
60
60
|
results = [senate_approps_majority, senate_approps_minority, senate_banking(year), senate_hsag_majority(year), senate_hsag_minority(year),
|
@@ -62,9 +62,9 @@ module Statement
|
|
62
62
|
house_judiciary_majority, house_rules_majority, house_ways_means_majority].flatten
|
63
63
|
Utils.remove_generic_urls!(results)
|
64
64
|
end
|
65
|
-
|
65
|
+
|
66
66
|
## special cases for committees without RSS feeds
|
67
|
-
|
67
|
+
|
68
68
|
def self.senate_approps_majority
|
69
69
|
results = []
|
70
70
|
url = "http://www.appropriations.senate.gov/news.cfm"
|
@@ -77,7 +77,7 @@ module Statement
|
|
77
77
|
end
|
78
78
|
results
|
79
79
|
end
|
80
|
-
|
80
|
+
|
81
81
|
def self.senate_approps_minority
|
82
82
|
results = []
|
83
83
|
url = "http://www.appropriations.senate.gov/republican.cfm"
|
@@ -90,7 +90,7 @@ module Statement
|
|
90
90
|
end
|
91
91
|
results
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
def self.senate_banking(year=Date.today.year)
|
95
95
|
results = []
|
96
96
|
url = "http://www.banking.senate.gov/public/index.cfm?FuseAction=Newsroom.PressReleases&ContentRecordType_id=b94acc28-404a-4fc6-b143-a9e15bf92da4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
|
@@ -101,7 +101,7 @@ module Statement
|
|
101
101
|
end
|
102
102
|
results
|
103
103
|
end
|
104
|
-
|
104
|
+
|
105
105
|
def self.senate_hsag_majority(year=Date.today.year)
|
106
106
|
results = []
|
107
107
|
url = "http://www.hsgac.senate.gov/media/majority-media?year=#{year}"
|
@@ -113,7 +113,7 @@ module Statement
|
|
113
113
|
end
|
114
114
|
results
|
115
115
|
end
|
116
|
-
|
116
|
+
|
117
117
|
def self.senate_hsag_minority(year=Date.today.year)
|
118
118
|
results = []
|
119
119
|
url = "http://www.hsgac.senate.gov/media/minority-media?year=#{year}"
|
@@ -125,7 +125,7 @@ module Statement
|
|
125
125
|
end
|
126
126
|
results
|
127
127
|
end
|
128
|
-
|
128
|
+
|
129
129
|
def self.senate_indian
|
130
130
|
results = []
|
131
131
|
url = "http://www.indian.senate.gov/news/index.cfm"
|
@@ -136,7 +136,7 @@ module Statement
|
|
136
136
|
end
|
137
137
|
results
|
138
138
|
end
|
139
|
-
|
139
|
+
|
140
140
|
def self.senate_aging
|
141
141
|
results = []
|
142
142
|
url = "http://www.aging.senate.gov/pressroom.cfm?maxrows=100&startrow=1&&type=1"
|
@@ -147,18 +147,18 @@ module Statement
|
|
147
147
|
end
|
148
148
|
results
|
149
149
|
end
|
150
|
-
|
150
|
+
|
151
151
|
def self.senate_smallbiz_minority
|
152
152
|
results = []
|
153
153
|
url = "http://www.sbc.senate.gov/public/index.cfm?p=RepublicanPressRoom"
|
154
154
|
doc = open_html(url)
|
155
|
-
return if doc.nil?
|
155
|
+
return if doc.nil?
|
156
156
|
doc.xpath("//ul[@class='recordList']").each do |row|
|
157
157
|
results << { :source => url, :url => row.children[0].children[2].children[0]['href'], :title => row.children[0].children[2].children[0].text, :date => Date.parse(row.children[0].children[0].text), :domain => "http://www.sbc.senate.gov/", :party => 'minority' }
|
158
158
|
end
|
159
159
|
results
|
160
160
|
end
|
161
|
-
|
161
|
+
|
162
162
|
def self.senate_intel(congress=113, start_year=2013, end_year=2014)
|
163
163
|
results = []
|
164
164
|
url = "http://www.intelligence.senate.gov/press/releases.cfm?congress=#{congress}&y1=#{start_year}&y2=#{end_year}"
|
@@ -169,7 +169,7 @@ module Statement
|
|
169
169
|
end
|
170
170
|
results
|
171
171
|
end
|
172
|
-
|
172
|
+
|
173
173
|
def self.house_energy_minority
|
174
174
|
results = []
|
175
175
|
url = "http://democrats.energycommerce.house.gov/index.php?q=news-releases"
|
@@ -180,7 +180,7 @@ module Statement
|
|
180
180
|
end
|
181
181
|
results
|
182
182
|
end
|
183
|
-
|
183
|
+
|
184
184
|
def self.house_homeland_security_minority
|
185
185
|
results = []
|
186
186
|
url = "http://chsdemocrats.house.gov/press/index.asp?subsection=1"
|
@@ -191,7 +191,7 @@ module Statement
|
|
191
191
|
end
|
192
192
|
results
|
193
193
|
end
|
194
|
-
|
194
|
+
|
195
195
|
def self.house_judiciary_majority
|
196
196
|
results = []
|
197
197
|
url = "http://judiciary.house.gov/news/press2013.html"
|
@@ -203,7 +203,7 @@ module Statement
|
|
203
203
|
end
|
204
204
|
results
|
205
205
|
end
|
206
|
-
|
206
|
+
|
207
207
|
def self.house_rules_majority
|
208
208
|
results = []
|
209
209
|
url = "http://www.rules.house.gov/News/Default.aspx"
|
@@ -215,7 +215,7 @@ module Statement
|
|
215
215
|
end
|
216
216
|
results
|
217
217
|
end
|
218
|
-
|
218
|
+
|
219
219
|
def self.house_ways_means_majority
|
220
220
|
results = []
|
221
221
|
url = "http://waysandmeans.house.gov/news/documentquery.aspx?DocumentTypeID=1496"
|
@@ -227,9 +227,9 @@ module Statement
|
|
227
227
|
end
|
228
228
|
results
|
229
229
|
end
|
230
|
-
|
230
|
+
|
231
231
|
## special cases for members without RSS feeds
|
232
|
-
|
232
|
+
|
233
233
|
def self.swalwell(page=1)
|
234
234
|
results = []
|
235
235
|
url = "http://swalwell.house.gov/category/press-releases/page/#{page}/"
|
@@ -250,7 +250,7 @@ module Statement
|
|
250
250
|
doc.xpath("//a").select{|l| !l['href'].nil? and l['href'].include?('/pr')}[1..-5].each do |link|
|
251
251
|
begin
|
252
252
|
year = link['href'].split('/').first
|
253
|
-
date = Date.parse(link.text.split(' ').first+'/'+year)
|
253
|
+
date = Date.parse(link.text.split(' ').first+'/'+year)
|
254
254
|
rescue
|
255
255
|
date = nil
|
256
256
|
end
|
@@ -258,17 +258,44 @@ module Statement
|
|
258
258
|
end
|
259
259
|
return results[0..-5]
|
260
260
|
end
|
261
|
-
|
262
|
-
def self.
|
261
|
+
|
262
|
+
def self.crenshaw(year=Date.today.year, month=nil)
|
263
263
|
results = []
|
264
264
|
year = Date.today.year if not year
|
265
|
-
|
266
|
-
|
265
|
+
domain = 'crenshaw.house.gov'
|
266
|
+
if month
|
267
|
+
url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
268
|
+
else
|
269
|
+
url = "http://crenshaw.house.gov/index.cfm/pressreleases"
|
270
|
+
end
|
271
|
+
doc = Statement::Scraper.open_html(url)
|
272
|
+
return if doc.nil?
|
273
|
+
doc.xpath("//tr")[2..-1].each do |row|
|
274
|
+
date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
|
275
|
+
next if date_text == 'Date' or date_text.size > 10
|
276
|
+
date = Date.parse(date_text)
|
277
|
+
results << { :source => url, :url => row.children[3].children.first['href'], :title => title, :date => date, :domain => domain }
|
278
|
+
end
|
279
|
+
results
|
280
|
+
end
|
281
|
+
|
282
|
+
def self.cold_fusion(year=Date.today.year, month=nil)
|
283
|
+
results = []
|
284
|
+
year = Date.today.year if not year
|
285
|
+
domains = ['www.ronjohnson.senate.gov/public/','www.risch.senate.gov/public/']
|
267
286
|
domains.each do |domain|
|
268
|
-
if domain == '
|
269
|
-
|
287
|
+
if domain == 'www.risch.senate.gov/public/'
|
288
|
+
if not month
|
289
|
+
url = "http://www.risch.senate.gov/public/index.cfm/pressreleases"
|
290
|
+
else
|
291
|
+
url = "http://www.risch.senate.gov/public/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
292
|
+
end
|
270
293
|
else
|
271
|
-
|
294
|
+
if not month
|
295
|
+
url = "http://www.ronjohnson.senate.gov/public/index.cfm/press-releases"
|
296
|
+
else
|
297
|
+
url = "http://www.ronjohnson.senate.gov/public/index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
|
298
|
+
end
|
272
299
|
end
|
273
300
|
doc = Statement::Scraper.open_html(url)
|
274
301
|
return if doc.nil?
|
@@ -281,7 +308,7 @@ module Statement
|
|
281
308
|
end
|
282
309
|
results.flatten
|
283
310
|
end
|
284
|
-
|
311
|
+
|
285
312
|
def self.conaway(page=1)
|
286
313
|
results = []
|
287
314
|
base_url = "http://conaway.house.gov/news/"
|
@@ -293,7 +320,7 @@ module Statement
|
|
293
320
|
end
|
294
321
|
results
|
295
322
|
end
|
296
|
-
|
323
|
+
|
297
324
|
def self.chabot(year=Date.today.year)
|
298
325
|
results = []
|
299
326
|
base_url = "http://chabot.house.gov/news/"
|
@@ -306,7 +333,7 @@ module Statement
|
|
306
333
|
end
|
307
334
|
results
|
308
335
|
end
|
309
|
-
|
336
|
+
|
310
337
|
def self.susandavis
|
311
338
|
results = []
|
312
339
|
base_url = "http://www.house.gov/susandavis/"
|
@@ -318,7 +345,7 @@ module Statement
|
|
318
345
|
end
|
319
346
|
results
|
320
347
|
end
|
321
|
-
|
348
|
+
|
322
349
|
def self.klobuchar(year)
|
323
350
|
results = []
|
324
351
|
base_url = "http://www.klobuchar.senate.gov/"
|
@@ -333,7 +360,7 @@ module Statement
|
|
333
360
|
end
|
334
361
|
results
|
335
362
|
end
|
336
|
-
|
363
|
+
|
337
364
|
def self.lujan
|
338
365
|
results = []
|
339
366
|
base_url = 'http://lujan.house.gov/'
|
@@ -345,7 +372,7 @@ module Statement
|
|
345
372
|
end
|
346
373
|
results
|
347
374
|
end
|
348
|
-
|
375
|
+
|
349
376
|
def self.billnelson(year=2013)
|
350
377
|
results = []
|
351
378
|
base_url = "http://www.billnelson.senate.gov/news/"
|
@@ -357,7 +384,7 @@ module Statement
|
|
357
384
|
end
|
358
385
|
results
|
359
386
|
end
|
360
|
-
|
387
|
+
|
361
388
|
# fetches the latest 1000 releases, can be altered
|
362
389
|
def self.lautenberg(rows=1000)
|
363
390
|
results = []
|
@@ -370,7 +397,7 @@ module Statement
|
|
370
397
|
end
|
371
398
|
results
|
372
399
|
end
|
373
|
-
|
400
|
+
|
374
401
|
def self.crapo
|
375
402
|
results = []
|
376
403
|
base_url = "http://www.crapo.senate.gov/media/newsreleases/"
|
@@ -394,7 +421,7 @@ module Statement
|
|
394
421
|
end
|
395
422
|
results
|
396
423
|
end
|
397
|
-
|
424
|
+
|
398
425
|
def self.coburn(year=Date.today.year)
|
399
426
|
results = []
|
400
427
|
url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
|
@@ -406,7 +433,7 @@ module Statement
|
|
406
433
|
end
|
407
434
|
results
|
408
435
|
end
|
409
|
-
|
436
|
+
|
410
437
|
def self.boxer(start=1)
|
411
438
|
results = []
|
412
439
|
url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
|
@@ -418,7 +445,7 @@ module Statement
|
|
418
445
|
end
|
419
446
|
results
|
420
447
|
end
|
421
|
-
|
448
|
+
|
422
449
|
def self.vitter(year=Date.today.year)
|
423
450
|
results = []
|
424
451
|
url = "http://www.vitter.senate.gov/newsroom/"
|
@@ -431,7 +458,7 @@ module Statement
|
|
431
458
|
end
|
432
459
|
results
|
433
460
|
end
|
434
|
-
|
461
|
+
|
435
462
|
def self.donnelly(year=Date.today.year)
|
436
463
|
results = []
|
437
464
|
url = "http://www.donnelly.senate.gov/newsroom/"
|
@@ -444,7 +471,7 @@ module Statement
|
|
444
471
|
end
|
445
472
|
results
|
446
473
|
end
|
447
|
-
|
474
|
+
|
448
475
|
def self.inhofe(year=Date.today.year)
|
449
476
|
results = []
|
450
477
|
url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
|
@@ -457,7 +484,7 @@ module Statement
|
|
457
484
|
end
|
458
485
|
results
|
459
486
|
end
|
460
|
-
|
487
|
+
|
461
488
|
def self.palazzo(page=1)
|
462
489
|
results = []
|
463
490
|
domain = "palazzo.house.gov"
|
@@ -598,7 +625,7 @@ module Statement
|
|
598
625
|
doc = open_html(url)
|
599
626
|
return if doc.nil?
|
600
627
|
doc.css('ul.fc_leading li').each do |row|
|
601
|
-
results << {:source => url, :url => "http://gabbard.house.gov"+row.children[0].children[1]['href'], :title => row.children[0].children[1].text.strip, :date => Date.parse(row.children[2].text), :domain => domain}
|
628
|
+
results << {:source => url, :url => "http://gabbard.house.gov"+row.children[0].children[1]['href'], :title => row.children[0].children[1].text.strip, :date => Date.parse(row.children[2].text), :domain => domain}
|
602
629
|
end
|
603
630
|
results
|
604
631
|
end
|
@@ -692,6 +719,26 @@ module Statement
|
|
692
719
|
end
|
693
720
|
results.flatten
|
694
721
|
end
|
695
|
-
|
722
|
+
|
723
|
+
def self.backfill_bilirakis
|
724
|
+
results = []
|
725
|
+
domain = 'bilirakis.house.gov'
|
726
|
+
url = 'http://bilirakis.house.gov/press-releases/'
|
727
|
+
doc = open_html(url)
|
728
|
+
return if doc.nil?
|
729
|
+
doc.css("ul li[@class='article articleright']").each do |row|
|
730
|
+
results << {:source => url, :url => 'http://bilirakis.house.gov' + row.children[3].children[1]['href'], :title => row.children[3].text.strip, :date => Date.parse(row.children[5].text), :domain => domain }
|
731
|
+
end
|
732
|
+
end
|
733
|
+
|
734
|
+
def self.backfill_boustany
|
735
|
+
results = []
|
736
|
+
domain = 'boustany.house.gov'
|
737
|
+
url = 'http://boustany.house.gov/113th-congress/showallitems/'
|
738
|
+
doc = open_html(url)
|
739
|
+
return if doc.nil?
|
740
|
+
|
741
|
+
end
|
742
|
+
|
696
743
|
end
|
697
|
-
end
|
744
|
+
end
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.
|
4
|
+
version: 1.8.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2015-01-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|