statement 2.0.4 → 2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +113 -78
- data/lib/statement/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 92cb454019d854f249abc59ab4a0568b7bf98c0e
|
4
|
+
data.tar.gz: bc6b887f904c4e7bca2990bf897ad67e694cd4ef
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ea5f3994b277210e343c59c2fdbeb10b36194036ec6c0e3bd78de9d528542c7105674e440118482e63317f17f24906fc36784cf15d19506b7a6afe95ba1aa31
|
7
|
+
data.tar.gz: 651466b0481dcf50682bae862f3f8f5d346636db21ede45c96e8cfe7b08644b9ce9efe77bcf1286a5121a619401a2b83151a5dd3c242fda5ecded34370f029bb
|
data/lib/statement/scraper.rb
CHANGED
@@ -29,10 +29,10 @@ module Statement
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def self.member_methods
|
32
|
-
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :
|
33
|
-
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
34
|
-
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :
|
35
|
-
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :
|
32
|
+
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :klobuchar, :billnelson, :crapo, :boxer, :burr, :ellison,
|
33
|
+
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton, :schiff,
|
34
|
+
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :cassidy, :lowey, :mcmorris, :takano,
|
35
|
+
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin, :rand_paul]
|
36
36
|
end
|
37
37
|
|
38
38
|
def self.committee_methods
|
@@ -41,11 +41,11 @@ module Statement
|
|
41
41
|
|
42
42
|
def self.member_scrapers
|
43
43
|
year = Date.today.year
|
44
|
-
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(page=0),
|
45
|
-
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0),
|
44
|
+
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(page=0), ellison,
|
45
|
+
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0), burr, cassidy,
|
46
46
|
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, welch,
|
47
|
-
sessions(year=year), gabbard, costa, farr, olson, schumer,
|
48
|
-
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins,
|
47
|
+
sessions(year=year), gabbard, costa, farr, olson, schumer, bennie_thompson, speier, lowey, mcmorris, schiff, takano,
|
48
|
+
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins, durbin(page=1),
|
49
49
|
rand_paul(page = 1)].flatten
|
50
50
|
results = results.compact
|
51
51
|
Utils.remove_generic_urls!(results)
|
@@ -53,11 +53,12 @@ module Statement
|
|
53
53
|
|
54
54
|
def self.backfill_from_scrapers
|
55
55
|
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
56
|
-
document_query(page=4), grassley(page=1), grassley(page=2), grassley(page=3),
|
56
|
+
document_query(page=4), grassley(page=1), grassley(page=2), grassley(page=3), burr(page=2), burr(page=3), burr(page=4),
|
57
57
|
vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3), clark(year=2013), culberson_chabot_grisham(page=2),
|
58
|
-
sessions(year=2013), pryor(page=1), farr(year=2013), farr(year=2012), farr(year=2011),
|
59
|
-
olson(year=2013), schumer(page=2), schumer(page=3), poe(year=2015, month=2),
|
60
|
-
poe(year=2015, month=1)
|
58
|
+
sessions(year=2013), pryor(page=1), farr(year=2013), farr(year=2012), farr(year=2011), cassidy(page=2), cassidy(page=3),
|
59
|
+
olson(year=2013), schumer(page=2), schumer(page=3), poe(year=2015, month=2), ellison(page=1), ellison(page=2), lowey(page=1),
|
60
|
+
lowey(page=2), lowey(page=3), poe(year=2015, month=1), mcmorris(page=2), mcmorris(page=3), schiff(page=2), schiff(page=3),
|
61
|
+
takano(page=2), takano(page=3)].flatten
|
61
62
|
Utils.remove_generic_urls!(results)
|
62
63
|
end
|
63
64
|
|
@@ -247,19 +248,6 @@ module Statement
|
|
247
248
|
results
|
248
249
|
end
|
249
250
|
|
250
|
-
def self.boehner(page=1, year=Date.today.year)
|
251
|
-
results = []
|
252
|
-
url = "http://boehner.house.gov/category/press-releases/page/#{page}/"
|
253
|
-
doc = open_html(url)
|
254
|
-
return if doc.nil?
|
255
|
-
(doc/:article).each do |row|
|
256
|
-
month = row.children[1].children[1].children[1].children[0].text
|
257
|
-
day = row.children[1].children[1].children[1].children[1].text
|
258
|
-
results << { :source => url, :url => "http://boehner.house.gov"+row.children[12].children[1].children[0]['href'], :title => row.children[1].children[1].children[3].text, :date => Date.parse(month+" "+day+" "+year.to_s), :domain => 'boehner.house.gov'}
|
259
|
-
end
|
260
|
-
results
|
261
|
-
end
|
262
|
-
|
263
251
|
def self.capuano
|
264
252
|
results = []
|
265
253
|
base_url = "http://www.house.gov/capuano/news/"
|
@@ -365,6 +353,17 @@ module Statement
|
|
365
353
|
results
|
366
354
|
end
|
367
355
|
|
356
|
+
def self.mcmorris(page=1)
|
357
|
+
results = []
|
358
|
+
url = "http://mcmorris.house.gov/issues/page/#{page}/?tax=types&term=news_releases"
|
359
|
+
doc = open_html(url)
|
360
|
+
return if doc.nil?
|
361
|
+
doc.css(".feed-result").each do |row|
|
362
|
+
results << { :source => url, :url => row.children[3].children[3].children.first['href'], :title => row.children[3].children[3].children.first.text.strip, :date => Date.parse(row.children[3].children[1].text), :domain => "mcmorris.house.gov" }
|
363
|
+
end
|
364
|
+
results
|
365
|
+
end
|
366
|
+
|
368
367
|
def self.klobuchar(year)
|
369
368
|
results = []
|
370
369
|
base_url = "http://www.klobuchar.senate.gov/"
|
@@ -461,16 +460,62 @@ module Statement
|
|
461
460
|
results
|
462
461
|
end
|
463
462
|
|
463
|
+
def self.schiff(page=1)
|
464
|
+
results = []
|
465
|
+
url = "http://schiff.house.gov/news/press-releases?PageNum_rs=#{page}&"
|
466
|
+
doc = open_html(url)
|
467
|
+
return if doc.nil?
|
468
|
+
rows = doc.css("#press").first.css('h2')
|
469
|
+
rows.each do |row|
|
470
|
+
results << { :source => url, :url => "http://schiff.house.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "schiff.house.gov" }
|
471
|
+
end
|
472
|
+
results
|
473
|
+
end
|
464
474
|
|
465
|
-
|
466
|
-
def self.lautenberg(rows=1000)
|
475
|
+
def self.takano(page=1)
|
467
476
|
results = []
|
468
|
-
|
469
|
-
url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
|
477
|
+
url = "http://takano.house.gov/newsroom/press-releases?PageNum_rs=#{page}"
|
470
478
|
doc = open_html(url)
|
471
479
|
return if doc.nil?
|
472
|
-
doc.
|
473
|
-
|
480
|
+
rows = doc.css("#press").first.css('h2')
|
481
|
+
rows.each do |row|
|
482
|
+
results << { :source => url, :url => "http://takano.house.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "takano.house.gov" }
|
483
|
+
end
|
484
|
+
results
|
485
|
+
end
|
486
|
+
|
487
|
+
def self.speier
|
488
|
+
results = []
|
489
|
+
url = "http://speier.house.gov/index.php?option=com_content&view=category&id=20&Itemid=14"
|
490
|
+
doc = open_html(url)
|
491
|
+
return if doc.nil?
|
492
|
+
rows = doc.css("table.category tr")
|
493
|
+
rows.each do |row|
|
494
|
+
results << { :source => url, :url => "http://speier.house.gov" + row.children[1].children[1]['href'], :title => row.children[1].children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => "speier.house.gov" }
|
495
|
+
end
|
496
|
+
results
|
497
|
+
end
|
498
|
+
|
499
|
+
def self.burr(page=1)
|
500
|
+
results = []
|
501
|
+
url = "http://www.burr.senate.gov/press/releases?PageNum_rs=#{page}&"
|
502
|
+
doc = open_html(url)
|
503
|
+
return if doc.nil?
|
504
|
+
rows = doc.css("#press").first.css('h2')
|
505
|
+
rows.each do |row|
|
506
|
+
results << { :source => url, :url => "http://www.burr.senate.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "burr.senate.gov" }
|
507
|
+
end
|
508
|
+
results
|
509
|
+
end
|
510
|
+
|
511
|
+
def self.cassidy(page=1)
|
512
|
+
results = []
|
513
|
+
url = "http://www.cassidy.senate.gov/newsroom/press-releases?PageNum_rs=#{page}&"
|
514
|
+
doc = open_html(url)
|
515
|
+
return if doc.nil?
|
516
|
+
rows = doc.css("#press").first.css('h2')
|
517
|
+
rows.each do |row|
|
518
|
+
results << { :source => url, :url => "http://www.cassidy.senate.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "cassidy.senate.gov" }
|
474
519
|
end
|
475
520
|
results
|
476
521
|
end
|
@@ -510,6 +555,17 @@ module Statement
|
|
510
555
|
results
|
511
556
|
end
|
512
557
|
|
558
|
+
def self.ellison(page=0)
|
559
|
+
results = []
|
560
|
+
url = "http://ellison.house.gov/media-center/press-releases?page=#{page}"
|
561
|
+
doc = open_html(url)
|
562
|
+
return if doc.nil?
|
563
|
+
doc.xpath("//div[@class='views-field views-field-created datebar']").each do |row|
|
564
|
+
results << { :source => url, :url => "http://ellison.house.gov" + row.next.next.children[1].children[0]['href'], :title => row.next.next.text.strip, :date => Date.parse(row.text.strip), :domain => "ellison.house.gov" }
|
565
|
+
end
|
566
|
+
results
|
567
|
+
end
|
568
|
+
|
513
569
|
def self.boxer
|
514
570
|
results = []
|
515
571
|
url = "http://www.boxer.senate.gov/press/release"
|
@@ -550,16 +606,13 @@ module Statement
|
|
550
606
|
results
|
551
607
|
end
|
552
608
|
|
553
|
-
def self.
|
609
|
+
def self.durbin(page=1)
|
554
610
|
results = []
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
doc.xpath("//div[@id='press']//h2").each do |row|
|
561
|
-
results << { :source => url, :url => "http://#{domain}"+row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.previous.previous.text.gsub(".","/")), :domain => domain}
|
562
|
-
end
|
611
|
+
url = "http://www.durbin.senate.gov/newsroom/press-releases?PageNum_rs=#{page}&"
|
612
|
+
doc = open_html(url)
|
613
|
+
return if doc.nil?
|
614
|
+
doc.xpath("//div[@id='press']//h2").each do |row|
|
615
|
+
results << { :source => url, :url => "http://www.durbin.senate.gov"+row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.previous.previous.text.gsub(".","/")), :domain => 'www.durbin.senate.gov'}
|
563
616
|
end
|
564
617
|
results
|
565
618
|
end
|
@@ -730,7 +783,7 @@ module Statement
|
|
730
783
|
{"clawson.house.gov" => 2641},
|
731
784
|
{"palazzo.house.gov" => 2519},
|
732
785
|
{"roe.house.gov" => 1532},
|
733
|
-
{"perry.house.gov" =>
|
786
|
+
{"perry.house.gov" => 2607},
|
734
787
|
{"rodneydavis.house.gov" => 2427},
|
735
788
|
{"kevinbrady.house.gov" => 2657},
|
736
789
|
{"loudermilk.house.gov" => 27},
|
@@ -742,7 +795,13 @@ module Statement
|
|
742
795
|
{"grothman.house.gov" => 27},
|
743
796
|
{"beyer.house.gov" => 27},
|
744
797
|
{"kathleenrice.house.gov" => 27},
|
745
|
-
{"hanna.house.gov" => 27}
|
798
|
+
{"hanna.house.gov" => 27},
|
799
|
+
{"trentkelly.house.gov" => 27},
|
800
|
+
{"lamborn.house.gov" => 27},
|
801
|
+
{"wittman.house.gov" => 2670},
|
802
|
+
{"kinzinger.house.gov" => 2665},
|
803
|
+
{"ellmers.house.gov" => 27},
|
804
|
+
{"frankel.house.gov" => 27}
|
746
805
|
]
|
747
806
|
domains.each do |domain|
|
748
807
|
doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
|
@@ -792,23 +851,6 @@ module Statement
|
|
792
851
|
results
|
793
852
|
end
|
794
853
|
|
795
|
-
def self.lamborn(limit=nil)
|
796
|
-
results = []
|
797
|
-
domain = 'lamborn.house.gov'
|
798
|
-
url = "http://lamborn.house.gov/2015-press-releases/"
|
799
|
-
doc = open_html(url)
|
800
|
-
return if doc.nil?
|
801
|
-
links = (doc/:h3).map{|h| { "http://lamborn.house.gov"+h.children[1]['href'] => h.text.strip} }
|
802
|
-
links = links.first(limit) if limit
|
803
|
-
links.each do |link|
|
804
|
-
page = open_html(link.keys.first)
|
805
|
-
print_path = page.search("a").detect{|a| a['onclick'] && a['onclick'].include?('popup')}['onclick'].split("'")[1]
|
806
|
-
print_page = open_html("http://lamborn.house.gov"+print_path)
|
807
|
-
results << {:source => url, :url => link.keys.first, :title => link.values.first, :date => Date.parse(print_page.xpath('//*[@class="PopupNewsDetailsDate"]').text), :domain => domain }
|
808
|
-
end
|
809
|
-
results
|
810
|
-
end
|
811
|
-
|
812
854
|
def self.jenkins
|
813
855
|
results = []
|
814
856
|
domain = 'lynnjenkins.house.gov/'
|
@@ -821,18 +863,6 @@ module Statement
|
|
821
863
|
results
|
822
864
|
end
|
823
865
|
|
824
|
-
def self.walden
|
825
|
-
results = []
|
826
|
-
domain = 'walden.house.gov'
|
827
|
-
url = "http://walden.house.gov/s2015/"
|
828
|
-
doc = open_html(url)
|
829
|
-
return if doc.nil?
|
830
|
-
doc.xpath('//*[@id="centerbox"]/div[1]/ul/li').each do |row|
|
831
|
-
results << {:source => url, :url => 'http://walden.house.gov' + row.children[3].children[1]['href'], :title => row.children[3].text.strip, :date => Date.parse(row.children[5].text), :domain => domain }
|
832
|
-
end
|
833
|
-
results
|
834
|
-
end
|
835
|
-
|
836
866
|
def self.bennie_thompson
|
837
867
|
results = []
|
838
868
|
domain = "benniethompson.house.gov"
|
@@ -845,14 +875,14 @@ module Statement
|
|
845
875
|
results
|
846
876
|
end
|
847
877
|
|
848
|
-
def self.
|
878
|
+
def self.lowey(page=0)
|
849
879
|
results = []
|
850
|
-
domain = "
|
851
|
-
url = "
|
880
|
+
domain = "lowey.house.gov"
|
881
|
+
url = "https://lowey.house.gov/media-center/press-releases?page=#{page}"
|
852
882
|
doc = open_html(url)
|
853
883
|
return if doc.nil?
|
854
|
-
doc.
|
855
|
-
results << {:source => url, :url => 'http://
|
884
|
+
doc.css(".view-content .views-row").first(10).each do |row|
|
885
|
+
results << {:source => url, :url => 'http://lowey.house.gov' + row.css('h3').first.children.first['href'], :title => row.css('h3').first.children.first.text.strip, :date => Date.parse(row.css(".views-field .field-content")[1].text), :domain => domain }
|
856
886
|
end
|
857
887
|
results
|
858
888
|
end
|
@@ -928,14 +958,19 @@ module Statement
|
|
928
958
|
"http://butterfield.house.gov/media-center/press-releases",
|
929
959
|
"http://walz.house.gov/media-center/press-releases",
|
930
960
|
"https://pingree.house.gov/media-center/press-releases",
|
931
|
-
"http://sarbanes.house.gov/media-center/press-releases",
|
932
961
|
"http://wilson.house.gov/media-center/press-releases",
|
933
962
|
"https://bilirakis.house.gov/press-releases",
|
934
963
|
"http://quigley.house.gov/media-center/press-releases",
|
935
964
|
"https://denham.house.gov/media-center/press-releases",
|
936
965
|
"https://sewell.house.gov/media-center/press-releases",
|
937
966
|
"https://buchanan.house.gov/media-center/press-releases",
|
938
|
-
"https://meehan.house.gov/media-center/press-releases"
|
967
|
+
"https://meehan.house.gov/media-center/press-releases",
|
968
|
+
"https://olson.house.gov/media-center/press-releases",
|
969
|
+
"https://louise.house.gov/media-center/press-releases",
|
970
|
+
"https://waters.house.gov/media-center/press-releases",
|
971
|
+
"https://walden.house.gov/media-center/press-releases",
|
972
|
+
"https://brooks.house.gov/media-center/news-releases",
|
973
|
+
"https://swalwell.house.gov/media-center/press-releases"
|
939
974
|
]
|
940
975
|
end
|
941
976
|
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: '2.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -205,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
205
205
|
version: '0'
|
206
206
|
requirements: []
|
207
207
|
rubyforge_project:
|
208
|
-
rubygems_version: 2.
|
208
|
+
rubygems_version: 2.4.5
|
209
209
|
signing_key:
|
210
210
|
specification_version: 4
|
211
211
|
summary: Given a url, Statement returns links to press releases and official statements.
|