statement 2.0.4 → 2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +113 -78
- data/lib/statement/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 92cb454019d854f249abc59ab4a0568b7bf98c0e
|
|
4
|
+
data.tar.gz: bc6b887f904c4e7bca2990bf897ad67e694cd4ef
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ea5f3994b277210e343c59c2fdbeb10b36194036ec6c0e3bd78de9d528542c7105674e440118482e63317f17f24906fc36784cf15d19506b7a6afe95ba1aa31
|
|
7
|
+
data.tar.gz: 651466b0481dcf50682bae862f3f8f5d346636db21ede45c96e8cfe7b08644b9ce9efe77bcf1286a5121a619401a2b83151a5dd3c242fda5ecded34370f029bb
|
data/lib/statement/scraper.rb
CHANGED
|
@@ -29,10 +29,10 @@ module Statement
|
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
def self.member_methods
|
|
32
|
-
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :
|
|
33
|
-
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
|
34
|
-
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :
|
|
35
|
-
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :
|
|
32
|
+
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :klobuchar, :billnelson, :crapo, :boxer, :burr, :ellison,
|
|
33
|
+
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton, :schiff,
|
|
34
|
+
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :cassidy, :lowey, :mcmorris, :takano,
|
|
35
|
+
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin, :rand_paul]
|
|
36
36
|
end
|
|
37
37
|
|
|
38
38
|
def self.committee_methods
|
|
@@ -41,11 +41,11 @@ module Statement
|
|
|
41
41
|
|
|
42
42
|
def self.member_scrapers
|
|
43
43
|
year = Date.today.year
|
|
44
|
-
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(page=0),
|
|
45
|
-
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0),
|
|
44
|
+
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(page=0), ellison,
|
|
45
|
+
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0), burr, cassidy,
|
|
46
46
|
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, welch,
|
|
47
|
-
sessions(year=year), gabbard, costa, farr, olson, schumer,
|
|
48
|
-
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins,
|
|
47
|
+
sessions(year=year), gabbard, costa, farr, olson, schumer, bennie_thompson, speier, lowey, mcmorris, schiff, takano,
|
|
48
|
+
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins, durbin(page=1),
|
|
49
49
|
rand_paul(page = 1)].flatten
|
|
50
50
|
results = results.compact
|
|
51
51
|
Utils.remove_generic_urls!(results)
|
|
@@ -53,11 +53,12 @@ module Statement
|
|
|
53
53
|
|
|
54
54
|
def self.backfill_from_scrapers
|
|
55
55
|
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
|
56
|
-
document_query(page=4), grassley(page=1), grassley(page=2), grassley(page=3),
|
|
56
|
+
document_query(page=4), grassley(page=1), grassley(page=2), grassley(page=3), burr(page=2), burr(page=3), burr(page=4),
|
|
57
57
|
vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3), clark(year=2013), culberson_chabot_grisham(page=2),
|
|
58
|
-
sessions(year=2013), pryor(page=1), farr(year=2013), farr(year=2012), farr(year=2011),
|
|
59
|
-
olson(year=2013), schumer(page=2), schumer(page=3), poe(year=2015, month=2),
|
|
60
|
-
poe(year=2015, month=1)
|
|
58
|
+
sessions(year=2013), pryor(page=1), farr(year=2013), farr(year=2012), farr(year=2011), cassidy(page=2), cassidy(page=3),
|
|
59
|
+
olson(year=2013), schumer(page=2), schumer(page=3), poe(year=2015, month=2), ellison(page=1), ellison(page=2), lowey(page=1),
|
|
60
|
+
lowey(page=2), lowey(page=3), poe(year=2015, month=1), mcmorris(page=2), mcmorris(page=3), schiff(page=2), schiff(page=3),
|
|
61
|
+
takano(page=2), takano(page=3)].flatten
|
|
61
62
|
Utils.remove_generic_urls!(results)
|
|
62
63
|
end
|
|
63
64
|
|
|
@@ -247,19 +248,6 @@ module Statement
|
|
|
247
248
|
results
|
|
248
249
|
end
|
|
249
250
|
|
|
250
|
-
def self.boehner(page=1, year=Date.today.year)
|
|
251
|
-
results = []
|
|
252
|
-
url = "http://boehner.house.gov/category/press-releases/page/#{page}/"
|
|
253
|
-
doc = open_html(url)
|
|
254
|
-
return if doc.nil?
|
|
255
|
-
(doc/:article).each do |row|
|
|
256
|
-
month = row.children[1].children[1].children[1].children[0].text
|
|
257
|
-
day = row.children[1].children[1].children[1].children[1].text
|
|
258
|
-
results << { :source => url, :url => "http://boehner.house.gov"+row.children[12].children[1].children[0]['href'], :title => row.children[1].children[1].children[3].text, :date => Date.parse(month+" "+day+" "+year.to_s), :domain => 'boehner.house.gov'}
|
|
259
|
-
end
|
|
260
|
-
results
|
|
261
|
-
end
|
|
262
|
-
|
|
263
251
|
def self.capuano
|
|
264
252
|
results = []
|
|
265
253
|
base_url = "http://www.house.gov/capuano/news/"
|
|
@@ -365,6 +353,17 @@ module Statement
|
|
|
365
353
|
results
|
|
366
354
|
end
|
|
367
355
|
|
|
356
|
+
def self.mcmorris(page=1)
|
|
357
|
+
results = []
|
|
358
|
+
url = "http://mcmorris.house.gov/issues/page/#{page}/?tax=types&term=news_releases"
|
|
359
|
+
doc = open_html(url)
|
|
360
|
+
return if doc.nil?
|
|
361
|
+
doc.css(".feed-result").each do |row|
|
|
362
|
+
results << { :source => url, :url => row.children[3].children[3].children.first['href'], :title => row.children[3].children[3].children.first.text.strip, :date => Date.parse(row.children[3].children[1].text), :domain => "mcmorris.house.gov" }
|
|
363
|
+
end
|
|
364
|
+
results
|
|
365
|
+
end
|
|
366
|
+
|
|
368
367
|
def self.klobuchar(year)
|
|
369
368
|
results = []
|
|
370
369
|
base_url = "http://www.klobuchar.senate.gov/"
|
|
@@ -461,16 +460,62 @@ module Statement
|
|
|
461
460
|
results
|
|
462
461
|
end
|
|
463
462
|
|
|
463
|
+
def self.schiff(page=1)
|
|
464
|
+
results = []
|
|
465
|
+
url = "http://schiff.house.gov/news/press-releases?PageNum_rs=#{page}&"
|
|
466
|
+
doc = open_html(url)
|
|
467
|
+
return if doc.nil?
|
|
468
|
+
rows = doc.css("#press").first.css('h2')
|
|
469
|
+
rows.each do |row|
|
|
470
|
+
results << { :source => url, :url => "http://schiff.house.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "schiff.house.gov" }
|
|
471
|
+
end
|
|
472
|
+
results
|
|
473
|
+
end
|
|
464
474
|
|
|
465
|
-
|
|
466
|
-
def self.lautenberg(rows=1000)
|
|
475
|
+
def self.takano(page=1)
|
|
467
476
|
results = []
|
|
468
|
-
|
|
469
|
-
url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
|
|
477
|
+
url = "http://takano.house.gov/newsroom/press-releases?PageNum_rs=#{page}"
|
|
470
478
|
doc = open_html(url)
|
|
471
479
|
return if doc.nil?
|
|
472
|
-
doc.
|
|
473
|
-
|
|
480
|
+
rows = doc.css("#press").first.css('h2')
|
|
481
|
+
rows.each do |row|
|
|
482
|
+
results << { :source => url, :url => "http://takano.house.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "takano.house.gov" }
|
|
483
|
+
end
|
|
484
|
+
results
|
|
485
|
+
end
|
|
486
|
+
|
|
487
|
+
def self.speier
|
|
488
|
+
results = []
|
|
489
|
+
url = "http://speier.house.gov/index.php?option=com_content&view=category&id=20&Itemid=14"
|
|
490
|
+
doc = open_html(url)
|
|
491
|
+
return if doc.nil?
|
|
492
|
+
rows = doc.css("table.category tr")
|
|
493
|
+
rows.each do |row|
|
|
494
|
+
results << { :source => url, :url => "http://speier.house.gov" + row.children[1].children[1]['href'], :title => row.children[1].children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => "speier.house.gov" }
|
|
495
|
+
end
|
|
496
|
+
results
|
|
497
|
+
end
|
|
498
|
+
|
|
499
|
+
def self.burr(page=1)
|
|
500
|
+
results = []
|
|
501
|
+
url = "http://www.burr.senate.gov/press/releases?PageNum_rs=#{page}&"
|
|
502
|
+
doc = open_html(url)
|
|
503
|
+
return if doc.nil?
|
|
504
|
+
rows = doc.css("#press").first.css('h2')
|
|
505
|
+
rows.each do |row|
|
|
506
|
+
results << { :source => url, :url => "http://www.burr.senate.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "burr.senate.gov" }
|
|
507
|
+
end
|
|
508
|
+
results
|
|
509
|
+
end
|
|
510
|
+
|
|
511
|
+
def self.cassidy(page=1)
|
|
512
|
+
results = []
|
|
513
|
+
url = "http://www.cassidy.senate.gov/newsroom/press-releases?PageNum_rs=#{page}&"
|
|
514
|
+
doc = open_html(url)
|
|
515
|
+
return if doc.nil?
|
|
516
|
+
rows = doc.css("#press").first.css('h2')
|
|
517
|
+
rows.each do |row|
|
|
518
|
+
results << { :source => url, :url => "http://www.cassidy.senate.gov" + row.children.first['href'], :title => row.children.last.text.strip, :date => Date.strptime(row.previous.previous.text, "%m.%d.%y"), :domain => "cassidy.senate.gov" }
|
|
474
519
|
end
|
|
475
520
|
results
|
|
476
521
|
end
|
|
@@ -510,6 +555,17 @@ module Statement
|
|
|
510
555
|
results
|
|
511
556
|
end
|
|
512
557
|
|
|
558
|
+
def self.ellison(page=0)
|
|
559
|
+
results = []
|
|
560
|
+
url = "http://ellison.house.gov/media-center/press-releases?page=#{page}"
|
|
561
|
+
doc = open_html(url)
|
|
562
|
+
return if doc.nil?
|
|
563
|
+
doc.xpath("//div[@class='views-field views-field-created datebar']").each do |row|
|
|
564
|
+
results << { :source => url, :url => "http://ellison.house.gov" + row.next.next.children[1].children[0]['href'], :title => row.next.next.text.strip, :date => Date.parse(row.text.strip), :domain => "ellison.house.gov" }
|
|
565
|
+
end
|
|
566
|
+
results
|
|
567
|
+
end
|
|
568
|
+
|
|
513
569
|
def self.boxer
|
|
514
570
|
results = []
|
|
515
571
|
url = "http://www.boxer.senate.gov/press/release"
|
|
@@ -550,16 +606,13 @@ module Statement
|
|
|
550
606
|
results
|
|
551
607
|
end
|
|
552
608
|
|
|
553
|
-
def self.
|
|
609
|
+
def self.durbin(page=1)
|
|
554
610
|
results = []
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
doc.xpath("//div[@id='press']//h2").each do |row|
|
|
561
|
-
results << { :source => url, :url => "http://#{domain}"+row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.previous.previous.text.gsub(".","/")), :domain => domain}
|
|
562
|
-
end
|
|
611
|
+
url = "http://www.durbin.senate.gov/newsroom/press-releases?PageNum_rs=#{page}&"
|
|
612
|
+
doc = open_html(url)
|
|
613
|
+
return if doc.nil?
|
|
614
|
+
doc.xpath("//div[@id='press']//h2").each do |row|
|
|
615
|
+
results << { :source => url, :url => "http://www.durbin.senate.gov"+row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.previous.previous.text.gsub(".","/")), :domain => 'www.durbin.senate.gov'}
|
|
563
616
|
end
|
|
564
617
|
results
|
|
565
618
|
end
|
|
@@ -730,7 +783,7 @@ module Statement
|
|
|
730
783
|
{"clawson.house.gov" => 2641},
|
|
731
784
|
{"palazzo.house.gov" => 2519},
|
|
732
785
|
{"roe.house.gov" => 1532},
|
|
733
|
-
{"perry.house.gov" =>
|
|
786
|
+
{"perry.house.gov" => 2607},
|
|
734
787
|
{"rodneydavis.house.gov" => 2427},
|
|
735
788
|
{"kevinbrady.house.gov" => 2657},
|
|
736
789
|
{"loudermilk.house.gov" => 27},
|
|
@@ -742,7 +795,13 @@ module Statement
|
|
|
742
795
|
{"grothman.house.gov" => 27},
|
|
743
796
|
{"beyer.house.gov" => 27},
|
|
744
797
|
{"kathleenrice.house.gov" => 27},
|
|
745
|
-
{"hanna.house.gov" => 27}
|
|
798
|
+
{"hanna.house.gov" => 27},
|
|
799
|
+
{"trentkelly.house.gov" => 27},
|
|
800
|
+
{"lamborn.house.gov" => 27},
|
|
801
|
+
{"wittman.house.gov" => 2670},
|
|
802
|
+
{"kinzinger.house.gov" => 2665},
|
|
803
|
+
{"ellmers.house.gov" => 27},
|
|
804
|
+
{"frankel.house.gov" => 27}
|
|
746
805
|
]
|
|
747
806
|
domains.each do |domain|
|
|
748
807
|
doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
|
|
@@ -792,23 +851,6 @@ module Statement
|
|
|
792
851
|
results
|
|
793
852
|
end
|
|
794
853
|
|
|
795
|
-
def self.lamborn(limit=nil)
|
|
796
|
-
results = []
|
|
797
|
-
domain = 'lamborn.house.gov'
|
|
798
|
-
url = "http://lamborn.house.gov/2015-press-releases/"
|
|
799
|
-
doc = open_html(url)
|
|
800
|
-
return if doc.nil?
|
|
801
|
-
links = (doc/:h3).map{|h| { "http://lamborn.house.gov"+h.children[1]['href'] => h.text.strip} }
|
|
802
|
-
links = links.first(limit) if limit
|
|
803
|
-
links.each do |link|
|
|
804
|
-
page = open_html(link.keys.first)
|
|
805
|
-
print_path = page.search("a").detect{|a| a['onclick'] && a['onclick'].include?('popup')}['onclick'].split("'")[1]
|
|
806
|
-
print_page = open_html("http://lamborn.house.gov"+print_path)
|
|
807
|
-
results << {:source => url, :url => link.keys.first, :title => link.values.first, :date => Date.parse(print_page.xpath('//*[@class="PopupNewsDetailsDate"]').text), :domain => domain }
|
|
808
|
-
end
|
|
809
|
-
results
|
|
810
|
-
end
|
|
811
|
-
|
|
812
854
|
def self.jenkins
|
|
813
855
|
results = []
|
|
814
856
|
domain = 'lynnjenkins.house.gov/'
|
|
@@ -821,18 +863,6 @@ module Statement
|
|
|
821
863
|
results
|
|
822
864
|
end
|
|
823
865
|
|
|
824
|
-
def self.walden
|
|
825
|
-
results = []
|
|
826
|
-
domain = 'walden.house.gov'
|
|
827
|
-
url = "http://walden.house.gov/s2015/"
|
|
828
|
-
doc = open_html(url)
|
|
829
|
-
return if doc.nil?
|
|
830
|
-
doc.xpath('//*[@id="centerbox"]/div[1]/ul/li').each do |row|
|
|
831
|
-
results << {:source => url, :url => 'http://walden.house.gov' + row.children[3].children[1]['href'], :title => row.children[3].text.strip, :date => Date.parse(row.children[5].text), :domain => domain }
|
|
832
|
-
end
|
|
833
|
-
results
|
|
834
|
-
end
|
|
835
|
-
|
|
836
866
|
def self.bennie_thompson
|
|
837
867
|
results = []
|
|
838
868
|
domain = "benniethompson.house.gov"
|
|
@@ -845,14 +875,14 @@ module Statement
|
|
|
845
875
|
results
|
|
846
876
|
end
|
|
847
877
|
|
|
848
|
-
def self.
|
|
878
|
+
def self.lowey(page=0)
|
|
849
879
|
results = []
|
|
850
|
-
domain = "
|
|
851
|
-
url = "
|
|
880
|
+
domain = "lowey.house.gov"
|
|
881
|
+
url = "https://lowey.house.gov/media-center/press-releases?page=#{page}"
|
|
852
882
|
doc = open_html(url)
|
|
853
883
|
return if doc.nil?
|
|
854
|
-
doc.
|
|
855
|
-
results << {:source => url, :url => 'http://
|
|
884
|
+
doc.css(".view-content .views-row").first(10).each do |row|
|
|
885
|
+
results << {:source => url, :url => 'http://lowey.house.gov' + row.css('h3').first.children.first['href'], :title => row.css('h3').first.children.first.text.strip, :date => Date.parse(row.css(".views-field .field-content")[1].text), :domain => domain }
|
|
856
886
|
end
|
|
857
887
|
results
|
|
858
888
|
end
|
|
@@ -928,14 +958,19 @@ module Statement
|
|
|
928
958
|
"http://butterfield.house.gov/media-center/press-releases",
|
|
929
959
|
"http://walz.house.gov/media-center/press-releases",
|
|
930
960
|
"https://pingree.house.gov/media-center/press-releases",
|
|
931
|
-
"http://sarbanes.house.gov/media-center/press-releases",
|
|
932
961
|
"http://wilson.house.gov/media-center/press-releases",
|
|
933
962
|
"https://bilirakis.house.gov/press-releases",
|
|
934
963
|
"http://quigley.house.gov/media-center/press-releases",
|
|
935
964
|
"https://denham.house.gov/media-center/press-releases",
|
|
936
965
|
"https://sewell.house.gov/media-center/press-releases",
|
|
937
966
|
"https://buchanan.house.gov/media-center/press-releases",
|
|
938
|
-
"https://meehan.house.gov/media-center/press-releases"
|
|
967
|
+
"https://meehan.house.gov/media-center/press-releases",
|
|
968
|
+
"https://olson.house.gov/media-center/press-releases",
|
|
969
|
+
"https://louise.house.gov/media-center/press-releases",
|
|
970
|
+
"https://waters.house.gov/media-center/press-releases",
|
|
971
|
+
"https://walden.house.gov/media-center/press-releases",
|
|
972
|
+
"https://brooks.house.gov/media-center/news-releases",
|
|
973
|
+
"https://swalwell.house.gov/media-center/press-releases"
|
|
939
974
|
]
|
|
940
975
|
end
|
|
941
976
|
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: statement
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.
|
|
4
|
+
version: '2.1'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Derek Willis
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2015-
|
|
11
|
+
date: 2015-12-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -205,7 +205,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
205
205
|
version: '0'
|
|
206
206
|
requirements: []
|
|
207
207
|
rubyforge_project:
|
|
208
|
-
rubygems_version: 2.
|
|
208
|
+
rubygems_version: 2.4.5
|
|
209
209
|
signing_key:
|
|
210
210
|
specification_version: 4
|
|
211
211
|
summary: Given a url, Statement returns links to press releases and official statements.
|