statement 2.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/feed.rb +15 -1
- data/lib/statement/scraper.rb +38 -4
- data/lib/statement/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74eeb1be0ee4ada11df34f5b8ddce00ffd5649cf
|
4
|
+
data.tar.gz: dd7fa7d9e6b0ccf0ae0bd3cdfa3f61f9306f4ef4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 392340713cdf259bff3e60816d3ea12966a09127b0cad75c65f71307b18a692812bb7dc594fc332bbe9c49df15d52f6ab9d396d27795890d736de65ff10330a7
|
7
|
+
data.tar.gz: a80659fe9a3a651f8db4585112dd193fe413762d1409afd4703ff1d63454599298c22416e4f1f8735897a04169e4e09d5c3436a3e8699c67d7f124654423033f
|
data/lib/statement/feed.rb
CHANGED
@@ -17,6 +17,7 @@ module Statement
|
|
17
17
|
req.on_complete do |response|
|
18
18
|
if response.success?
|
19
19
|
doc = Nokogiri::XML(response.body)
|
20
|
+
results << parse_atom(doc, url) if url == "http://larson.house.gov/index.php?option=com_ninjarsssyndicator&feed_id=1&format=raw"
|
20
21
|
results << parse_rss(doc, url)
|
21
22
|
else
|
22
23
|
failures << url
|
@@ -51,7 +52,11 @@ module Statement
|
|
51
52
|
# Opens the feed at +url+ and hands the parsed document to the matching parser.
#
# The Larson feed URL is routed to parse_atom; every other URL goes to parse_rss.
# Returns nil when open_rss yields no document.
def self.from_rss(url)
  doc = open_rss(url)
  return unless doc

  atom_feed = (url == "http://larson.house.gov/index.php?option=com_ninjarsssyndicator&feed_id=1&format=raw")
  atom_feed ? parse_atom(doc, url) : parse_rss(doc, url)
end
|
56
61
|
|
57
62
|
def self.parse_rss(doc, url)
|
@@ -65,5 +70,14 @@ module Statement
|
|
65
70
|
end
|
66
71
|
Utils.remove_generic_urls!(results)
|
67
72
|
end
|
73
|
+
|
74
|
+
# Builds one result hash per <entry> element found in +doc+.
#
# Fields are read by child position inside each entry: children[1] supplies the
# title text, children[3] the link's 'href' attribute and children[5] the date text.
# NOTE(review): the positional lookup presumably matches the layout of the single
# feed this parser is used for — confirm before pointing it at another feed.
#
# @param doc document that answers `doc/:entry` with a collection of entry nodes
# @param url [String] feed URL; stored as :source and used to derive :domain
# @return [Array<Hash>, nil] hashes with :source, :url, :title, :date and :domain,
#   or nil when the document has no entries
def self.parse_atom(doc, url)
  entries = (doc/:entry)
  return if entries.empty?

  # The host is the same for every entry, so parse the feed URL only once.
  domain = URI.parse(url).host
  entries.map do |entry|
    fields = entry.children
    {
      :source => url,
      :url => fields[3]['href'],
      :title => fields[1].text,
      :date => Date.parse(fields[5].text),
      :domain => domain
    }
  end
end
|
81
|
+
|
68
82
|
end
|
69
83
|
end
|
data/lib/statement/scraper.rb
CHANGED
@@ -32,7 +32,7 @@ module Statement
|
|
32
32
|
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :freshman_senators, :klobuchar, :billnelson, :crapo, :boxer,
|
33
33
|
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
34
34
|
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :lamborn, :walden,
|
35
|
-
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins]
|
35
|
+
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin_burr]
|
36
36
|
end
|
37
37
|
|
38
38
|
def self.committee_methods
|
@@ -45,7 +45,7 @@ module Statement
|
|
45
45
|
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0),
|
46
46
|
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, welch,
|
47
47
|
sessions(year=year), gabbard, costa, farr, olson, schumer, lamborn(limit=10), walden, bennie_thompson, speier,
|
48
|
-
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins].flatten
|
48
|
+
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins, durbin_burr(page=1)].flatten
|
49
49
|
results = results.compact
|
50
50
|
Utils.remove_generic_urls!(results)
|
51
51
|
end
|
@@ -491,6 +491,20 @@ module Statement
|
|
491
491
|
results
|
492
492
|
end
|
493
493
|
|
494
|
+
# Scrapes one page of press releases from the Durbin and Burr Senate sites,
# using the same URL pattern and XPath query for both domains.
#
# For each <h2> under div#press, the first child supplies the link ('href') and
# the title text; the date text is read from the node two siblings before the
# heading, with dots replaced by slashes before it is handed to Date.parse.
#
# @param page [Integer] value sent as the PageNum_rs query parameter
# @return [Array<Hash>] hashes with :source, :url, :title, :date and :domain;
#   empty when neither site could be fetched
def self.durbin_burr(page=1)
  results = []
  domains = ["www.durbin.senate.gov", "www.burr.senate.gov"]
  domains.each do |domain|
    url = "http://#{domain}/newsroom/press-releases?PageNum_rs=#{page}&"
    doc = open_html(url)
    # A failed fetch for one site must not throw away the other site's releases:
    # skip just this domain rather than returning nil from the whole method.
    next if doc.nil?

    doc.xpath("//div[@id='press']//h2").each do |row|
      link = row.children[0]
      results << {
        :source => url,
        :url => "http://#{domain}" + link['href'],
        :title => link.text.strip,
        :date => Date.parse(row.previous.previous.text.tr(".", "/")),
        :domain => domain
      }
    end
  end
  results
end
|
507
|
+
|
494
508
|
def self.inhofe(year=Date.today.year)
|
495
509
|
results = []
|
496
510
|
url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
|
@@ -651,7 +665,25 @@ module Statement
|
|
651
665
|
|
652
666
|
def self.document_query(page=1)
|
653
667
|
results = []
|
654
|
-
domains = [
|
668
|
+
domains = [
|
669
|
+
{"thornberry.house.gov" => 1776},
|
670
|
+
{"wenstrup.house.gov" => 2491},
|
671
|
+
{"clawson.house.gov" => 2641},
|
672
|
+
{"palazzo.house.gov" => 2519},
|
673
|
+
{"roe.house.gov" => 1532},
|
674
|
+
{"perry.house.gov" => 2608},
|
675
|
+
{"rodneydavis.house.gov" => 2427},
|
676
|
+
{"kevinbrady.house.gov" => 2657},
|
677
|
+
{"loudermilk.house.gov" => 27},
|
678
|
+
{"babin.house.gov" => 27},
|
679
|
+
{"bridenstine.house.gov" => 2412},
|
680
|
+
{"allen.house.gov" => 27},
|
681
|
+
{"davidscott.house.gov" => 377},
|
682
|
+
{"buddycarter.house.gov" => 27},
|
683
|
+
{"grothman.house.gov" => 27},
|
684
|
+
{"beyer.house.gov" => 27},
|
685
|
+
{"kathleenrice.house.gov" => 27}
|
686
|
+
]
|
655
687
|
domains.each do |domain|
|
656
688
|
doc = open_html("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}")
|
657
689
|
return if doc.nil?
|
@@ -827,7 +859,9 @@ module Statement
|
|
827
859
|
"http://sarbanes.house.gov/media-center/press-releases",
|
828
860
|
"http://wilson.house.gov/media-center/press-releases",
|
829
861
|
"https://bilirakis.house.gov/press-releases",
|
830
|
-
"http://quigley.house.gov/media-center/press-releases"
|
862
|
+
"http://quigley.house.gov/media-center/press-releases",
|
863
|
+
"https://denham.house.gov/media-center/press-releases",
|
864
|
+
"https://sewell.house.gov/media-center/press-releases"
|
831
865
|
]
|
832
866
|
end
|
833
867
|
|
data/lib/statement/version.rb
CHANGED