statement 1.9.1 → 1.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/feed.rb +5 -5
- data/lib/statement/scraper.rb +89 -16
- data/lib/statement/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0be73a63b6661e11498742d7877c36ed2da5e182
|
4
|
+
data.tar.gz: 9368bd6373159d94b9463e57e657616c440d9c01
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4ecf8b06def3327c5eafb8987b3037c793a08cf1ff85a54ceaf8823c3dce1a814289fecda74d359b49da75197ff57e104b60db42899cd2f2d7307c25bd930f0c
|
7
|
+
data.tar.gz: d70e2b77e900bbbd3122cce1665bc9d92293063c87bd4ce39b7f82b732e672152e8f4f1750160790942142f39cd4773beb8b8c082baf4714647fd4152c63aebf
|
data/lib/statement/feed.rb
CHANGED
@@ -7,7 +7,7 @@ require 'typhoeus'
|
|
7
7
|
|
8
8
|
module Statement
|
9
9
|
class Feed
|
10
|
-
|
10
|
+
|
11
11
|
def self.batch(urls)
|
12
12
|
results = []
|
13
13
|
failures = []
|
@@ -27,7 +27,7 @@ module Statement
|
|
27
27
|
hydra.run
|
28
28
|
[results.flatten, failures]
|
29
29
|
end
|
30
|
-
|
30
|
+
|
31
31
|
def self.open_rss(url)
|
32
32
|
begin
|
33
33
|
Nokogiri::XML(open(url))
|
@@ -35,10 +35,10 @@ module Statement
|
|
35
35
|
nil
|
36
36
|
end
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
def self.date_from_rss_item(link)
|
40
40
|
if !link.xpath('pubDate').text.empty?
|
41
|
-
|
41
|
+
Date.parse(link.xpath('pubDate').text)
|
42
42
|
elsif !link.xpath('pubdate').empty?
|
43
43
|
Date.parse(link.xpath('pubdate').text)
|
44
44
|
elsif link.xpath('link').text.include?("mikulski.senate.gov") and link.xpath('link').text.include?("-2014")
|
@@ -53,7 +53,7 @@ module Statement
|
|
53
53
|
return unless doc
|
54
54
|
parse_rss(doc, url)
|
55
55
|
end
|
56
|
-
|
56
|
+
|
57
57
|
def self.parse_rss(doc, url)
|
58
58
|
links = doc.xpath('//item')
|
59
59
|
return if links.empty?
|
data/lib/statement/scraper.rb
CHANGED
@@ -29,7 +29,10 @@ module Statement
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def self.member_methods
|
32
|
-
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :billnelson, :crapo, :boxer,
|
32
|
+
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :billnelson, :crapo, :boxer,
|
33
|
+
:vitter, :inhofe, :palazzo, :roe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
34
|
+
:sherman_mccaul, :welch, :sessions, :gabbard, :ellison, :costa, :farr, :mcclintock, :mcnerney, :olson, :schumer, :lamborn, :walden,
|
35
|
+
:bennie_thompson, :speier, :poe]
|
33
36
|
end
|
34
37
|
|
35
38
|
def self.committee_methods
|
@@ -40,8 +43,9 @@ module Statement
|
|
40
43
|
year = Date.today.year
|
41
44
|
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, susandavis, klobuchar(year), palazzo(page=1), roe(page=1), billnelson(year=year),
|
42
45
|
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, coburn, boxer(start=1),
|
43
|
-
vitter(year=year), inhofe(year=
|
44
|
-
sessions(year=year), gabbard, ellison(page=0), costa, farr, olson, mcnerney
|
46
|
+
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, sherman_mccaul, welch,
|
47
|
+
sessions(year=year), gabbard, ellison(page=0), costa, farr, olson, mcnerney, schumer, lamborn(limit=10), walden, bennie_thompson, speier,
|
48
|
+
poe(year=year, month=0)].flatten
|
45
49
|
results = results.compact
|
46
50
|
Utils.remove_generic_urls!(results)
|
47
51
|
end
|
@@ -51,7 +55,8 @@ module Statement
|
|
51
55
|
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
52
56
|
boxer(start=31), boxer(start=41), vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3), clark(year=2013), culberson_chabot_grisham(page=2),
|
53
57
|
sherman_mccaul(page=1), sessions(year=2013), pryor(page=1), ellison(page=1), ellison(page=2), ellison(page=3), farr(year=2013), farr(year=2012), farr(year=2011),
|
54
|
-
mcnerney(page=2), mcnerney(page=3), mcnerney(page=4), mcnerney(page=5), mcnerney(page=6), olson(year=2013)
|
58
|
+
mcnerney(page=2), mcnerney(page=3), mcnerney(page=4), mcnerney(page=5), mcnerney(page=6), olson(year=2013), schumer(page=2), schumer(page=3), poe(year=2015, month=2),
|
59
|
+
poe(year=2015, month=1)].flatten
|
55
60
|
Utils.remove_generic_urls!(results)
|
56
61
|
end
|
57
62
|
|
@@ -334,18 +339,6 @@ module Statement
|
|
334
339
|
results
|
335
340
|
end
|
336
341
|
|
337
|
-
def self.susandavis
|
338
|
-
results = []
|
339
|
-
base_url = "http://www.house.gov/susandavis/"
|
340
|
-
doc = open_html(base_url+'news.shtml')
|
341
|
-
return if doc.nil?
|
342
|
-
doc.search("ul")[6].children.each do |row|
|
343
|
-
next if row.text.strip == ''
|
344
|
-
results << { :source => base_url+'news.shtml', :url => base_url + row.children[1]['href'], :title => row.children[1].text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
|
345
|
-
end
|
346
|
-
results
|
347
|
-
end
|
348
|
-
|
349
342
|
def self.klobuchar(year)
|
350
343
|
results = []
|
351
344
|
base_url = "http://www.klobuchar.senate.gov/"
|
@@ -361,6 +354,20 @@ module Statement
|
|
361
354
|
results
|
362
355
|
end
|
363
356
|
|
357
|
+
def self.poe(year, month=0)
|
358
|
+
results = []
|
359
|
+
base_url = "http://poe.house.gov"
|
360
|
+
month_url = base_url + "/press-releases?MonthDisplay=#{month}&YearDisplay=#{year}"
|
361
|
+
doc = open_html(year_url)
|
362
|
+
return if doc.nil?
|
363
|
+
doc.xpath("//tr")[1..-1].each do |row|
|
364
|
+
next if row.children[3].children[0].text.strip == 'Title'
|
365
|
+
results << { :source => month_url, :url => base_url + row.children[3].children[0]['href'], :title => row.children[3].children[0].text.strip, :date => Date.strptime(row.children[1].text, "%m/%d/%y"), :domain => "poe.house.gov" }
|
366
|
+
end
|
367
|
+
end
|
368
|
+
results
|
369
|
+
end
|
370
|
+
|
364
371
|
def self.lujan
|
365
372
|
results = []
|
366
373
|
base_url = 'http://lujan.house.gov/'
|
@@ -709,6 +716,72 @@ module Statement
|
|
709
716
|
results.flatten
|
710
717
|
end
|
711
718
|
|
719
|
+
def self.schumer(page=1)
|
720
|
+
results = []
|
721
|
+
domain = 'www.schumer.senate.gov'
|
722
|
+
url = "http://www.schumer.senate.gov/newsroom/press-releases/table?PageNum_rs=#{page}"
|
723
|
+
doc = open_html(url)
|
724
|
+
return if doc.nil?
|
725
|
+
rows = (doc/:table/:tr).select{|r| !r.children[3].nil?}
|
726
|
+
rows.each do |row|
|
727
|
+
results << {:source => url, :url => row.children[3].children[1]['href'].strip, :title => row.children[3].text.strip, :date => Date.parse(row.children[1].text.strip), :domain => domain }
|
728
|
+
end
|
729
|
+
results
|
730
|
+
end
|
731
|
+
|
732
|
+
def self.lamborn(limit=nil)
|
733
|
+
results = []
|
734
|
+
domain = 'lamborn.house.gov'
|
735
|
+
url = "http://lamborn.house.gov/2015-press-releases/"
|
736
|
+
doc = open_html(url)
|
737
|
+
return if doc.nil?
|
738
|
+
links = (doc/:h3).map{|h| { "http://lamborn.house.gov"+h.children[1]['href'] => h.text.strip} }
|
739
|
+
links = links.first(limit) if limit
|
740
|
+
links.each do |link|
|
741
|
+
page = open_html(link.keys.first)
|
742
|
+
print_path = page.search("a").detect{|a| a['onclick'] && a['onclick'].include?('popup')}['onclick'].split("'")[1]
|
743
|
+
print_page = open_html("http://lamborn.house.gov"+print_path)
|
744
|
+
results << {:source => url, :url => link.keys.first, :title => link.values.first, :date => Date.parse(print_page.xpath('//*[@class="PopupNewsDetailsDate"]').text), :domain => domain }
|
745
|
+
end
|
746
|
+
results
|
747
|
+
end
|
748
|
+
|
749
|
+
def self.walden
|
750
|
+
results = []
|
751
|
+
domain = 'walden.house.gov'
|
752
|
+
url = "http://walden.house.gov/s2015/"
|
753
|
+
doc = open_html(url)
|
754
|
+
return if doc.nil?
|
755
|
+
doc.xpath('//*[@id="centerbox"]/div[1]/ul/li').each do |row|
|
756
|
+
results << {:source => url, :url => 'http://walden.house.gov' + row.children[3].children[1]['href'], :title => row.children[3].text.strip, :date => Date.parse(row.children[5].text), :domain => domain }
|
757
|
+
end
|
758
|
+
results
|
759
|
+
end
|
760
|
+
|
761
|
+
def self.bennie_thompson
|
762
|
+
results = []
|
763
|
+
domain = "benniethompson.house.gov"
|
764
|
+
url = "http://benniethompson.house.gov/index.php?option=com_content&view=category&id=41&Itemid=148"
|
765
|
+
doc = open_html(url)
|
766
|
+
return if doc.nil?
|
767
|
+
doc.xpath('//*[@id="adminForm"]/table/tbody/tr').each do |row|
|
768
|
+
results << {:source => url, :url => 'http://benniethompson.house.gov' + row.children[1].children[1]['href'], :title => row.children[1].children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
|
769
|
+
end
|
770
|
+
results
|
771
|
+
end
|
772
|
+
|
773
|
+
def self.speier
|
774
|
+
results = []
|
775
|
+
domain = "speier.house.gov"
|
776
|
+
url = "http://speier.house.gov/index.php?option=com_content&view=category&id=20&Itemid=14"
|
777
|
+
doc = open_html(url)
|
778
|
+
return if doc.nil?
|
779
|
+
doc.xpath('//*[@id="adminForm"]/table/tbody/tr').each do |row|
|
780
|
+
results << {:source => url, :url => 'http://speier.house.gov' + row.children[1].children[1]['href'], :title => row.children[1].children[1].text.strip, :date => Date.parse(row.children[3].text.strip), :domain => domain }
|
781
|
+
end
|
782
|
+
results
|
783
|
+
end
|
784
|
+
|
712
785
|
def self.backfill_bilirakis
|
713
786
|
results = []
|
714
787
|
domain = 'bilirakis.house.gov'
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.1
|
4
|
+
version: 1.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|