statement 1.7.1 → 1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +113 -5
- data/lib/statement/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 57d601c1eeffddfe7479bc0a220b1bd8b0af5477
|
4
|
+
data.tar.gz: 9e117589cbc76f087ae42b0077575ace1055dd49
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 162ecc8fe267c4871a55bc594c4588dac40843558cc02ad301bf35d609f40f594fa6df6e08a74495942a0af362a200e5c4ede420b43d65442f5475414052930f
|
7
|
+
data.tar.gz: b1ae99e5b127c005358b20943d7b11f9742f83ced8242568137842c68f64a09379dca14a857e9768aa9eb20b62d37b91f67ecc2a420f8089b4fcc6b360b48e05
|
data/lib/statement/scraper.rb
CHANGED
@@ -29,7 +29,7 @@ module Statement
|
|
29
29
|
end
|
30
30
|
|
31
31
|
def self.member_methods
|
32
|
-
[:capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :lujan, :billnelson, :lautenberg, :crapo, :coburn, :boxer, :vitter, :donnelly, :inhofe, :reid, :palazzo, :roe, :document_query, :swalwell, :fischer]
|
32
|
+
[:capuano, :cold_fusion, :conaway, :chabot, :susandavis, :freshman_senators, :klobuchar, :lujan, :billnelson, :lautenberg, :crapo, :coburn, :boxer, :vitter, :donnelly, :inhofe, :reid, :palazzo, :roe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton, :wolf_sherman_mccaul, :welch, :sessions, :gabbard]
|
33
33
|
end
|
34
34
|
|
35
35
|
def self.committee_methods
|
@@ -40,7 +40,8 @@ module Statement
|
|
40
40
|
year = Date.today.year
|
41
41
|
results = [capuano, cold_fusion(year, 0), conaway, chabot, susandavis, klobuchar, lujan, palazzo(page=1), roe(page=1), billnelson(year=year),
|
42
42
|
document_query(page=1), document_query(page=2), swalwell(page=1), donnelly(year=year), crapo, coburn, boxer(start=1),
|
43
|
-
vitter(year=year), inhofe(year=year), reid, fischer
|
43
|
+
vitter(year=year), inhofe(year=year), reid, fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, wolf_sherman_mccaul, welch,
|
44
|
+
sessions(year=year), gabbard].flatten
|
44
45
|
results = results.compact
|
45
46
|
Utils.remove_generic_urls!(results)
|
46
47
|
end
|
@@ -48,8 +49,8 @@ module Statement
|
|
48
49
|
def self.backfill_from_scrapers
|
49
50
|
results = [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
|
50
51
|
document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
|
51
|
-
boxer(start=31), boxer(start=41), vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3)
|
52
|
-
].flatten
|
52
|
+
boxer(start=31), boxer(start=41), vitter(year=2012), vitter(year=2011), swalwell(page=2), swalwell(page=3), clark(year=2013), culberson_chabot_grisham(page=2),
|
53
|
+
wolf_sherman_mccaul(page=1), sessions(year=2013)].flatten
|
53
54
|
Utils.remove_generic_urls!(results)
|
54
55
|
end
|
55
56
|
|
@@ -493,7 +494,114 @@ module Statement
|
|
493
494
|
end
|
494
495
|
results
|
495
496
|
end
|
496
|
-
|
497
|
+
|
498
|
+
# Scrapes the press-release table on katherineclark.house.gov for one year.
#
# year - value substituted into the YearDisplay query parameter
#        (defaults to the current year).
#
# Returns an Array of Hashes with :source, :date, :title, :url and :domain
# keys (empty when the page has no usable rows), or nil when open_html
# yields no document.
def self.clark(year=Date.today.year)
  results = []
  domain = 'katherineclark.house.gov'
  url = "http://katherineclark.house.gov/index.cfm/press-releases?MonthDisplay=0&YearDisplay=#{year}"
  doc = open_html(url)
  return if doc.nil?
  # drop(1) skips the first row exactly as [1..-1] did, but yields an empty
  # list instead of nil when the page contains no <tr> elements at all, so
  # an empty listing no longer raises NoMethodError.
  (doc/:tr).drop(1).each do |row|
    cells = row.children
    # Rows whose first cell reads 'Date' carry column headings, not releases.
    next if cells.first.text == 'Date'
    results << { :source => url, :date => Date.parse(cells.first.text), :title => cells[2].children.text, :url => cells[2].children[0]['href'], :domain => domain }
  end
  results
end
|
510
|
+
|
511
|
+
# Scrapes the news-release table on www.sessions.senate.gov for one year.
#
# year - value substituted into the YearDisplay query parameter
#        (defaults to the current year).
#
# Returns an Array of Hashes with :source, :date, :title, :url and :domain
# keys (empty when the page has no usable rows), or nil when open_html
# yields no document.
def self.sessions(year=Date.today.year)
  results = []
  domain = 'sessions.senate.gov'
  url = "http://www.sessions.senate.gov/public/index.cfm/news-releases?YearDisplay=#{year}"
  doc = open_html(url)
  return if doc.nil?
  # drop(1) skips the first row exactly as [1..-1] did, but yields an empty
  # list instead of nil when the page contains no <tr> elements at all, so
  # an empty listing no longer raises NoMethodError.
  (doc/:tr).drop(1).each do |row|
    cells = row.children
    # Rows whose first cell reads 'Date' carry column headings, not releases.
    next if cells.first.text == 'Date'
    results << { :source => url, :date => Date.parse(cells.first.text), :title => cells[2].children.text, :url => cells[2].children[0]['href'], :domain => domain }
  end
  results
end
|
523
|
+
|
524
|
+
# Builds one entry per <tr> of the fifth <table> on the
# donnaedwards.house.gov category page.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html yields no document.
def self.edwards
  domain = 'donnaedwards.house.gov'
  url = "http://donnaedwards.house.gov/index.php?option=com_content&view=category&id=10&Itemid=18"
  doc = open_html(url)
  return if doc.nil?
  listing = (doc/:table)[4]
  (listing/:tr).map do |row|
    # Grandchildren of the row: index 1 is the link, index 3 holds the date.
    parts = row.children.children
    anchor = parts[1]
    { :source => url, :url => "http://donnaedwards.house.gov/"+anchor['href'], :title => anchor.text.strip, :date => Date.parse(parts[3].text.strip), :domain => domain }
  end
end
|
536
|
+
|
537
|
+
# Scrapes the documentquery.aspx news listing on culberson.house.gov,
# chabot.house.gov and lujangrisham.house.gov.
#
# page - value substituted into the Page query parameter (defaults to 1).
#
# Returns an Array of Hashes with :source, :title, :url, :date and :domain
# keys. A site whose page cannot be opened is skipped; the others are still
# collected.
def self.culberson_chabot_grisham(page=1)
  results = []
  # Each entry maps a host name to the DocumentTypeID used on that site.
  domains = [{'culberson.house.gov' => 2573}, {'chabot.house.gov' => 2508}, {'lujangrisham.house.gov' => 2447}]
  domains.each do |entry|
    domain, doc_type = entry.first
    base = "http://"+domain+"/news/"
    source = base+"documentquery.aspx?DocumentTypeID=#{doc_type}&Page=#{page}"
    doc = open_html(source)
    # `next`, not `return`: a bare return here would abandon the whole method
    # and throw away results already gathered from earlier sites.
    next if doc.nil?
    doc.css('ul.UnorderedNewsList li').each do |row|
      link = base + row.children[1]['href']
      title = row.children[1].text.strip
      date = Date.parse(row.children[3].text.strip)
      # :domain is the host string, matching the other scrapers, rather than
      # the {host => id} hash.
      results << { :source => source, :title => title, :url => link, :date => date, :domain => domain }
    end
  end
  results
end
|
552
|
+
|
553
|
+
# Builds one entry per <h3> on the joebarton.house.gov press-release page,
# leaving out the last two <h3> elements.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html yields no document.
def self.barton
  domain = 'joebarton.house.gov'
  url = "http://joebarton.house.gov/press-releasescolumns/"
  doc = open_html(url)
  return if doc.nil?
  headings = (doc/:h3)[0..-3]
  headings.map do |heading|
    anchor = heading.children[1]
    # The date text is read from the second sibling after the heading.
    { :source => url, :url => "http://joebarton.house.gov/"+anchor['href'], :title => anchor.text.strip, :date => Date.parse(heading.next.next.text), :domain => domain }
  end
end
|
564
|
+
|
565
|
+
# Scrapes /media-center/press-releases on wolf.house.gov, sherman.house.gov
# and mccaul.house.gov.
#
# page - value substituted into the page query parameter (defaults to 0).
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys. A site whose page cannot be opened is skipped; the others are still
# collected.
def self.wolf_sherman_mccaul(page=0)
  results = []
  domains = ['wolf.house.gov', 'sherman.house.gov', 'mccaul.house.gov']
  domains.each do |domain|
    url = "http://#{domain}/media-center/press-releases?page=#{page}"
    doc = open_html(url)
    # `next`, not `return`: a bare return here would abandon the whole method
    # and throw away results already gathered from earlier sites.
    next if doc.nil?
    # select replaces map { ... }.compact!: compact! returns nil when there is
    # nothing to remove, i.e. exactly when every span holds a date.
    # NOTE(review): the "201" substring test only recognises 2010-2019 dates.
    dates = doc.xpath('//span[@class="field-content"]').map(&:text).select { |text| text.strip.include?("201") }
    # Headings and date spans are paired by position; only the first ten
    # headings are read.
    (doc/:h3).first(10).each_with_index do |row, i|
      date = Date.parse(dates[i])
      anchor = row.children.first
      results << { :source => url, :url => "http://"+domain+anchor['href'], :title => anchor.text.strip, :date => date, :domain => domain }
    end
  end
  results
end
|
580
|
+
|
581
|
+
# Builds one entry per <h3> on the www.welch.house.gov press-release page.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html yields no document.
def self.welch
  domain = 'welch.house.gov'
  url = "http://www.welch.house.gov/press-releases/"
  doc = open_html(url)
  return if doc.nil?
  (doc/:h3).each_with_object([]) do |heading, releases|
    anchor = heading.children[1]
    # The date text is read from the second sibling after the heading.
    releases << { :source => url, :url => "http://www.welch.house.gov/"+anchor['href'], :title => anchor.text.strip, :date => Date.parse(heading.next.next.text), :domain => domain }
  end
end
|
592
|
+
|
593
|
+
# Builds one entry per `ul.fc_leading li` item on the gabbard.house.gov
# press-release page.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html yields no document.
def self.gabbard
  domain = 'gabbard.house.gov'
  url = "http://gabbard.house.gov/index.php/news/press-releases"
  doc = open_html(url)
  return if doc.nil?
  doc.css('ul.fc_leading li').map do |item|
    # The link is the second child of the item's first child; the date text
    # comes from the item's third child.
    anchor = item.children[0].children[1]
    { :source => url, :url => "http://gabbard.house.gov"+anchor['href'], :title => anchor.text.strip, :date => Date.parse(item.children[2].text), :domain => domain }
  end
end
|
604
|
+
|
497
605
|
def self.document_query(page=1)
|
498
606
|
results = []
|
499
607
|
domains = [{"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.7.1
|
4
|
+
version: '1.8'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Derek Willis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -198,7 +198,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
198
198
|
version: '0'
|
199
199
|
requirements: []
|
200
200
|
rubyforge_project:
|
201
|
-
rubygems_version: 2.2.
|
201
|
+
rubygems_version: 2.2.0
|
202
202
|
signing_key:
|
203
203
|
specification_version: 4
|
204
204
|
summary: Given a url, Statement returns links to press releases and official statements.
|