statement 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +71 -11
- data/lib/statement/version.rb +1 -1
- data/spec/bill_nelson_press-coldfusion.html +593 -0
- data/spec/bill_nelson_press.html +453 -572
- data/spec/rand_paul_press.html +5492 -0
- data/spec/statement_spec.rb +31 -3
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c771519d9fc8c1d3b906d0b09695205a92f893f2
|
4
|
+
data.tar.gz: 737c1f6530c5d7d9424be1f5629df0042c6abc8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1c53d467e8b878ff6995b6aacbc1618904a263bf55fe2d50e702334afbfbd0d323979d26be9e1d39abf83c2e21dc9258dcce98330cfc8a962620714bebd5707
|
7
|
+
data.tar.gz: e2ef4d6db0d2276a089c8e4bace60d859bc60d584177c4ec862bc55ddf2973ea3aa1b10aecf19769b4e86a5cc6143ae7206eef7688fe81c0a4f3ea57a1505827
|
data/lib/statement/scraper.rb
CHANGED
@@ -32,7 +32,7 @@ module Statement
|
|
32
32
|
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :freshman_senators, :klobuchar, :billnelson, :crapo, :boxer,
|
33
33
|
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
34
34
|
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :lamborn, :walden, :boehner,
|
35
|
-
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin_burr]
|
35
|
+
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin_burr, :rand_paul]
|
36
36
|
end
|
37
37
|
|
38
38
|
def self.committee_methods
|
@@ -42,10 +42,11 @@ module Statement
|
|
42
42
|
def self.member_scrapers
|
43
43
|
year = Date.today.year
|
44
44
|
results = [crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(page=0),
|
45
|
-
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0),
|
45
|
+
document_query(page=1), document_query(page=2), swalwell(page=1), crapo, boxer, grassley(page=0),
|
46
46
|
vitter(year=year), inhofe(year=year), fischer, clark(year=year), edwards, culberson_chabot_grisham(page=1), barton, welch,
|
47
47
|
sessions(year=year), gabbard, costa, farr, olson, schumer, lamborn(limit=10), walden, bennie_thompson, speier,
|
48
|
-
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins, durbin_burr(page=1)
|
48
|
+
poe(year=year, month=0), bennet(page=1), shaheen(page=1), perlmutter, keating, drupal, jenkins, durbin_burr(page=1),
|
49
|
+
rand_paul(page = 1)].flatten
|
49
50
|
results = results.compact
|
50
51
|
Utils.remove_generic_urls!(results)
|
51
52
|
end
|
@@ -416,6 +417,51 @@ module Statement
|
|
416
417
|
results
|
417
418
|
end
|
418
419
|
|
420
|
+
def self.rand_paul(page = 1)
|
421
|
+
# each page contains a max of 20 results
|
422
|
+
page_url = "http://www.paul.senate.gov/news/press?PageNum_rs=#{page}"
|
423
|
+
doc = open_html(page_url)
|
424
|
+
return if doc.nil?
|
425
|
+
results = doc.search('#press .title').inject([]) do |arr, title|
|
426
|
+
article_url = URI.join(page_url, title.search('a')[0]['href'])
|
427
|
+
article_datestr = title.previous_element.text # e.g. "05.11.15"
|
428
|
+
arr << {
|
429
|
+
:source => page_url,
|
430
|
+
:url => article_url.to_s,
|
431
|
+
:domain => article_url.host,
|
432
|
+
:title => title.text,
|
433
|
+
:date => Date.strptime(article_datestr, '%m.%d.%y')
|
434
|
+
}
|
435
|
+
end
|
436
|
+
results
|
437
|
+
end
|
438
|
+
|
439
|
+
|
440
|
+
def self.patrick_meehan(page = 0)
|
441
|
+
# This is a Drupal page and it uses the Views module, but unlike the other
|
442
|
+
# Drupal pages, it does not make use of .views-field-created, and instead, the
|
443
|
+
# only date given is in an abbreviated form (e.g. 03 Feb).
|
444
|
+
page_url = "https://meehan.house.gov/media-center/press-releases?page=#{page}"
|
445
|
+
doc = open_html(page_url)
|
446
|
+
return if doc.nil?
|
447
|
+
results = doc.search('.view-congress-press-releases .views-row').inject([]) do |arr, article|
|
448
|
+
title = article.search('.views-field-title a')[0]
|
449
|
+
article_url = URI.join(page_url, title['href'])
|
450
|
+
raise "Date still needs to be parsed; thanks a lot Drupal"
|
451
|
+
article_datestr = title.previous_element.text
|
452
|
+
arr << {
|
453
|
+
:source => page_url,
|
454
|
+
:url => article_url.to_s,
|
455
|
+
:domain => article_url.host,
|
456
|
+
:title => title.text,
|
457
|
+
:date => Date.strptime(article_datestr, 'SOMETHING')
|
458
|
+
}
|
459
|
+
end
|
460
|
+
|
461
|
+
results
|
462
|
+
end
|
463
|
+
|
464
|
+
|
419
465
|
# fetches the latest 1000 releases, can be altered
|
420
466
|
def self.lautenberg(rows=1000)
|
421
467
|
results = []
|
@@ -811,24 +857,36 @@ module Statement
|
|
811
857
|
results
|
812
858
|
end
|
813
859
|
|
814
|
-
def self.backfill_bilirakis
|
860
|
+
def self.backfill_bilirakis(page=1)
|
815
861
|
results = []
|
816
862
|
domain = 'bilirakis.house.gov'
|
817
|
-
url =
|
863
|
+
url = "https://bilirakis.house.gov/press-releases?page=#{page}"
|
818
864
|
doc = open_html(url)
|
819
865
|
return if doc.nil?
|
820
|
-
doc.css("
|
821
|
-
|
866
|
+
doc.css("#region-content .views-row").each do |row|
|
867
|
+
title_anchor = row.css("h3 a")
|
868
|
+
title = title_anchor.text
|
869
|
+
release_url = "http://#{domain + title_anchor.attr('href')}"
|
870
|
+
raw_date = row.css(".views-field-created").text
|
871
|
+
results << { :source => url,
|
872
|
+
:url => release_url,
|
873
|
+
:title => title,
|
874
|
+
:date => begin Date.parse(raw_date) rescue nil end,
|
875
|
+
:domain => domain }
|
822
876
|
end
|
877
|
+
results
|
823
878
|
end
|
824
879
|
|
825
|
-
def self.backfill_boustany
|
880
|
+
def self.backfill_boustany(congress)
|
826
881
|
results = []
|
827
882
|
domain = 'boustany.house.gov'
|
828
|
-
url =
|
883
|
+
url = "http://boustany.house.gov/#{congress}th-congress/showallitems/"
|
829
884
|
doc = open_html(url)
|
830
885
|
return if doc.nil?
|
831
|
-
|
886
|
+
(doc/:ul)[13].search(:li).each do |row|
|
887
|
+
results << {:source => url, :url => 'http://boustany.house.gov' + row.children.search(:a)[0]['href'], :title => row.children.search(:a)[0].text, :date => Date.parse(row.children[5].text), :domain => domain }
|
888
|
+
end
|
889
|
+
results
|
832
890
|
end
|
833
891
|
|
834
892
|
def self.perlmutter
|
@@ -875,7 +933,9 @@ module Statement
|
|
875
933
|
"https://bilirakis.house.gov/press-releases",
|
876
934
|
"http://quigley.house.gov/media-center/press-releases",
|
877
935
|
"https://denham.house.gov/media-center/press-releases",
|
878
|
-
"https://sewell.house.gov/media-center/press-releases"
|
936
|
+
"https://sewell.house.gov/media-center/press-releases",
|
937
|
+
"https://buchanan.house.gov/media-center/press-releases",
|
938
|
+
"https://meehan.house.gov/media-center/press-releases"
|
879
939
|
]
|
880
940
|
end
|
881
941
|
|
data/lib/statement/version.rb
CHANGED