statement 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/statement/scraper.rb +71 -11
- data/lib/statement/version.rb +1 -1
- data/spec/bill_nelson_press-coldfusion.html +593 -0
- data/spec/bill_nelson_press.html +453 -572
- data/spec/rand_paul_press.html +5492 -0
- data/spec/statement_spec.rb +31 -3
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c771519d9fc8c1d3b906d0b09695205a92f893f2
|
4
|
+
data.tar.gz: 737c1f6530c5d7d9424be1f5629df0042c6abc8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1c53d467e8b878ff6995b6aacbc1618904a263bf55fe2d50e702334afbfbd0d323979d26be9e1d39abf83c2e21dc9258dcce98330cfc8a962620714bebd5707
|
7
|
+
data.tar.gz: e2ef4d6db0d2276a089c8e4bace60d859bc60d584177c4ec862bc55ddf2973ea3aa1b10aecf19769b4e86a5cc6143ae7206eef7688fe81c0a4f3ea57a1505827
|
data/lib/statement/scraper.rb
CHANGED
@@ -32,7 +32,7 @@ module Statement
|
|
32
32
|
[:crenshaw, :capuano, :cold_fusion, :conaway, :chabot, :freshman_senators, :klobuchar, :billnelson, :crapo, :boxer,
|
33
33
|
:vitter, :inhofe, :document_query, :swalwell, :fischer, :clark, :edwards, :culberson_chabot_grisham, :barton,
|
34
34
|
:welch, :sessions, :gabbard, :costa, :farr, :mcclintock, :olson, :schumer, :lamborn, :walden, :boehner,
|
35
|
-
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin_burr]
|
35
|
+
:bennie_thompson, :speier, :poe, :grassley, :bennet, :shaheen, :keating, :drupal, :jenkins, :durbin_burr, :rand_paul]
|
36
36
|
end
|
37
37
|
|
38
38
|
def self.committee_methods
|
@@ -42,10 +42,11 @@ module Statement
|
|
42
42
|
# Runs every individual member scraper listed below and returns the combined
# results after post-processing.
#
# Year-based scrapers receive the current calendar year; paged scrapers are
# asked for the page numbers hard-coded here. All arguments are positional.
#
# Returns whatever Utils.remove_generic_urls! returns for the flattened,
# nil-free list of scraped results.
def self.member_scrapers
  year = Date.today.year
  scraped = [
    crenshaw, capuano, cold_fusion(year, nil), conaway, chabot, klobuchar(year), billnelson(0),
    document_query(1), document_query(2), swalwell(1), crapo, boxer, grassley(0),
    vitter(year), inhofe(year), fischer, clark(year), edwards, culberson_chabot_grisham(1), barton, welch,
    sessions(year), gabbard, costa, farr, olson, schumer, lamborn(10), walden, bennie_thompson, speier,
    poe(year, 0), bennet(1), shaheen(1), perlmutter, keating, drupal, jenkins, durbin_burr(1),
    rand_paul(1)
  ]
  # Each scraper yields an array of results (or nil), so flatten one level of
  # nesting and drop the nils before the generic-URL cleanup.
  Utils.remove_generic_urls!(scraped.flatten.compact)
end
|
@@ -416,6 +417,51 @@ module Statement
|
|
416
417
|
results
|
417
418
|
end
|
418
419
|
|
420
|
+
# Scrapes one page of the press release listing on www.paul.senate.gov.
#
# page - page number interpolated into the PageNum_rs query parameter
#        (defaults to 1).
#
# Returns an Array of Hashes with :source, :url, :domain, :title and :date
# keys, or nil when open_html returns no document. :date is nil when the
# element preceding the title is missing or does not match '%m.%d.%y'.
def self.rand_paul(page = 1)
  # each page contains a max of 20 results
  page_url = "http://www.paul.senate.gov/news/press?PageNum_rs=#{page}"
  doc = open_html(page_url)
  return if doc.nil?
  doc.search('#press .title').each_with_object([]) do |title, results|
    link = title.search('a').first
    # A title without a usable link cannot yield a release URL; skip it
    # rather than aborting the whole page with a NoMethodError.
    next if link.nil? || link['href'].nil?
    article_url = URI.join(page_url, link['href'])
    date_node = title.previous_element
    article_date = begin
      # e.g. "05.11.15"
      Date.strptime(date_node.text.strip, '%m.%d.%y') if date_node
    rescue ArgumentError
      # Keep the release even when its date cannot be parsed.
      nil
    end
    results << {
      :source => page_url,
      :url => article_url.to_s,
      :domain => article_url.host,
      :title => title.text,
      :date => article_date
    }
  end
end
|
438
|
+
|
439
|
+
|
440
|
+
# Unfinished scraper for the press release listing on meehan.house.gov.
#
# page - page number interpolated into the ?page= query parameter
#        (defaults to 0).
#
# Returns nil when open_html returns no document. Otherwise the block below
# raises a RuntimeError on the first matching row, because date parsing has
# not been implemented yet; only a listing with no matching rows returns
# (an empty Array).
def self.patrick_meehan(page = 0)
  # This is a Drupal page and it uses the Views plugin, but unlike the other
  # Drupal pages, it does not make use of .views-field-created; instead, only
  # a partial date is given (e.g. "03 Feb").
  # NOTE(review): the earlier wording called this "Month-Year", but the sample
  # looks like day and month - confirm against the live page.
  page_url = "https://meehan.house.gov/media-center/press-releases?page=#{page}"
  doc = open_html(page_url)
  return if doc.nil?
  results = doc.search('.view-congress-press-releases .views-row').inject([]) do |arr, article|
    title = article.search('.views-field-title a')[0]
    article_url = URI.join(page_url, title['href'])
    # Placeholder: stops at the first row, so the rest of this block is
    # unreachable until the date handling is written.
    raise "Date still needs to be parsed; thanks a lot Drupal"
    # NOTE(review): title is the anchor here, so previous_element may not be
    # the date node - verify when implementing.
    article_datestr = title.previous_element.text
    arr << {
      :source => page_url,
      :url => article_url.to_s,
      :domain => article_url.host,
      :title => title.text,
      # 'SOMETHING' is a stand-in, not a real strptime format.
      :date => Date.strptime(article_datestr, 'SOMETHING')
    }
  end

  results
end
|
463
|
+
|
464
|
+
|
419
465
|
# fetches the latest 1000 releases, can be altered
|
420
466
|
def self.lautenberg(rows=1000)
|
421
467
|
results = []
|
@@ -811,24 +857,36 @@ module Statement
|
|
811
857
|
results
|
812
858
|
end
|
813
859
|
|
814
|
-
# Scrapes one page of the press release listing on bilirakis.house.gov.
#
# page - page number interpolated into the ?page= query parameter
#        (defaults to 1).
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html returns no document. :date is nil when the
# .views-field-created text cannot be parsed as a date.
def self.backfill_bilirakis(page=1)
  domain = 'bilirakis.house.gov'
  url = "https://bilirakis.house.gov/press-releases?page=#{page}"
  doc = open_html(url)
  return if doc.nil?
  doc.css("#region-content .views-row").each_with_object([]) do |row, results|
    title_anchor = row.css("h3 a")
    first_anchor = title_anchor.first
    href = first_anchor && first_anchor['href']
    # A row without a linked title has no release URL; skip it instead of
    # failing the whole page with a TypeError.
    next if href.nil?
    release_url = begin
      # Resolving against the http:// root gives the same string as the old
      # concatenation for root-relative hrefs, and also copes with absolute
      # or non-root-relative hrefs.
      URI.join("http://#{domain}", href).to_s
    rescue URI::InvalidURIError
      # Fall back to plain concatenation for hrefs URI cannot parse.
      "http://#{domain}#{href}"
    end
    raw_date = row.css(".views-field-created").text
    release_date = begin
      Date.parse(raw_date)
    rescue ArgumentError
      nil
    end
    results << { :source => url,
                 :url => release_url,
                 :title => title_anchor.text,
                 :date => release_date,
                 :domain => domain }
  end
end
|
824
879
|
|
825
|
-
# Scrapes the "show all items" listing on boustany.house.gov for one Congress.
#
# congress - value interpolated into the URL ahead of "th-congress".
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain
# keys, or nil when open_html returns no document.
def self.backfill_boustany(congress)
  domain = 'boustany.house.gov'
  url = "http://boustany.house.gov/#{congress}th-congress/showallitems/"
  doc = open_html(url)
  return if doc.nil?
  # The items are read from the <ul> at index 13 of the page.
  items = doc.search(:ul)[13].search(:li)
  items.map do |row|
    link = row.children.search(:a)[0]
    {
      :source => url,
      :url => 'http://boustany.house.gov' + link['href'],
      :title => link.text,
      # The date text is taken from the row's child node at index 5.
      :date => Date.parse(row.children[5].text),
      :domain => domain
    }
  end
end
|
833
891
|
|
834
892
|
def self.perlmutter
|
@@ -875,7 +933,9 @@ module Statement
|
|
875
933
|
"https://bilirakis.house.gov/press-releases",
|
876
934
|
"http://quigley.house.gov/media-center/press-releases",
|
877
935
|
"https://denham.house.gov/media-center/press-releases",
|
878
|
-
"https://sewell.house.gov/media-center/press-releases"
|
936
|
+
"https://sewell.house.gov/media-center/press-releases",
|
937
|
+
"https://buchanan.house.gov/media-center/press-releases",
|
938
|
+
"https://meehan.house.gov/media-center/press-releases"
|
879
939
|
]
|
880
940
|
end
|
881
941
|
|
data/lib/statement/version.rb
CHANGED