RubyGems - statement - Versions diffs - 0.5 → 0.6 - Mend

statement 0.5 → 0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

data/README.md +9 -3
data/lib/statement/version.rb +1 -1
data/lib/statement.rb +136 -22
data/spec/bill_nelson_press.html +593 -0
data/spec/cowan_press.html +1840 -0
data/spec/statement_spec.rb +26 -0
data/spec/vitter_press.html +2461 -0
metadata +18 -12

data/README.md CHANGED

@@ -1,6 +1,10 @@
 # Statement
-Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages.
+Statement parses RSS feeds and HTML pages containing press releases and other official statements from members of Congress, and produces hashes with information about those pages. It has been tested under Ruby 1.9.2 and 1.9.3.
+## Coverage
+Statement currently parses press releases for members of the House and Senate. For members with RSS feeds, you can pass the feed URL into Statement. For members without RSS feeds, HTML scrapers are provided, as are methods for speciality groups, such as House Republicans. Suggestions are welcomed.
 ## Installation
@@ -28,9 +32,9 @@ $ gem install statement
 require 'rubygems'
 require 'statement'
-results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
+results = Statement::Link.from_rss('http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1')
 puts results.first
-{:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
+{:source=>"http://blumenauer.house.gov/index.php?option=com_bca-rss-syndicator&feed_id=1", :url=>"http://blumenauer.house.gov/index.php?option=com_content&amp;view=article&amp;id=2203:blumenauer-qwe-need-a-national-system-that-speaks-to-the-transportation-challenges-of-todayq&amp;catid=66:2013-press-releases", :title=>"Blumenauer: &quot;We need a national system that speaks to the transportation challenges of ...", :date=>#<Date: 2013-04-24 ((2456407j,0s,0n),+0s,2299161j)>, :domain=>"blumenauer.house.gov"}
 ```
 ## Tests
@@ -49,6 +53,8 @@ $ rake test
 4. Push to the branch (`git push origin my-new-feature`)
 5. Create new Pull Request
+If you write a new scraper, please use Nokogiri for parsing - see some of the existing examples for guidance. The ``domain`` attribute represents the URI base domain of the source site.
 ## Authors
 * Derek Willis

data/lib/statement/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Statement
-  VERSION = "0.5"
+  VERSION = "0.6"
 end

data/lib/statement.rb CHANGED

@@ -36,8 +36,17 @@ module Statement
     end
     def self.from_scrapers
-      [freshman_senators, capuano, crenshaw(2013, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, billnelson(year=2013),
-        billnelson(year=2012), roe(page=1), roe(page=2), roe(page=3), thornberry(page=1), thornberry(page=2), thornberry(page=3)].flatten
+      year = Date.today.year
+      [freshman_senators, capuano, cold_fusion(year, 0), conaway, susandavis, faleomavaega, klobuchar, lujan, billnelson(year=year),
+        document_query(page=1), document_query(page=2), lautenberg, crapo, coburn, boxer(start=1), mccain(year=year),
+        vitter_cowan(year=year), inhofe(year=year), reid].flatten
+    end
+    def self.backfill_from_scrapers
+      [cold_fusion(2012, 0), cold_fusion(2011, 0), cold_fusion(2010, 0), billnelson(year=2012), document_query(page=3),
+        document_query(page=4), coburn(year=2012), coburn(year=2011), coburn(year=2010), boxer(start=11), boxer(start=21),
+        boxer(start=31), boxer(start=41), mccain(year=2012), mccain(year=2011), vitter_cowan(year=2012), vitter_cowan(year=2011),
+        ].flatten
     end
     ## special cases for members without RSS feeds
@@ -60,19 +69,28 @@ module Statement
       return results[0..-5]
     end
-    def self.crenshaw(year, month)
+    def self.cold_fusion(year, month)
       results = []
       year = Date.today.year if not year
       month = 0 if not month
-      url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
-      doc = Nokogiri::HTML(open(url).read)
-      doc.xpath("//tr")[2..-1].each do |row|
-        date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
-        next if date_text == 'Date'
-        date = Date.parse(date_text)
-        results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => "crenshaw.house.gov" }
+      domains = ['crenshaw.house.gov/', 'www.ronjohnson.senate.gov/public/','www.lee.senate.gov/public/','www.hoeven.senate.gov/public/','www.moran.senate.gov/public/','www.risch.senate.gov/public/']
+      domains.each do |domain|
+        if domain == 'crenshaw.house.gov/' or domain == 'www.risch.senate.gov/public/'
+          url = "http://"+domain + "index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
+        elsif domain == 'www.hoeven.senate.gov/public/' or domain == 'www.moran.senate.gov/public/'
+          url = "http://"+domain + "index.cfm/news-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
+        else
+          url = "http://"+domain + "index.cfm/press-releases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
+        end
+        doc = Nokogiri::HTML(open(url).read)
+        doc.xpath("//tr")[2..-1].each do |row|
+          date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
+          next if date_text == 'Date' or date_text.size > 8
+          date = Date.parse(date_text)
+          results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => domain }
+        end
       end
-      results
+      results.flatten
     end
     def self.conaway(page=1)
@@ -150,30 +168,126 @@ module Statement
       year_url = base_url + "media.cfm?year=#{year}"
       doc = Nokogiri::HTML(open(year_url).read)
       doc.xpath('//li').each do |row|
-        results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text).to_s, :domain => "billnelson.senate.gov" }
+        results << { :source => year_url, :url => base_url + row.children[0]['href'], :title => row.children[0].text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
+      end
+      results
+    end
+    # fetches the latest 1000 releases, can be altered
+    def self.lautenberg(rows=1000)
+      results = []
+      base_url = 'http://www.lautenberg.senate.gov/newsroom/'
+      url = base_url + "releases.cfm?maxrows=#{rows}&startrow=1&&type=1"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//tr")[4..-2].each do |row|
+        results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "lautenberg.senate.gov" }
+      end
+      results
+    end
+    def self.crapo
+      results = []
+      base_url = "http://www.crapo.senate.gov/media/newsreleases/"
+      url = base_url + "release_all.cfm"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//tr").each do |row|
+        results << { :source => url, :url => base_url + row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip.gsub('-','/')), :domain => "crapo.senate.gov" }
+      end
+      results
+    end
+    def self.coburn(year=Date.today.year)
+      results = []
+      url = "http://www.coburn.senate.gov/public/index.cfm?p=PressReleases&ContentType_id=d741b7a7-7863-4223-9904-8cb9378aa03a&Group_id=7a55cb96-4639-4dac-8c0c-99a4a227bd3a&MonthDisplay=0&YearDisplay=#{year}"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//tr")[2..-1].each do |row|
+        next if row.text[0..3] == "Date"
+        results << { :source => url, :url => row.children[2].children[0]['href'], :title => row.children[2].text.strip, :date => Date.parse(row.children[0].text.strip), :domain => "coburn.senate.gov" }
+      end
+      results
+    end
+    def self.boxer(start=1)
+      results = []
+      url = "http://www.boxer.senate.gov/en/press/releases.cfm?start=#{start}"
+      domain = 'www.boxer.senate.gov'
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//div[@class='left']")[1..-1].each do |row|
+        results << { :source => url, :url => domain + row.next.next.children[1].children[0]['href'], :title => row.next.next.children[1].children[0].text, :date => Date.parse(row.text.strip), :domain => domain}
+      end
+      results
+    end
+    def self.mccain(year=Date.today.year)
+      results = []
+      url = "http://www.mccain.senate.gov/public/index.cfm?FuseAction=PressOffice.PressReleases&ContentRecordType_id=75e7e4a0-6088-44b6-8061-089d80513dc4&Region_id=&Issue_id=&MonthDisplay=0&YearDisplay=#{year}"
+      domain = 'www.mccain.senate.gov'
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//li")[7..-1].each do |row|
+        results << { :source => url, :url => domain + row.children[3].children[1].children[4].children[0]['href'], :title => row.children[3].children[1].children[4].text, :date => Date.parse(row.children[3].children[1].children[0].text), :domain => domain}
+      end
+      results
+    end
+    def self.vitter_cowan(year=Date.today.year)
+      results = []
+      urls = ["http://www.vitter.senate.gov/newsroom/", "http://www.cowan.senate.gov/"]
+      urls.each do |url|
+        next if year < 2013 and url == "http://www.cowan.senate.gov/"
+        domain = url == "http://www.vitter.senate.gov/newsroom/" ? "www.vitter.senate.gov" : "www.cowan.senate.gov"
+        doc = Nokogiri::HTML(open(url+"press?year=#{year}").read)
+        doc.xpath("//tr")[1..-1].each do |row|
+          next if row.text.strip.size < 30
+          results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
+        end
+      end
+      results.flatten
+    end
+    def self.inhofe(year=Date.today.year)
+      results = []
+      url = "http://www.inhofe.senate.gov/newsroom/press-releases?year=#{year}"
+      domain = "www.inhofe.senate.gov"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//tr")[1..-1].each do |row|
+        results << { :source => url, :url => row.children[2].children[0]['href'].strip, :title => row.children[2].text, :date => Date.parse(row.children[0].text), :domain => domain}
       end
       results
     end
-    def self.roe(page=1)
+    def self.levin(page=1)
       results = []
-      base_url = "http://roe.house.gov/news/"
-      doc = Nokogiri::HTML(open(base_url+"documentquery.aspx?DocumentTypeID=1532&Page=#{page}").read)
-      doc.xpath("//span[@class='middlecopy']").each do |row|
-        results << { :source => base_url+"documentquery.aspx?DocumentTypeID=1532&Page=#{page}", :url => base_url + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => "roe.house.gov" }
+      url = "http://www.levin.senate.gov/newsroom/index.cfm?PageNum_rs=#{page}&section=press"
+      domain = "www.levin.senate.gov"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath('//tr').each do |row|
+        results << { :source => url, :url => row.children[2].children[0]['href'].gsub(/\s+/, ""), :title => row.children[2].children[0].text, :date => Date.parse(row.children[0].text), :domain => domain}
       end
       results
     end
-    def self.thornberry(page=1)
+    def self.reid
       results = []
-      base_url = "http://thornberry.house.gov/news/"
-      doc = Nokogiri::HTML(open(base_url+"documentquery.aspx?DocumentTypeID=1776&Page=#{page}").read)
-      doc.xpath("//span[@class='middlecopy']").each do |row|
-        results << { :source => base_url+"documentquery.aspx?DocumentTypeID=1776&Page=#{page}", :url => base_url + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => "thornberry.house.gov" }
+      url = "http://www.reid.senate.gov/newsroom/press_releases.cfm"
+      domain = "www.reid.senate.gov"
+      doc = Nokogiri::HTML(open(url).read)
+      doc.xpath("//table[@id='CS_PgIndex_21891_21893']//tr")[1..-1].each do |row|
+        results << { :source => url, :url => "http://www.reid.senate.gov"+row.children[0].children[0]['href'], :title => row.children[0].children[0].text, :date => Date.parse(row.children[0].children[2].text), :domain => domain}
       end
       results
     end
+    def self.document_query(page=1)
+      results = []
+      domains = [{"roe.house.gov" => 1532}, {"thornberry.house.gov" => 1776}, {"wenstrup.house.gov" => 2491}]
+      domains.each do |domain|
+        doc = Nokogiri::HTML(open("http://"+domain.keys.first+"/news/documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}").read)
+        doc.xpath("//span[@class='middlecopy']").each do |row|
+          results << { :source => "http://"+domain.keys.first+"/news/"+"documentquery.aspx?DocumentTypeID=#{domain.values.first}&Page=#{page}", :url => "http://"+domain.keys.first+"/news/" + row.children[6]['href'], :title => row.children[1].text.strip, :date => Date.parse(row.children[4].text.strip), :domain => domain.keys.first }
+        end
+      end
+      results.flatten
+    end
   end
 end