statement 0.3 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -8,6 +8,8 @@ InstalledFiles
8
8
  _yardoc
9
9
  coverage
10
10
  doc/
11
+ coverage
12
+ InstalledFiles
11
13
  lib/bundler/man
12
14
  pkg
13
15
  rdoc
data/README.md CHANGED
@@ -6,30 +6,40 @@ Statement parses RSS feeds and HTML pages containing press releases and other of
6
6
 
7
7
  Add this line to your application's Gemfile:
8
8
 
9
- gem 'statement'
9
+ ```ruby
10
+ gem 'statement'
11
+ ```
10
12
 
11
13
  And then execute:
12
14
 
13
- $ bundle
15
+ ```sh
16
+ $ bundle
17
+ ```
14
18
 
15
19
  Or install it yourself as:
16
20
 
17
- $ gem install statement
21
+ ```sh
22
+ $ gem install statement
23
+ ```
18
24
 
19
25
  ## Usage
20
26
 
21
- require 'rubygems'
22
- require 'statement'
23
-
24
- results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
25
- puts results.first
26
- {:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
27
+ ```ruby
28
+ require 'rubygems'
29
+ require 'statement'
27
30
 
31
+ results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
32
+ puts results.first
33
+ {:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
34
+ ```
35
+
28
36
  ## Tests
29
37
 
30
38
  Statement uses MiniTest, to run tests:
31
39
 
32
- rake test
40
+ ```sh
41
+ $ rake test
42
+ ```
33
43
 
34
44
  ## Contributing
35
45
 
@@ -42,4 +52,5 @@ Statement uses MiniTest, to run tests:
42
52
  ## Authors
43
53
 
44
54
  * Derek Willis
45
- * Jacob Harris
55
+ * Jacob Harris
56
+
data/lib/statement.rb CHANGED
@@ -34,6 +34,145 @@ module Statement
34
34
  end
35
35
  end
36
36
 
37
# Runs every hand-written scraper defined alongside this method and merges
# what they return.
#
# The years (2012/2013) and page numbers (1-3) requested are fixed here.
#
# Returns a single flat Array containing the elements of each scraper's result.
def self.from_scrapers
  # Arguments are positional. The earlier `roe(page=1)` spelling only assigned
  # a throwaway local called `page` in this method; it was never a keyword
  # argument, so passing the bare value is equivalent and less misleading.
  [
    freshman_senators, capuano, crenshaw(2013, 0), conaway, susandavis,
    faleomavaega, klobuchar, lujan,
    billnelson(2013), billnelson(2012),
    roe(1), roe(2), roe(3),
    thornberry(1), thornberry(2), thornberry(3)
  ].flatten
end
41
+
42
+ ## special cases for members without RSS feeds
43
+
44
# Scrapes the press release index at house.gov/capuano/news/date.shtml.
#
# Every <a> whose href contains '/pr' becomes one result. The link text is
# handed to Date.parse for :date (nil when parsing raises), and the part of
# the text after its first run of whitespace is used as :title.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain) with the
# last four matches removed.
# NOTE(review): presumably the trailing four links are not releases - confirm.
def self.capuano
  base_url = "http://www.house.gov/capuano/news/"
  list_url = base_url + 'date.shtml'
  page = Nokogiri::HTML(open(list_url).read)

  release_links = page.xpath("//a").select { |anchor| anchor['href'] && anchor['href'].include?('/pr') }
  releases = release_links.map do |anchor|
    released_on = begin
      Date.parse(anchor.text)
    rescue
      nil
    end
    { :source => list_url, :url => base_url + anchor['href'], :title => anchor.text.split(' ',2).last, :date => released_on, :domain => "www.house.gov/capuano/" }
  end

  releases[0..-5]
end
61
+
62
# Scrapes the press release table on crenshaw.house.gov.
#
# year  - value for the YearDisplay query parameter; the current year is used
#         when nil or false.
# month - value for the MonthDisplay query parameter; 0 is used when nil or
#         false.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain). The
# Array is empty when the page has no table rows beyond the first two.
# Raises whatever Date.parse raises if a row's first text cell is not a date.
def self.crenshaw(year, month)
  year ||= Date.today.year
  month ||= 0
  url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
  doc = Nokogiri::HTML(open(url).read)

  results = []
  # drop(2) skips the first two rows and, unlike [2..-1], yields an empty list
  # instead of nil when the page has fewer rows than expected.
  doc.xpath("//tr").drop(2).each do |row|
    date_text, title = row.children.map { |cell| cell.text.strip }.reject { |text| text.empty? }
    # Skip the column-heading row and rows that contain no text at all.
    next if date_text.nil? || date_text == 'Date'
    results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => Date.parse(date_text), :domain => "crenshaw.house.gov" }
  end
  results
end
76
+
77
# Scrapes one page of the press release listing on conaway.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# For every table row after the first, the second grandchild node supplies
# the link (href and title) and the fifth grandchild's text is parsed as the
# date.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain). The
# Array is empty when the page has no rows after the first.
def self.conaway(page=1)
  base_url = "http://conaway.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  results = []
  # drop(1) skips the first row; unlike [1..-1] it never returns nil when the
  # page contains no rows at all.
  doc.xpath("//tr").drop(1).each do |row|
    cells = row.children.children
    link = cells[1]
    results << { :source => page_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(cells[4].text), :domain => "conaway.house.gov" }
  end
  results
end
87
+
88
# Scrapes the news page at house.gov/susandavis/news.shtml.
#
# Looks only at the children of the seventh <ul> found on the page and
# ignores children whose text is blank. For each remaining child, the text of
# its first child is parsed as the date and its second child supplies the
# link (href, plus title with whitespace runs collapsed to single spaces).
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
# NOTE(review): a page with fewer than seven <ul> elements appears to fail
# with NoMethodError on nil - confirm whether that is acceptable.
def self.susandavis
  base_url = "http://www.house.gov/susandavis/"
  news_url = base_url+'news.shtml'
  page = Nokogiri::HTML(open(news_url).read)

  entries = page.search("ul")[6].children.reject { |entry| entry.text.strip == '' }
  entries.map do |entry|
    anchor = entry.children[1]
    { :source => news_url, :url => base_url + anchor['href'], :title => anchor.text.split.join(' '), :date => Date.parse(entry.children.first.text), :domain => "house.gov/susandavis" }
  end
end
98
+
99
# Scrapes the press listing at house.gov/faleomavaega/news-press.shtml.
#
# Each <li type="disc"> yields one result: its first child supplies the link
# and title, and its second child's text is parsed as the date. Non-ASCII
# characters are removed from the title (and replaced by spaces in the date
# text) before use; the title keeps only what follows the last
# 'Washington, D.C.' in the link text.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
def self.faleomavaega
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  doc = Nokogiri::HTML(open(base_url).read)
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/

  doc.xpath("//li[@type='disc']").map do |row|
    link = row.children[0]
    stamp = row.children[1]
    { :source => base_url, :url => "http://www.house.gov/" + link['href'], :title => link.text.gsub(non_ascii, '').split('Washington, D.C.').last, :date => Date.parse(stamp.text.gsub(non_ascii, ' ')), :domain => "house.gov/faleomavaega" }
  end
end
108
+
109
# Scrapes the press listing of each senator named in the list below; every
# site is queried at www.<name>.senate.gov/press.cfm.
#
# For each site, table rows after the first three are read; rows with blank
# text are skipped. Within a row, the first grandchild's text (non-ASCII
# characters removed) is parsed as the date and the second grandchild
# supplies the link and title.
#
# Returns one flat Array of Hashes (:source, :url, :title, :date, :domain)
# covering all senators. A site with three or fewer rows contributes nothing.
def self.freshman_senators
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/
  results = []
  ['baldwin', 'donnelly', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz','kaine'].each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    list_url = base_url + 'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = Nokogiri::HTML(open(list_url).read)
    # drop(3) never returns nil, so one short page cannot abort the whole run
    # the way [3..-1] did.
    doc.xpath("//tr").drop(3).each do |row|
      next if row.text.strip == ''
      cells = row.children.children
      link = cells[1]
      results << { :source => list_url, :url => base_url + link['href'], :title => link.text.strip.split.join(' '), :date => Date.parse(cells[0].text.gsub(non_ascii, '')), :domain => "#{senator}.senate.gov" }
    end
  end
  results
end
121
+
122
# Scrapes the 2012 and 2013 news release listings on klobuchar.senate.gov.
#
# Each <dt> on a year page yields one result: the <dt> text is parsed as the
# date, and the node that follows it supplies the link (first child's href)
# and the title (its text with non-ASCII characters removed and whitespace
# runs collapsed).
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain), 2012
# results first.
def self.klobuchar
  base_url = "http://www.klobuchar.senate.gov/"
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/
  results = []
  [2012,2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = Nokogiri::HTML(open(year_url).read)
    doc.xpath("//dt").each do |row|
      detail = row.next
      results << { :source => year_url, :url => base_url + detail.children[0]['href'], :title => detail.text.strip.gsub(non_ascii, '').split.join(' '), :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
    end
  end
  results
end
134
+
135
# Scrapes the article list on lujan.house.gov.
#
# Reads the children of the second <ul> on the page, skipping children with
# blank text. Each remaining child's first child supplies the link and title.
# Non-ASCII characters are removed from the title, and the resulting 'Lujn'
# is rewritten to 'Lujan'. No date is extracted, so :date is always nil.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
# NOTE(review): a page with fewer than two <ul> elements appears to fail with
# NoMethodError on nil - confirm whether that is acceptable.
def self.lujan
  base_url = 'http://lujan.house.gov/'
  list_url = base_url + 'index.php?option=com_content&view=article&id=981&Itemid=78'
  doc = Nokogiri::HTML(open(list_url).read)
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/

  results = []
  doc.xpath('//ul')[1].children.each do |row|
    next if row.text.strip == ''
    link = row.children[0]
    results << { :source => list_url, :url => base_url + link['href'], :title => link.text.gsub(non_ascii, '').gsub('Lujn','Lujan'), :date => nil, :domain => "lujan.house.gov" }
  end
  results
end
145
+
146
# Scrapes one year of the media listing on billnelson.senate.gov.
#
# year - value for the year query parameter (default 2013).
#
# Every <li> on the page yields one result: its first child supplies the link
# and title, and the text of its last child is parsed as the date.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
def self.billnelson(year=2013)
  base_url = "http://www.billnelson.senate.gov/news/"
  year_url = base_url + "media.cfm?year=#{year}"
  doc = Nokogiri::HTML(open(year_url).read)

  results = []
  doc.xpath('//li').each do |row|
    link = row.children[0]
    # :date is kept as a Date, as in every other scraper here. It used to be
    # converted with to_s, which made merged results mix Strings and Dates.
    results << { :source => year_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
  end
  results
end
156
+
157
# Scrapes one page of the press release listing on roe.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# Every <span class="middlecopy"> yields one result, read by child position:
# the second child's text is the title, the fifth child's text is parsed as
# the date, and the seventh child supplies the href. Non-ASCII characters are
# removed from the title and date text.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
def self.roe(page=1)
  base_url = "http://roe.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1532&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/

  doc.xpath("//span[@class='middlecopy']").map do |row|
    parts = row.children
    { :source => page_url, :url => base_url + parts[6]['href'], :title => parts[1].text.strip.gsub(non_ascii, ''), :date => Date.parse(parts[4].text.gsub(non_ascii, '').strip), :domain => "roe.house.gov" }
  end
end
166
+
167
# Scrapes one page of the press release listing on thornberry.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# Every <span class="middlecopy"> yields one result, read by child position:
# the second child's text is the title, the fifth child's text is parsed as
# the date, and the seventh child supplies the href. Non-ASCII characters are
# removed from the title and date text.
#
# Returns an Array of Hashes (:source, :url, :title, :date, :domain).
def self.thornberry(page=1)
  base_url = "http://thornberry.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1776&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)
  # Written as a negated ASCII range: /[\x80-\xff]/ is rejected as an invalid
  # multibyte escape in a UTF-8 source file and cannot match UTF-8 strings.
  non_ascii = /[^\x00-\x7F]/

  doc.xpath("//span[@class='middlecopy']").map do |row|
    parts = row.children
    { :source => page_url, :url => base_url + parts[6]['href'], :title => parts[1].text.strip.gsub(non_ascii, ''), :date => Date.parse(parts[4].text.gsub(non_ascii, '').strip), :domain => "thornberry.house.gov" }
  end
end
176
+
37
177
  end
38
-
39
178
  end
@@ -1,3 +1,3 @@
1
1
  module Statement
2
- VERSION = "0.3"
2
+ VERSION = "0.4"
3
3
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statement
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 3
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 3
9
- version: "0.3"
8
+ - 4
9
+ version: "0.4"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Derek Willis
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2013-04-22 00:00:00 Z
17
+ date: 2013-04-25 00:00:00 Z
18
18
  dependencies:
19
19
  - !ruby/object:Gem::Dependency
20
20
  name: bundler