statement 0.3 → 0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/README.md +22 -11
- data/lib/statement.rb +140 -1
- data/lib/statement/version.rb +1 -1
- metadata +4 -4
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -6,30 +6,40 @@ Statement parses RSS feeds and HTML pages containing press releases and other of
|
|
6
6
|
|
7
7
|
Add this line to your application's Gemfile:
|
8
8
|
|
9
|
-
|
9
|
+
```ruby
|
10
|
+
gem 'statement'
|
11
|
+
```
|
10
12
|
|
11
13
|
And then execute:
|
12
14
|
|
13
|
-
|
15
|
+
```sh
|
16
|
+
$ bundle
|
17
|
+
```
|
14
18
|
|
15
19
|
Or install it yourself as:
|
16
20
|
|
17
|
-
|
21
|
+
```sh
|
22
|
+
$ gem install statement
|
23
|
+
```
|
18
24
|
|
19
25
|
## Usage
|
20
26
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
|
25
|
-
puts results.first
|
26
|
-
{:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
|
27
|
+
```ruby
|
28
|
+
require 'rubygems'
|
29
|
+
require 'statement'
|
27
30
|
|
31
|
+
results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
|
32
|
+
puts results.first
|
33
|
+
{:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
|
34
|
+
```
|
35
|
+
|
28
36
|
## Tests
|
29
37
|
|
30
38
|
Statement uses MiniTest, to run tests:
|
31
39
|
|
32
|
-
|
40
|
+
```sh
|
41
|
+
$ rake test
|
42
|
+
```
|
33
43
|
|
34
44
|
## Contributing
|
35
45
|
|
@@ -42,4 +52,5 @@ Statement uses MiniTest, to run tests:
|
|
42
52
|
## Authors
|
43
53
|
|
44
54
|
* Derek Willis
|
45
|
-
* Jacob Harris
|
55
|
+
* Jacob Harris
|
56
|
+
|
data/lib/statement.rb
CHANGED
@@ -34,6 +34,145 @@ module Statement
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
# Runs every hand-written scraper defined in this class and returns their
# combined output as one flat Array of result Hashes.
#
# The scrapers are invoked in a fixed order: the single-page scrapers first
# (crenshaw pinned to year 2013, month 0), then billnelson for 2013 and 2012,
# then pages 1 through 3 of roe and of thornberry. An exception raised by any
# scraper propagates to the caller.
def self.from_scrapers
  single_page = [freshman_senators, capuano, crenshaw(2013, 0), conaway, susandavis, faleomavaega, klobuchar, lujan]
  nelson_years = [2013, 2012].map { |year| billnelson(year) }
  roe_pages = [1, 2, 3].map { |page| roe(page) }
  thornberry_pages = [1, 2, 3].map { |page| thornberry(page) }
  (single_page + nelson_years + roe_pages + thornberry_pages).flatten
end
|
41
|
+
|
42
|
+
## special cases for members without RSS feeds
|
43
|
+
|
44
|
+
# Scrapes the link list at http://www.house.gov/capuano/news/date.shtml.
#
# Every anchor whose href contains '/pr' becomes one result Hash with the
# keys :source, :url, :title, :date and :domain. The link text is expected to
# start with a date: the date is parsed from the whole text (nil when parsing
# fails) and the title is whatever follows the first space.
#
# Returns an Array of Hashes with the last four collected entries dropped
# (presumably trailing links that are not releases -- TODO confirm).
def self.capuano
  base_url = "http://www.house.gov/capuano/news/"
  list_url = base_url + 'date.shtml'
  doc = Nokogiri::HTML(open(list_url).read)

  release_links = doc.xpath("//a").select { |anchor| anchor['href'] && anchor['href'].include?('/pr') }

  results = release_links.map do |link|
    date = begin
      Date.parse(link.text)
    rescue StandardError
      nil
    end
    { :source => list_url, :url => base_url + link['href'], :title => link.text.split(' ',2).last, :date => date, :domain => "www.house.gov/capuano/" }
  end

  results[0..-5]
end
|
61
|
+
|
62
|
+
# Scrapes the press release table on crenshaw.house.gov for one year/month.
#
# year  - value for the YearDisplay query parameter. nil (or omitted) means
#         the current year.
# month - value for the MonthDisplay query parameter. nil (or omitted) means 0
#         (presumably "all months" -- TODO confirm against the site).
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain. Returns an empty Array when the page has no data rows.
def self.crenshaw(year=nil, month=nil)
  year ||= Date.today.year
  month ||= 0
  url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
  doc = Nokogiri::HTML(open(url).read)

  # Slicing past the end of a NodeSet yields nil, so a page with fewer than
  # three rows is treated as having no releases instead of raising.
  rows = doc.xpath("//tr")[2..-1] || []

  results = []
  rows.each do |row|
    date_text, title = row.children.map{|c| c.text.strip}.reject{|c| c.empty?}
    # Skip blank rows (no cells with text) and the 'Date' header row.
    next if date_text.nil? || date_text == 'Date'
    date = Date.parse(date_text)
    results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => "crenshaw.house.gov" }
  end
  results
end
|
76
|
+
|
77
|
+
# Scrapes one page of the document query listing on conaway.house.gov.
#
# page - page number interpolated into the Page query parameter (default 1).
#
# Every table row after the first is turned into a Hash with the keys
# :source, :url, :title, :date (a Date) and :domain. The link is taken from
# the second grandchild node of the row and the date from the fifth.
#
# Returns an Array of Hashes.
def self.conaway(page=1)
  base_url = "http://conaway.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  doc.xpath("//tr")[1..-1].map do |row|
    cells = row.children.children
    link = cells[1]
    { :source => page_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(cells[4].text), :domain => "conaway.house.gov" }
  end
end
|
87
|
+
|
88
|
+
# Scrapes http://www.house.gov/susandavis/news.shtml.
#
# The releases are read from the children of the seventh <ul> on the page;
# children whose text is blank are ignored. For each remaining child the
# first node supplies the date text and the second node is the link.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain.
def self.susandavis
  base_url = "http://www.house.gov/susandavis/"
  news_url = base_url+'news.shtml'
  doc = Nokogiri::HTML(open(news_url).read)

  items = doc.search("ul")[6].children.reject { |row| row.text.strip == '' }

  items.map do |row|
    link = row.children[1]
    # split.join(' ') collapses any run of whitespace in the title to one space.
    { :source => news_url, :url => base_url + link['href'], :title => link.text.split.join(' '), :date => Date.parse(row.children.first.text), :domain => "house.gov/susandavis" }
  end
end
|
98
|
+
|
99
|
+
# Scrapes http://www.house.gov/faleomavaega/news-press.shtml.
#
# Each <li type="disc"> yields one Hash: the first child node is the link
# (its text, minus non-ASCII characters and anything up to and including
# 'Washington, D.C.', is the title) and the second child node holds the date
# text.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain.
def self.faleomavaega
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  doc = Nokogiri::HTML(open(base_url).read)

  results = []
  doc.xpath("//li[@type='disc']").each do |row|
    link = row.children[0]
    title = link.text.gsub(non_ascii,'').split('Washington, D.C.').last
    # Non-ASCII characters in the date text become spaces so that the
    # surrounding date parts stay separated for Date.parse.
    date = Date.parse(row.children[1].text.gsub(non_ascii,' '))
    results << { :source => base_url, :url => "http://www.house.gov/" + link['href'], :title => title, :date => date, :domain => "house.gov/faleomavaega" }
  end
  results
end
|
108
|
+
|
109
|
+
# Scrapes press.cfm on the senate.gov site of each senator in a fixed list
# (www.<name>.senate.gov), requesting up to 200 rows per site.
#
# Table rows from the fourth onward are read; rows with blank text are
# skipped. Within a row the first grandchild node supplies the date text and
# the second is the link.
#
# Returns a flat Array of Hashes with the keys :source, :url, :title, :date
# (a Date) and :domain.
def self.freshman_senators
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  results = []
  ['baldwin', 'donnelly', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz','kaine'].each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    press_url = base_url+'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = Nokogiri::HTML(open(press_url).read)
    # Slicing past the end of a NodeSet yields nil; a short page then
    # contributes no rows instead of aborting the remaining senators.
    rows = doc.xpath("//tr")[3..-1] || []
    rows.each do |row|
      next if row.text.strip == ''
      cells = row.children.children
      link = cells[1]
      results << { :source => press_url, :url => base_url + link['href'], :title => link.text.strip.split.join(' '), :date => Date.parse(cells[0].text.gsub(non_ascii,'')), :domain => "#{senator}.senate.gov" }
    end
  end
  results.flatten
end
|
121
|
+
|
122
|
+
# Scrapes newsreleases.cfm on www.klobuchar.senate.gov for 2012 and 2013.
#
# Each <dt> holds the date text; the node immediately following it holds the
# link (its first child) and the title text.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain, 2012 entries first.
def self.klobuchar
  base_url = "http://www.klobuchar.senate.gov/"
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  results = []
  [2012,2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = Nokogiri::HTML(open(year_url).read)
    doc.xpath("//dt").each do |row|
      entry = row.next
      title = entry.text.strip.gsub(non_ascii,'').split.join(' ')
      results << { :source => year_url, :url => base_url + entry.children[0]['href'], :title => title, :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
    end
  end
  results
end
|
134
|
+
|
135
|
+
# Scrapes a fixed article page (id=981) on lujan.house.gov.
#
# The releases are read from the children of the second <ul> on the page;
# children with blank text are skipped and the first child node of each
# remaining item is the link. No date is extracted, so :date is always nil.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date and
# :domain.
def self.lujan
  base_url = 'http://lujan.house.gov/'
  article_url = base_url+'index.php?option=com_content&view=article&id=981&Itemid=78'
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  doc = Nokogiri::HTML(open(article_url).read)

  results = []
  doc.xpath('//ul')[1].children.each do |row|
    next if row.text.strip == ''
    link = row.children[0]
    # Stripping non-ASCII characters leaves 'Lujn' where the accented name
    # appeared; the second gsub restores it as 'Lujan'.
    title = link.text.gsub(non_ascii,'').gsub('Lujn','Lujan')
    results << { :source => article_url, :url => base_url + link['href'], :title => title, :date => nil, :domain => "lujan.house.gov" }
  end
  results
end
|
145
|
+
|
146
|
+
# Scrapes media.cfm on www.billnelson.senate.gov for one year.
#
# year - value interpolated into the year query parameter (default 2013).
#
# Every <li> on the page becomes one Hash: its first child node is the link
# and the text of its last child node is parsed as the date.
#
# NOTE(review): :date is stored as a String (Date#to_s) here, whereas the
# other scrapers in this file store a Date or nil -- confirm whether callers
# rely on that before changing it.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date and
# :domain.
def self.billnelson(year=2013)
  base_url = "http://www.billnelson.senate.gov/news/"
  year_url = base_url + "media.cfm?year=#{year}"
  doc = Nokogiri::HTML(open(year_url).read)

  doc.xpath('//li').map do |item|
    link = item.children[0]
    { :source => year_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(item.children.last.text).to_s, :domain => "billnelson.senate.gov" }
  end
end
|
156
|
+
|
157
|
+
# Scrapes one page of the document query listing on roe.house.gov.
#
# page - page number interpolated into the Page query parameter (default 1).
#
# Each <span class="middlecopy"> yields one Hash: child node 1 holds the
# title text, child node 4 the date text and child node 6 the link.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain.
def self.roe(page=1)
  base_url = "http://roe.house.gov/news/"
  page_url = base_url+"documentquery.aspx?DocumentTypeID=1532&Page=#{page}"
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  doc = Nokogiri::HTML(open(page_url).read)

  results = []
  doc.xpath("//span[@class='middlecopy']").each do |row|
    nodes = row.children
    results << { :source => page_url, :url => base_url + nodes[6]['href'], :title => nodes[1].text.strip.gsub(non_ascii,''), :date => Date.parse(nodes[4].text.gsub(non_ascii,'').strip), :domain => "roe.house.gov" }
  end
  results
end
|
166
|
+
|
167
|
+
# Scrapes one page of the document query listing on thornberry.house.gov.
#
# page - page number interpolated into the Page query parameter (default 1).
#
# Each <span class="middlecopy"> yields one Hash: child node 1 holds the
# title text, child node 4 the date text and child node 6 the link.
#
# Returns an Array of Hashes with the keys :source, :url, :title, :date (a
# Date) and :domain.
def self.thornberry(page=1)
  base_url = "http://thornberry.house.gov/news/"
  page_url = base_url+"documentquery.aspx?DocumentTypeID=1776&Page=#{page}"
  # Matches every non-ASCII character. The byte-range form /[\x80-\xff]/ is
  # rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7f]/
  doc = Nokogiri::HTML(open(page_url).read)

  results = []
  doc.xpath("//span[@class='middlecopy']").each do |row|
    nodes = row.children
    results << { :source => page_url, :url => base_url + nodes[6]['href'], :title => nodes[1].text.strip.gsub(non_ascii,''), :date => Date.parse(nodes[4].text.gsub(non_ascii,'').strip), :domain => "thornberry.house.gov" }
  end
  results
end
|
176
|
+
|
37
177
|
end
|
38
|
-
|
39
178
|
end
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 3
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
- 3
|
9
|
-
version: "0.3"
|
8
|
+
- 4
|
9
|
+
version: "0.4"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Derek Willis
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2013-04-
|
17
|
+
date: 2013-04-25 00:00:00 Z
|
18
18
|
dependencies:
|
19
19
|
- !ruby/object:Gem::Dependency
|
20
20
|
name: bundler
|