statement 0.3 → 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/README.md +22 -11
- data/lib/statement.rb +140 -1
- data/lib/statement/version.rb +1 -1
- metadata +4 -4
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -6,30 +6,40 @@ Statement parses RSS feeds and HTML pages containing press releases and other of
|
|
6
6
|
|
7
7
|
Add this line to your application's Gemfile:
|
8
8
|
|
9
|
-
|
9
|
+
```ruby
|
10
|
+
gem 'statement'
|
11
|
+
```
|
10
12
|
|
11
13
|
And then execute:
|
12
14
|
|
13
|
-
|
15
|
+
```sh
|
16
|
+
$ bundle
|
17
|
+
```
|
14
18
|
|
15
19
|
Or install it yourself as:
|
16
20
|
|
17
|
-
|
21
|
+
```sh
|
22
|
+
$ gem install statement
|
23
|
+
```
|
18
24
|
|
19
25
|
## Usage
|
20
26
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
|
25
|
-
puts results.first
|
26
|
-
{:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
|
27
|
+
```ruby
|
28
|
+
require 'rubygems'
|
29
|
+
require 'statement'
|
27
30
|
|
31
|
+
results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
|
32
|
+
puts results.first
|
33
|
+
{:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
|
34
|
+
```
|
35
|
+
|
28
36
|
## Tests
|
29
37
|
|
30
38
|
Statement uses MiniTest. To run the tests:
|
31
39
|
|
32
|
-
|
40
|
+
```sh
|
41
|
+
$ rake test
|
42
|
+
```
|
33
43
|
|
34
44
|
## Contributing
|
35
45
|
|
@@ -42,4 +52,5 @@ Statement uses MiniTest, to run tests:
|
|
42
52
|
## Authors
|
43
53
|
|
44
54
|
* Derek Willis
|
45
|
-
* Jacob Harris
|
55
|
+
* Jacob Harris
|
56
|
+
|
data/lib/statement.rb
CHANGED
@@ -34,6 +34,145 @@ module Statement
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
+
# Runs every hand-written scraper defined below and merges their output.
#
# Arguments are passed positionally: inside a call, `year=2013` would be a
# local-variable assignment, not a named argument, so it is spelled as a
# plain value here.
#
# Returns a single flat Array containing the result hashes of all scrapers,
# in the order the scrapers are listed.
def self.from_scrapers
  [
    freshman_senators,
    capuano,
    crenshaw(2013, 0),
    conaway,
    susandavis,
    faleomavaega,
    klobuchar,
    lujan,
    billnelson(2013),
    billnelson(2012),
    roe(1),
    roe(2),
    roe(3),
    thornberry(1),
    thornberry(2),
    thornberry(3)
  ].flatten
end
|
41
|
+
|
42
|
+
## special cases for members without RSS feeds
|
43
|
+
|
44
|
+
# Scrapes the by-date press release index under house.gov/capuano/news/.
#
# Every <a> whose href contains '/pr' becomes a result hash. The link text is
# parsed as a date when possible (otherwise :date is nil), and the title is
# the link text after its first whitespace-separated token.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain) with the
# last four matching links dropped.
def self.capuano
  news_root = "http://www.house.gov/capuano/news/"
  index_url = news_root + 'date.shtml'
  page = Nokogiri::HTML(open(index_url).read)

  releases = []
  page.xpath("//a").each do |anchor|
    href = anchor['href']
    next unless href && href.include?('/pr')

    label = anchor.text
    # Link text that does not parse as a date leaves the release undated.
    released_on = begin
      Date.parse(label)
    rescue
      nil
    end

    releases << {
      :source => index_url,
      :url    => news_root + href,
      :title  => label.split(' ', 2).last,
      :date   => released_on,
      :domain => "www.house.gov/capuano/"
    }
  end

  # NOTE(review): the trailing four matches are discarded; presumably they are
  # non-release links at the bottom of the page - confirm against the site.
  releases[0..-5]
end
|
61
|
+
|
62
|
+
# Scrapes one listing page of crenshaw.house.gov press releases.
#
# year  - value for the YearDisplay query parameter; nil (the default) means
#         the current year.
# month - value for the MonthDisplay query parameter; nil (the default)
#         means 0.
#
# The first two <tr> elements are skipped, as is any row whose first
# non-empty cell reads 'Date' and any row with no text at all.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain); empty
# when the page has no rows beyond the skipped ones.
def self.crenshaw(year=nil, month=nil)
  year ||= Date.today.year
  month ||= 0
  url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
  doc = Nokogiri::HTML(open(url).read)

  # Slicing past the end of the row set yields nil, so fall back to no rows.
  rows = doc.xpath("//tr")[2..-1] || []

  results = []
  rows.each do |row|
    date_text, title = row.children.map { |cell| cell.text.strip }.reject { |text| text.empty? }
    # Blank rows have no date cell; the 'Date' row is the column header.
    next if date_text.nil? || date_text == 'Date'
    results << {
      :source => url,
      :url    => row.children[2].children.first['href'],
      :title  => title,
      :date   => Date.parse(date_text),
      :domain => "crenshaw.house.gov"
    }
  end
  results
end
|
76
|
+
|
77
|
+
# Scrapes one page of the conaway.house.gov press release listing.
#
# page - value for the Page query parameter (default 1).
#
# The first <tr> is skipped. For each remaining row the second grandchild
# node supplies the link and title, and the fifth grandchild node supplies
# the date text.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain); empty
# when the page contains no <tr> elements.
def self.conaway(page=1)
  base_url = "http://conaway.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  # Slicing an empty row set yields nil, so fall back to no rows.
  rows = doc.xpath("//tr")[1..-1] || []

  results = []
  rows.each do |row|
    cells = row.children.children
    link = cells[1]
    results << {
      :source => page_url,
      :url    => base_url + link['href'],
      :title  => link.text.strip,
      :date   => Date.parse(cells[4].text),
      :domain => "conaway.house.gov"
    }
  end
  results
end
|
87
|
+
|
88
|
+
# Scrapes the news page under house.gov/susandavis/.
#
# Reads the children of the seventh <ul> on the page, ignoring children
# whose text is blank. In each remaining item the first child node holds the
# date text and the second child node is the link.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain).
def self.susandavis
  site_root = "http://www.house.gov/susandavis/"
  listing_url = site_root + 'news.shtml'
  page = Nokogiri::HTML(open(listing_url).read)

  # NOTE(review): index 6 is tied to the page layout - confirm it still
  # points at the release list.
  items = page.search("ul")[6].children.reject { |item| item.text.strip == '' }

  items.map do |item|
    anchor = item.children[1]
    {
      :source => listing_url,
      :url    => site_root + anchor['href'],
      # Collapse runs of whitespace inside the link text.
      :title  => anchor.text.split.join(' '),
      :date   => Date.parse(item.children.first.text),
      :domain => "house.gov/susandavis"
    }
  end
end
|
98
|
+
|
99
|
+
# Scrapes the press page under house.gov/faleomavaega.
#
# Each <li type="disc"> yields one result: its first child node is the link
# (href and title) and its second child node holds the date text. Non-ASCII
# characters are removed from the title and replaced with spaces in the date
# text before parsing. The title keeps only the text after the last
# 'Washington, D.C.' occurrence.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain).
def self.faleomavaega
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  doc = Nokogiri::HTML(open(base_url).read)

  results = []
  doc.xpath("//li[@type='disc']").each do |row|
    link = row.children[0]
    results << {
      :source => base_url,
      :url    => "http://www.house.gov/" + link['href'],
      :title  => link.text.gsub(non_ascii, '').split('Washington, D.C.').last,
      :date   => Date.parse(row.children[1].text.gsub(non_ascii, ' ')),
      :domain => "house.gov/faleomavaega"
    }
  end
  results
end
|
108
|
+
|
109
|
+
# Scrapes the press listing (press.cfm, up to 200 rows) of each senator in
# the hard-coded list below.
#
# For every senator the first three <tr> elements and any blank rows are
# skipped. In the remaining rows the first grandchild node holds the date
# text and the second grandchild node is the link. A page with three or
# fewer rows contributes nothing instead of aborting the whole run.
#
# Returns a flat Array of hashes (:source, :url, :title, :date, :domain)
# covering all senators, in list order.
def self.freshman_senators
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  senators = ['baldwin', 'donnelly', 'flake', 'hirono', 'heinrich', 'murphy', 'scott', 'king', 'heitkamp', 'cruz', 'kaine']

  results = []
  senators.each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    source_url = base_url + 'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = Nokogiri::HTML(open(source_url).read)

    # Slicing past the end of the row set yields nil, so fall back to no rows.
    rows = doc.xpath("//tr")[3..-1] || []
    rows.each do |row|
      next if row.text.strip == ''
      cells = row.children.children
      link = cells[1]
      results << {
        :source => source_url,
        :url    => base_url + link['href'],
        # Collapse runs of whitespace inside the link text.
        :title  => link.text.strip.split.join(' '),
        :date   => Date.parse(cells[0].text.gsub(non_ascii, '')),
        :domain => "#{senator}.senate.gov"
      }
    end
  end
  results.flatten
end
|
121
|
+
|
122
|
+
# Scrapes the klobuchar.senate.gov news release listings for 2012 and 2013.
#
# Each <dt> holds the date text; the node that follows it supplies the link
# (its first child) and the title text. Non-ASCII characters are removed
# from the title and its whitespace is collapsed.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain), 2012
# entries first.
def self.klobuchar
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  base_url = "http://www.klobuchar.senate.gov/"

  results = []
  [2012, 2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = Nokogiri::HTML(open(year_url).read)
    doc.xpath("//dt").each do |row|
      detail = row.next
      results << {
        :source => year_url,
        :url    => base_url + detail.children[0]['href'],
        :title  => detail.text.strip.gsub(non_ascii, '').split.join(' '),
        :date   => Date.parse(row.text),
        :domain => "klobuchar.senate.gov"
      }
    end
  end
  results
end
|
134
|
+
|
135
|
+
# Scrapes the lujan.house.gov article that lists press releases.
#
# Reads the children of the second <ul> on the page, skipping blank ones;
# the first child node of each item is the link. The page gives no dates, so
# :date is always nil.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain).
def self.lujan
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  base_url = 'http://lujan.house.gov/'
  source_url = base_url + 'index.php?option=com_content&view=article&id=981&Itemid=78'
  doc = Nokogiri::HTML(open(source_url).read)

  results = []
  doc.xpath('//ul')[1].children.each do |row|
    next if row.text.strip == ''
    link = row.children[0]
    results << {
      :source => source_url,
      :url    => base_url + link['href'],
      # Removing the accented character leaves 'Lujn'; restore the plain
      # ASCII spelling of the name.
      :title  => link.text.gsub(non_ascii, '').gsub('Lujn', 'Lujan'),
      :date   => nil,
      :domain => "lujan.house.gov"
    }
  end
  results
end
|
145
|
+
|
146
|
+
# Scrapes one year of the billnelson.senate.gov media listing.
#
# year - value for the year query parameter (default 2013).
#
# Every <li> on the page yields one result: its first child node is the link
# and its last child node holds the date text.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain). :date
# is a Date object, as in the other scrapers, so merged results can be
# compared and sorted by date.
def self.billnelson(year=2013)
  base_url = "http://www.billnelson.senate.gov/news/"
  year_url = base_url + "media.cfm?year=#{year}"
  doc = Nokogiri::HTML(open(year_url).read)

  results = []
  doc.xpath('//li').each do |row|
    link = row.children[0]
    results << {
      :source => year_url,
      :url    => base_url + link['href'],
      :title  => link.text.strip,
      :date   => Date.parse(row.children.last.text),
      :domain => "billnelson.senate.gov"
    }
  end
  results
end
|
156
|
+
|
157
|
+
# Scrapes one page of the roe.house.gov press release listing.
#
# page - value for the Page query parameter (default 1).
#
# Each <span class="middlecopy"> yields one result, read from fixed child
# positions: child 1 is the title, child 4 the date text, child 6 the link.
# Non-ASCII characters are removed from the title and the date text.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain).
def self.roe(page=1)
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  base_url = "http://roe.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1532&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  results = []
  doc.xpath("//span[@class='middlecopy']").each do |row|
    parts = row.children
    results << {
      :source => page_url,
      :url    => base_url + parts[6]['href'],
      :title  => parts[1].text.strip.gsub(non_ascii, ''),
      :date   => Date.parse(parts[4].text.gsub(non_ascii, '').strip),
      :domain => "roe.house.gov"
    }
  end
  results
end
|
166
|
+
|
167
|
+
# Scrapes one page of the thornberry.house.gov press release listing.
#
# page - value for the Page query parameter (default 1).
#
# Each <span class="middlecopy"> yields one result, read from fixed child
# positions: child 1 is the title, child 4 the date text, child 6 the link.
# Non-ASCII characters are removed from the title and the date text.
#
# Returns an Array of hashes (:source, :url, :title, :date, :domain).
def self.thornberry(page=1)
  # Negated ASCII range instead of the byte range \x80-\xff: the byte-range
  # literal is rejected as an invalid multibyte escape in UTF-8 source files.
  non_ascii = /[^\x00-\x7F]/
  base_url = "http://thornberry.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1776&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  results = []
  doc.xpath("//span[@class='middlecopy']").each do |row|
    parts = row.children
    results << {
      :source => page_url,
      :url    => base_url + parts[6]['href'],
      :title  => parts[1].text.strip.gsub(non_ascii, ''),
      :date   => Date.parse(parts[4].text.gsub(non_ascii, '').strip),
      :domain => "thornberry.house.gov"
    }
  end
  results
end
|
176
|
+
|
37
177
|
end
|
38
|
-
|
39
178
|
end
|
data/lib/statement/version.rb
CHANGED
metadata
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: statement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 3
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: "0.
|
8
|
+
- 4
|
9
|
+
version: "0.4"
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Derek Willis
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2013-04-
|
17
|
+
date: 2013-04-25 00:00:00 Z
|
18
18
|
dependencies:
|
19
19
|
- !ruby/object:Gem::Dependency
|
20
20
|
name: bundler
|