statement 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -8,6 +8,8 @@ InstalledFiles
8
8
  _yardoc
9
9
  coverage
10
10
  doc/
11
+ coverage
12
+ InstalledFiles
11
13
  lib/bundler/man
12
14
  pkg
13
15
  rdoc
data/README.md CHANGED
@@ -6,30 +6,40 @@ Statement parses RSS feeds and HTML pages containing press releases and other of
6
6
 
7
7
  Add this line to your application's Gemfile:
8
8
 
9
- gem 'statement'
9
+ ```ruby
10
+ gem 'statement'
11
+ ```
10
12
 
11
13
  And then execute:
12
14
 
13
- $ bundle
15
+ ```sh
16
+ $ bundle
17
+ ```
14
18
 
15
19
  Or install it yourself as:
16
20
 
17
- $ gem install statement
21
+ ```sh
22
+ $ gem install statement
23
+ ```
18
24
 
19
25
  ## Usage
20
26
 
21
- require 'rubygems'
22
- require 'statement'
23
-
24
- results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
25
- puts results.first
26
- {:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
27
+ ```ruby
28
+ require 'rubygems'
29
+ require 'statement'
27
30
 
31
+ results = Statement::Link.house_gop('http://www.gop.gov/republicans/news?offset=03/29/11')
32
+ puts results.first
33
+ {:source=>"http://www.gop.gov/republicans/news?offset=03/29/11", :url=>"http://poe.house.gov/News/DocumentSingle.aspx?DocumentID=233004", :title=>"Poe: War in the Name of Humanity", :date=> <Date: 2011-03-29 ((2455650j,0s,0n),+0s,2299161j)>, :domain=>"poe.house.gov"}
34
+ ```
35
+
28
36
  ## Tests
29
37
 
30
38
  Statement uses MiniTest. To run the tests:
31
39
 
32
- rake test
40
+ ```sh
41
+ $ rake test
42
+ ```
33
43
 
34
44
  ## Contributing
35
45
 
@@ -42,4 +52,5 @@ Statement uses MiniTest, to run tests:
42
52
  ## Authors
43
53
 
44
54
  * Derek Willis
45
- * Jacob Harris
55
+ * Jacob Harris
56
+
data/lib/statement.rb CHANGED
@@ -34,6 +34,145 @@ module Statement
34
34
  end
35
35
  end
36
36
 
37
# Runs every hand-written scraper and merges the results.
#
# Returns one flat Array containing whatever each scraper returned, in the
# order the scrapers are listed here.
def self.from_scrapers
  # Arguments are passed positionally. Ruby has no `name=value` call syntax:
  # writing `roe(page=1)` only assigns a throwaway local before the call.
  [
    freshman_senators, capuano, crenshaw(2013, 0), conaway, susandavis,
    faleomavaega, klobuchar, lujan,
    billnelson(2013), billnelson(2012),
    roe(1), roe(2), roe(3),
    thornberry(1), thornberry(2), thornberry(3)
  ].flatten
end
41
+
42
+ ## special cases for members without RSS feeds
43
+
44
# Scrapes the date-ordered press release index at house.gov/capuano.
#
# Every anchor whose href contains '/pr' is treated as a release. The date is
# parsed from the link text (nil when it cannot be parsed) and the title is
# the link text with its first whitespace-delimited token removed.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.capuano
  base_url = "http://www.house.gov/capuano/news/"
  list_url = base_url + 'date.shtml'
  doc = Nokogiri::HTML(open(list_url).read)

  results = []
  doc.xpath("//a").each do |link|
    href = link['href']
    next unless href && href.include?('/pr')

    date = begin
      Date.parse(link.text)
    rescue ArgumentError
      # Link text carried no recognisable date; keep the entry undated.
      nil
    end
    results << { :source => list_url, :url => base_url + href, :title => link.text.split(' ', 2).last, :date => date, :domain => "www.house.gov/capuano/" }
  end

  # The last four matching links are dropped.
  # NOTE(review): presumably these are non-release links on the page - confirm.
  results[0..-5]
end
61
+
62
# Scrapes the press release table at crenshaw.house.gov.
#
# year  - value for the YearDisplay query parameter; nil means the current year.
# month - value for the MonthDisplay query parameter; nil means 0.
#         NOTE(review): 0 presumably selects all months - confirm.
#
# Both parameters may be omitted; callers passing two arguments are unaffected.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
# Raises whatever Date.parse raises if a row's first text cell is not a date.
def self.crenshaw(year=nil, month=nil)
  year ||= Date.today.year
  month ||= 0
  url = "http://crenshaw.house.gov/index.cfm/pressreleases?YearDisplay=#{year}&MonthDisplay=#{month}&page=1"
  doc = Nokogiri::HTML(open(url).read)

  # Slicing past the end of the row set yields nil; treat that as "no rows".
  rows = doc.xpath("//tr")[2..-1] || []

  results = []
  rows.each do |row|
    # The first two non-blank child texts of a row are its date and title.
    date_text, title = row.children.map { |cell| cell.text.strip }.reject { |text| text.empty? }
    # Skip rows without any text as well as the 'Date' header row.
    next if date_text.nil? || date_text == 'Date'

    date = Date.parse(date_text)
    results << { :source => url, :url => row.children[2].children.first['href'], :title => title, :date => date, :domain => "crenshaw.house.gov" }
  end
  results
end
76
+
77
# Scrapes one page of the document listing at conaway.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.conaway(page=1)
  base_url = "http://conaway.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1279&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  # The first <tr> is skipped; a page without rows yields nil from the slice.
  rows = doc.xpath("//tr")[1..-1] || []

  rows.map do |row|
    # Grandchildren of the row: index 1 is the release link, index 4 the date.
    cells = row.children.children
    link = cells[1]
    { :source => page_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(cells[4].text), :domain => "conaway.house.gov" }
  end
end
87
+
88
# Scrapes the news page at house.gov/susandavis.
#
# Releases are read from the children of the seventh <ul> on the page. For
# each non-blank child, the first grandchild supplies the date text and the
# second is the link.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.susandavis
  base_url = "http://www.house.gov/susandavis/"
  list_url = base_url + 'news.shtml'
  doc = Nokogiri::HTML(open(list_url).read)

  entries = doc.search("ul")[6].children.reject { |row| row.text.strip == '' }
  entries.map do |row|
    link = row.children[1]
    {
      :source => list_url,
      :url => base_url + link['href'],
      # Collapse runs of whitespace inside the headline.
      :title => link.text.split.join(' '),
      :date => Date.parse(row.children.first.text),
      :domain => "house.gov/susandavis"
    }
  end
end
98
+
99
# Scrapes the press page at house.gov/faleomavaega.
#
# Each <li type="disc"> is one release: its first child is the link and its
# second child holds the date text. The title keeps only the text after the
# last 'Washington, D.C.' marker.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.faleomavaega
  base_url = "http://www.house.gov/faleomavaega/news-press.shtml"
  doc = Nokogiri::HTML(open(base_url).read)

  doc.xpath("//li[@type='disc']").map do |row|
    link = row.children[0]
    date_node = row.children[1]
    # Non-ASCII characters are removed with a negated ASCII class. The
    # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
    # escape when the source file is UTF-8.
    title = link.text.gsub(/[^\x00-\x7F]/, '').split('Washington, D.C.').last
    date = Date.parse(date_node.text.gsub(/[^\x00-\x7F]/, ' '))
    { :source => base_url, :url => "http://www.house.gov/" + link['href'], :title => title, :date => date, :domain => "house.gov/faleomavaega" }
  end
end
108
+
109
# Scrapes the press.cfm listing on each listed senator's senate.gov site.
#
# For every senator the first three <tr> elements are skipped and blank rows
# are ignored. Within a row's grandchildren, index 0 holds the date text and
# index 1 is the release link.
#
# Returns one Array of Hashes (:source, :url, :title, :date, :domain) covering
# all senators, in list order.
def self.freshman_senators
  results = []
  ['baldwin', 'donnelly', 'flake', 'hirono','heinrich','murphy','scott','king','heitkamp','cruz','kaine'].each do |senator|
    base_url = "http://www.#{senator}.senate.gov/"
    press_url = base_url + 'press.cfm?maxrows=200&startrow=1&&type=1'
    doc = Nokogiri::HTML(open(press_url).read)

    # Slicing past the end of the row set yields nil; treat that as "no rows".
    rows = doc.xpath("//tr")[3..-1] || []
    rows.each do |row|
      next if row.text.strip.empty?

      cells = row.children.children
      link = cells[1]
      # Non-ASCII characters are removed with a negated ASCII class. The
      # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
      # escape when the source file is UTF-8.
      date = Date.parse(cells[0].text.gsub(/[^\x00-\x7F]/, ''))
      results << { :source => press_url, :url => base_url + link['href'], :title => link.text.strip.split.join(' '), :date => date, :domain => "#{senator}.senate.gov" }
    end
  end
  results
end
121
+
122
# Scrapes the 2012 and 2013 news release listings at klobuchar.senate.gov.
#
# Each <dt> supplies the date text; the node that follows it supplies the
# title text, and that node's first child is the release link.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.klobuchar
  results = []
  base_url = "http://www.klobuchar.senate.gov/"
  [2012,2013].each do |year|
    year_url = base_url + "newsreleases.cfm?year=#{year}"
    doc = Nokogiri::HTML(open(year_url).read)
    doc.xpath("//dt").each do |row|
      entry = row.next
      # Non-ASCII characters are removed with a negated ASCII class. The
      # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
      # escape when the source file is UTF-8.
      title = entry.text.strip.gsub(/[^\x00-\x7F]/, '').split.join(' ')
      results << { :source => year_url, :url => base_url + entry.children[0]['href'], :title => title, :date => Date.parse(row.text), :domain => "klobuchar.senate.gov" }
    end
  end
  results
end
134
+
135
# Scrapes the release list article at lujan.house.gov.
#
# Releases are the non-blank children of the second <ul> on the page; each
# one's first child is the link. No date is extracted, so :date is always nil.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.lujan
  base_url = 'http://lujan.house.gov/'
  list_url = base_url + 'index.php?option=com_content&view=article&id=981&Itemid=78'
  doc = Nokogiri::HTML(open(list_url).read)

  results = []
  doc.xpath('//ul')[1].children.each do |row|
    next if row.text.strip.empty?

    link = row.children[0]
    # Non-ASCII characters are removed with a negated ASCII class. The
    # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
    # escape when the source file is UTF-8. Stripping leaves 'Lujn' behind,
    # which is then rewritten to 'Lujan'.
    title = link.text.gsub(/[^\x00-\x7F]/, '').gsub('Lujn','Lujan')
    results << { :source => list_url, :url => base_url + link['href'], :title => title, :date => nil, :domain => "lujan.house.gov" }
  end
  results
end
145
+
146
# Scrapes one year of the media listing at billnelson.senate.gov.
#
# year - value for the year query parameter (default 2013).
#
# Every <li> is treated as a release: its first child is the link and its
# last child holds the date text.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
# :date is a Date object, not a String, so entries compare and sort correctly
# against Date values.
def self.billnelson(year=2013)
  base_url = "http://www.billnelson.senate.gov/news/"
  year_url = base_url + "media.cfm?year=#{year}"
  doc = Nokogiri::HTML(open(year_url).read)

  doc.xpath('//li').map do |row|
    link = row.children[0]
    { :source => year_url, :url => base_url + link['href'], :title => link.text.strip, :date => Date.parse(row.children.last.text), :domain => "billnelson.senate.gov" }
  end
end
156
+
157
# Scrapes one page of the document listing at roe.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# Each <span class="middlecopy"> is one release. Among its children, index 1
# holds the title text, index 4 the date text and index 6 is the link.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.roe(page=1)
  base_url = "http://roe.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1532&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  doc.xpath("//span[@class='middlecopy']").map do |row|
    parts = row.children
    # Non-ASCII characters are removed with a negated ASCII class. The
    # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
    # escape when the source file is UTF-8.
    title = parts[1].text.strip.gsub(/[^\x00-\x7F]/, '')
    date = Date.parse(parts[4].text.gsub(/[^\x00-\x7F]/, '').strip)
    { :source => page_url, :url => base_url + parts[6]['href'], :title => title, :date => date, :domain => "roe.house.gov" }
  end
end
166
+
167
# Scrapes one page of the document listing at thornberry.house.gov.
#
# page - value for the Page query parameter (default 1).
#
# Each <span class="middlecopy"> is one release. Among its children, index 1
# holds the title text, index 4 the date text and index 6 is the link.
#
# Returns an Array of Hashes with :source, :url, :title, :date and :domain.
def self.thornberry(page=1)
  base_url = "http://thornberry.house.gov/news/"
  page_url = base_url + "documentquery.aspx?DocumentTypeID=1776&Page=#{page}"
  doc = Nokogiri::HTML(open(page_url).read)

  doc.xpath("//span[@class='middlecopy']").map do |row|
    parts = row.children
    # Non-ASCII characters are removed with a negated ASCII class. The
    # byte-range form /[\x80-\xff]/ is rejected as an invalid multibyte
    # escape when the source file is UTF-8.
    title = parts[1].text.strip.gsub(/[^\x00-\x7F]/, '')
    date = Date.parse(parts[4].text.gsub(/[^\x00-\x7F]/, '').strip)
    { :source => page_url, :url => base_url + parts[6]['href'], :title => title, :date => date, :domain => "thornberry.house.gov" }
  end
end
176
+
37
177
  end
38
-
39
178
  end
@@ -1,3 +1,3 @@
1
1
  module Statement
2
- VERSION = "0.3"
2
+ VERSION = "0.4"
3
3
  end
metadata CHANGED
@@ -1,12 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: statement
3
3
  version: !ruby/object:Gem::Version
4
- hash: 13
4
+ hash: 3
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
- - 3
9
- version: "0.3"
8
+ - 4
9
+ version: "0.4"
10
10
  platform: ruby
11
11
  authors:
12
12
  - Derek Willis
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2013-04-22 00:00:00 Z
17
+ date: 2013-04-25 00:00:00 Z
18
18
  dependencies:
19
19
  - !ruby/object:Gem::Dependency
20
20
  name: bundler