ruby-readability 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES.markdown ADDED
@@ -0,0 +1,3 @@
1
+ Oct 1, 2012:
2
+
3
+ - Merged in austinrfnd's `author` handling code.
data/lib/readability.rb CHANGED
@@ -131,6 +131,55 @@ module Readability
131
131
  title ? title.text : nil
132
132
  end
133
133
 
134
+ # Look through the @html document looking for the author
135
+ # Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
136
+ # Returns nil if no author is detected
137
+ def author
138
+ # Let's grab this author:
139
+ # <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
140
+ author_elements = @html.xpath('//meta[@name = "dc.creator"]')
141
+ unless author_elements.empty?
142
+ author_elements.each do |element|
143
+ if element['content']
144
+ return element['content'].strip
145
+ end
146
+ end
147
+ end
148
+
149
+ # Now let's try to grab this
150
+ # <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
151
+ # <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
152
+ author_elements = @html.xpath('//*[contains(@class, "vcard")]//*[contains(@class, "fn")]')
153
+ unless author_elements.empty?
154
+ author_elements.each do |element|
155
+ if element.text
156
+ return element.text.strip
157
+ end
158
+ end
159
+ end
160
+
161
+ # Now let's try to grab this
162
+ # <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
163
+ # TODO: strip out the (rel)?
164
+ author_elements = @html.xpath('//a[@rel = "author"]')
165
+ unless author_elements.empty?
166
+ author_elements.each do |element|
167
+ if element.text
168
+ return element.text.strip
169
+ end
170
+ end
171
+ end
172
+
173
+ author_elements = @html.xpath('//*[@id = "author"]')
174
+ unless author_elements.empty?
175
+ author_elements.each do |element|
176
+ if element.text
177
+ return element.text.strip
178
+ end
179
+ end
180
+ end
181
+ end
182
+
134
183
  def content(remove_unlikely_candidates = :default)
135
184
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false
136
185
 
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.5.4'
6
+ s.version = '0.5.5'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/iterationlabs/ruby-readability"
@@ -128,6 +128,75 @@ describe Readability do
128
128
  end
129
129
  end
130
130
 
131
+ describe "author" do
132
+ it "should pick up <meta name='dc.creator'></meta> as an author" do
133
+ doc = Readability::Document.new(<<-HTML)
134
+ <html>
135
+ <head>
136
+ <meta name='dc.creator' content='Austin Fonacier' />
137
+ </head>
138
+ <body></body>
139
+ </html>
140
+ HTML
141
+ doc.author.should eql("Austin Fonacier")
142
+ end
143
+
144
+ it "should pick up readability's recommended author format" do
145
+ doc = Readability::Document.new(<<-HTML)
146
+ <html>
147
+ <head>
148
+ </head>
149
+ <body>
150
+ <p class="byline author vcard">
151
+ By <cite class="fn">Austin Fonacier</span>
152
+ </p>
153
+ </body>
154
+ </html>
155
+ HTML
156
+ doc.author.should eql("Austin Fonacier")
157
+ end
158
+
159
+ it "should pick up vcard fn" do
160
+ doc = Readability::Document.new(<<-HTML)
161
+ <html>
162
+ <head>
163
+ </head>
164
+ <body>
165
+ <div class="author">By</div>
166
+ <div class="author vcard">
167
+ <a class="url fn" href="http://austinlivesinyotests.com/">Austin Fonacier</a>
168
+ </div>
169
+ </body>
170
+ </html>
171
+ HTML
172
+ doc.author.should eql("Austin Fonacier")
173
+ end
174
+
175
+ it "should pick up <a rel='author'>" do
176
+ doc = Readability::Document.new(<<-HTML)
177
+ <html>
178
+ <head></head>
179
+ <body>
180
+ <a rel="author" href="http://google.com">Danny Banks (rel)</a>
181
+ </body>
182
+ </html>
183
+ HTML
184
+ doc.author.should eql("Danny Banks (rel)")
185
+ end
186
+
187
+ it "should pick up <div id='author'>" do
188
+ doc = Readability::Document.new(<<-HTML)
189
+ <html>
190
+ <head></head>
191
+ <body>
192
+ <div id="author">Austin Fonacier (author)</div>
193
+ </body>
194
+ </html>
195
+ HTML
196
+ doc.author.should eql("Austin Fonacier (author)")
197
+ end
198
+ end
199
+
131
200
  describe "score_node" do
132
201
  before do
133
202
  @doc = Readability::Document.new(<<-HTML)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.4
4
+ version: 0.5.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -12,7 +12,7 @@ authors:
12
12
  autorequire:
13
13
  bindir: bin
14
14
  cert_chain: []
15
- date: 2012-07-27 00:00:00.000000000 Z
15
+ date: 2012-10-02 00:00:00.000000000 Z
16
16
  dependencies:
17
17
  - !ruby/object:Gem::Dependency
18
18
  name: rspec
@@ -105,6 +105,7 @@ files:
105
105
  - .document
106
106
  - .gitignore
107
107
  - .rspec
108
+ - CHANGES.markdown
108
109
  - Gemfile
109
110
  - README.markdown
110
111
  - Rakefile
@@ -150,8 +151,26 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
151
  version: '0'
151
152
  requirements: []
152
153
  rubyforge_project: ruby-readability
153
- rubygems_version: 1.8.19
154
+ rubygems_version: 1.8.21
154
155
  signing_key:
155
156
  specification_version: 3
156
157
  summary: Port of arc90's readability project to ruby
157
- test_files: []
158
+ test_files:
159
+ - spec/fixtures/bbc.html
160
+ - spec/fixtures/cant_read.html
161
+ - spec/fixtures/images/dim_1416768a.jpg
162
+ - spec/fixtures/nytimes.html
163
+ - spec/fixtures/sample.html
164
+ - spec/fixtures/samples/blogpost_with_links-fragments.rb
165
+ - spec/fixtures/samples/blogpost_with_links.html
166
+ - spec/fixtures/samples/channel4-1-fragments.rb
167
+ - spec/fixtures/samples/channel4-1.html
168
+ - spec/fixtures/samples/foxnews-india1-fragments.rb
169
+ - spec/fixtures/samples/foxnews-india1.html
170
+ - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
171
+ - spec/fixtures/samples/globemail-ottawa-cuts.html
172
+ - spec/fixtures/should_not_truncate.txt
173
+ - spec/fixtures/thesun.html
174
+ - spec/readability_spec.rb
175
+ - spec/spec.opts
176
+ - spec/spec_helper.rb