ruby-readability 0.5.4 → 0.5.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES.markdown +3 -0
- data/lib/readability.rb +49 -0
- data/ruby-readability.gemspec +1 -1
- data/spec/readability_spec.rb +69 -0
- metadata +23 -4
data/CHANGES.markdown
ADDED
data/lib/readability.rb
CHANGED
@@ -131,6 +131,55 @@ module Readability
|
|
131
131
|
title ? title.text : nil
|
132
132
|
end
|
133
133
|
|
134
|
+
# Search the @html document for the author
|
135
|
+
# Precedence Information here on the wiki: (TODO attach wiki URL if it is accepted)
|
136
|
+
# Returns nil if no author is detected
|
137
|
+
def author
  # Returns the first non-blank author string found in @html, stripped of
  # surrounding whitespace, or nil when no strategy yields one.
  #
  # Strategies are tried in precedence order; within a strategy, elements
  # are visited in the order @html.xpath returns them:
  #
  #   1. <meta name="dc.creator" content="Finch - http://www.getfinch.com" />
  #   2. An "fn" element nested inside a "vcard" element, e.g.
  #      <span class="byline author vcard"><span>By</span><cite class="fn">Austin Fonacier</cite></span>
  #      <div class="author">By</div><div class="author vcard"><a class="url fn" href="http://austinlivesinyoapp.com/">Austin Fonacier</a></div>
  #   3. <a rel="author" href="http://dbanksdesign.com">Danny Banks (rel)</a>
  #      TODO: strip out the (rel)?
  #   4. Any element with id="author"
  read_content = lambda { |element| element['content'] }
  read_text    = lambda { |element| element.text }

  strategies = [
    ['//meta[@name = "dc.creator"]', read_content],
    ['//*[contains(@class, "vcard")]//*[contains(@class, "fn")]', read_text],
    ['//a[@rel = "author"]', read_text],
    ['//*[@id = "author"]', read_text]
  ]

  strategies.each do |xpath, reader|
    @html.xpath(xpath).each do |element|
      # to_s covers a missing attribute (nil). An empty string is truthy in
      # Ruby, so blank values are rejected explicitly to let the next
      # element or strategy have a chance.
      candidate = reader.call(element).to_s.strip
      return candidate unless candidate.empty?
    end
  end

  # Explicit nil: otherwise the method would return the strategies array.
  nil
end
|
182
|
+
|
134
183
|
def content(remove_unlikely_candidates = :default)
|
135
184
|
@remove_unlikely_candidates = false if remove_unlikely_candidates == false
|
136
185
|
|
data/ruby-readability.gemspec
CHANGED
@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "ruby-readability"
|
6
|
-
s.version = '0.5.
|
6
|
+
s.version = '0.5.5'
|
7
7
|
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
|
8
8
|
s.email = ["andrew@iterationlabs.com"]
|
9
9
|
s.homepage = "http://github.com/iterationlabs/ruby-readability"
|
data/spec/readability_spec.rb
CHANGED
@@ -128,6 +128,75 @@ describe Readability do
|
|
128
128
|
end
|
129
129
|
end
|
130
130
|
|
131
|
+
describe "author" do
  # Each example builds a Readability::Document from a minimal HTML page
  # containing exactly one author markup style and checks what
  # Document#author returns for it.

  it "should pick up <meta name='dc.creator'></meta> as an author" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
          <meta name='dc.creator' content='Austin Fonacier' />
        </head>
        <body></body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up readability's recommended author format" do
    # The <cite> is closed with </cite>; the fixture previously closed it
    # with a mismatched </span>.
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
        </head>
        <body>
          <p class="byline author vcard">
            By <cite class="fn">Austin Fonacier</cite>
          </p>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up vcard fn" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head>
        </head>
        <body>
          <div class="author">By</div>
          <div class="author vcard">
            <a class="url fn" href="http://austinlivesinyotests.com/">Austin Fonacier</a>
          </div>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier")
  end

  it "should pick up <a rel='author'>" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head></head>
        <body>
          <a rel="author" href="http://google.com">Danny Banks (rel)</a>
        </body>
      </html>
    HTML
    doc.author.should eql("Danny Banks (rel)")
  end

  it "should pick up <div id='author'>" do
    doc = Readability::Document.new(<<-HTML)
      <html>
        <head></head>
        <body>
          <div id="author">Austin Fonacier (author)</div>
        </body>
      </html>
    HTML
    doc.author.should eql("Austin Fonacier (author)")
  end
end
|
199
|
+
|
131
200
|
describe "score_node" do
|
132
201
|
before do
|
133
202
|
@doc = Readability::Document.new(<<-HTML)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby-readability
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.4
|
4
|
+
version: 0.5.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -12,7 +12,7 @@ authors:
|
|
12
12
|
autorequire:
|
13
13
|
bindir: bin
|
14
14
|
cert_chain: []
|
15
|
-
date: 2012-
|
15
|
+
date: 2012-10-02 00:00:00.000000000 Z
|
16
16
|
dependencies:
|
17
17
|
- !ruby/object:Gem::Dependency
|
18
18
|
name: rspec
|
@@ -105,6 +105,7 @@ files:
|
|
105
105
|
- .document
|
106
106
|
- .gitignore
|
107
107
|
- .rspec
|
108
|
+
- CHANGES.markdown
|
108
109
|
- Gemfile
|
109
110
|
- README.markdown
|
110
111
|
- Rakefile
|
@@ -150,8 +151,26 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
151
|
version: '0'
|
151
152
|
requirements: []
|
152
153
|
rubyforge_project: ruby-readability
|
153
|
-
rubygems_version: 1.8.
|
154
|
+
rubygems_version: 1.8.21
|
154
155
|
signing_key:
|
155
156
|
specification_version: 3
|
156
157
|
summary: Port of arc90's readability project to ruby
|
157
|
-
test_files:
|
158
|
+
test_files:
|
159
|
+
- spec/fixtures/bbc.html
|
160
|
+
- spec/fixtures/cant_read.html
|
161
|
+
- spec/fixtures/images/dim_1416768a.jpg
|
162
|
+
- spec/fixtures/nytimes.html
|
163
|
+
- spec/fixtures/sample.html
|
164
|
+
- spec/fixtures/samples/blogpost_with_links-fragments.rb
|
165
|
+
- spec/fixtures/samples/blogpost_with_links.html
|
166
|
+
- spec/fixtures/samples/channel4-1-fragments.rb
|
167
|
+
- spec/fixtures/samples/channel4-1.html
|
168
|
+
- spec/fixtures/samples/foxnews-india1-fragments.rb
|
169
|
+
- spec/fixtures/samples/foxnews-india1.html
|
170
|
+
- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
|
171
|
+
- spec/fixtures/samples/globemail-ottawa-cuts.html
|
172
|
+
- spec/fixtures/should_not_truncate.txt
|
173
|
+
- spec/fixtures/thesun.html
|
174
|
+
- spec/readability_spec.rb
|
175
|
+
- spec/spec.opts
|
176
|
+
- spec/spec_helper.rb
|