ruby-readability 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/README +9 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/bin/readability +13 -0
- data/lib/readability.rb +295 -0
- data/lib/readability_old.rb +74 -0
- data/spec/fixtures/cant_read.html +426 -0
- data/spec/fixtures/sample.html +1198 -0
- data/spec/fixtures/samples/channel4-1-fragments.rb +14 -0
- data/spec/fixtures/samples/channel4-1.html +1330 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
- data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
- data/spec/fixtures/should_not_truncate.txt +1077 -0
- data/spec/readability_spec.rb +180 -0
- data/spec/spec_helper.rb +10 -0
- metadata +94 -0
data/.document
ADDED
data/.gitignore
ADDED
data/README
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
|
2
|
+
|
3
|
+
This is a Ruby port of arc90's Readability project.
|
4
|
+
|
5
|
+
http://lab.arc90.com/experiments/readability/
|
6
|
+
|
7
|
+
Given an HTML document, it pulls out the main body text and cleans it up.
|
8
|
+
|
9
|
+
Ruby port by starrhorne and iterationlabs. Gemification by fizx.
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging tasks come from Jeweler. When it cannot be loaded we only print
# a hint, so the spec and rdoc tasks below are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "ruby-readability"
    gem.summary = %Q{ruby-readability}
    gem.description = %Q{ruby-readability}
    gem.email = "kmaxwell@twitter.com"
    gem.homepage = "http://github.com/fizx/ruby-readability"
    gem.authors = ["Kyle Maxwell"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks (RSpec 1.x rake integration).
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# Nothing in this Rakefile defines :check_dependencies; it is presumably
# registered by the Jeweler tasks above (TODO confirm). Only depend on it when
# it exists, otherwise `rake spec` aborts with an unknown-task error whenever
# the LoadError branch above was taken.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip so a trailing newline in VERSION does not end up in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "ruby-readability #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/readability
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line front end: reads the document named by the single argument and
# prints the extracted article HTML to standard output.

# $KCODE only has an effect on Ruby 1.8; assigning it on 1.9+ just emits a
# warning, so set it only where it does something.
$KCODE = 'u' if RUBY_VERSION < '1.9'

require 'rubygems'
require 'open-uri'
require File.dirname(__FILE__) + '/../lib/readability'

if ARGV.length != 1
  STDERR.puts "Usage: #{File.basename($0)} URL"
  exit 1
end

source = ARGV.first

# Resolve the argument explicitly instead of passing it to Kernel#open:
# Kernel#open treats a leading "|" as a command to execute, and since Ruby 3.0
# it no longer opens URLs even with open-uri loaded.
begin
  uri = URI.parse(source)
rescue URI::InvalidURIError
  uri = nil
end

# open-uri adds #read to the URI classes it can fetch; anything else (for
# example a plain path) is read from the local file system as before.
text = uri.respond_to?(:read) ? uri.read : File.read(source)
puts Readability::Document.new(text).content
|
data/lib/readability.rb
ADDED
@@ -0,0 +1,295 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Readability
|
5
|
+
class Document
|
6
|
+
# Default for options[:min_text_length]: paragraphs shorter than this are not
# scored, and sanitize uses it as its "too short" cut-off for containers.
TEXT_LENGTH_THRESHOLD = 25
|
7
|
+
# Default for options[:retry_length]: when the extracted article text is
# shorter than this, content re-parses the input and retries without removing
# unlikely candidates.
RETRY_LENGTH = 250
|
8
|
+
|
9
|
+
attr_accessor :options, :html
|
10
|
+
|
11
|
+
# Stores the raw input and the options hash, then parses the input.
#
# input   - the HTML source handed to the parser by make_html.
# options - Hash of settings read by the other methods (for example :debug,
#           :tags, :attributes, :min_text_length, :retry_length).
def initialize(input, options = {})
  @input, @options = input, options
  make_html
end
|
16
|
+
|
17
|
+
# (Re)parses the stored input into @html. content calls this again before its
# retry pass so that it starts from an unmodified document.
def make_html
  encoding = 'UTF-8'
  @html = Nokogiri::HTML(@input, nil, encoding)
end
|
20
|
+
|
21
|
+
# Patterns used while scoring and cleaning the document. Frozen so the shared
# table cannot be modified by accident.
REGEXES = {
  # class/id fragments that mark an element as probably not article content ...
  :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
  # ... unless the same class/id string also matches this pattern.
  :okMaybeItsACandidateRe => /and|article|body|column|main/i,
  # class/id fragments that raise (positive) or lower (negative) class_weight.
  :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
  :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
  # A <div> whose inner HTML contains none of these tags is turned into a <p>.
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # The remaining patterns are not referenced by the methods visible in this file.
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  # A run of <br> tags together with the whitespace or &nbsp; entities that
  # follow each of them.
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}.freeze
|
34
|
+
|
35
|
+
# Extracts the main article from @html and returns it as cleaned HTML.
#
# remove_unlikely_candidates - when true, elements whose class/id look like
#   page chrome are stripped first. If the article found that way is shorter
#   than options[:retry_length] (default RETRY_LENGTH), the input is re-parsed
#   and the extraction is repeated once with this flag set to false.
#
# Returns whatever sanitize returns for the selected article.
def content(remove_unlikely_candidates = true)
  @html.css("script, style").each { |tag| tag.remove }

  remove_unlikely_candidates! if remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  min_length = options[:min_text_length] || TEXT_LENGTH_THRESHOLD
  scored = score_paragraphs(min_length)
  winner = select_best_candidate(scored)
  article = get_article(scored, winner)
  cleaned_article = sanitize(article, scored, options)

  # The length check runs after sanitize, i.e. against the cleaned article.
  too_short = remove_unlikely_candidates &&
              article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
  return cleaned_article unless too_short

  make_html
  content(false)
end
|
52
|
+
|
53
|
+
# Builds the article container from the best candidate and its siblings.
#
# candidates     - Hash of element => { :content_score, :elem }.
# best_candidate - the entry chosen by select_best_candidate.
#
# Siblings are included because related content (preambles, text split up by
# removed ads, ...) often sits next to the top candidate. A sibling is kept if
# it is the top candidate itself, if its own score reaches the threshold, or if
# it is a <p> that looks like prose. Kept siblings are moved into a new <div>,
# which is returned.
def get_article(candidates, best_candidate)
  top_node = best_candidate[:elem]
  threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  top_node.parent.children.each do |sibling|
    scored = candidates[sibling]
    keep = (sibling == top_node)
    keep = true if scored && scored[:content_score] >= threshold

    if sibling.name.downcase == "p"
      density = get_link_density(sibling)
      body = sibling.text
      size = body.length

      # Long paragraphs may carry a few links; short ones must be link-free
      # and contain a full stop followed by a space or a line end.
      keep = true if (size > 80 && density < 0.25) ||
                     (size < 80 && density == 0 && body =~ /\.( |$)/)
    end

    next unless keep

    sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling
  end

  output
end
|
84
|
+
|
85
|
+
# Picks the highest-scoring candidate.
#
# candidates - Hash of element => { :content_score, :elem }.
#
# Logs the five best candidates through debug. When there are no candidates
# at all, the document's <body> with a score of 0 is used instead.
#
# Returns the chosen { :content_score, :elem } Hash.
def select_best_candidate(candidates)
  sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  # One formatter for both log lines so they cannot drift apart.
  describe = lambda do |candidate|
    elem = candidate[:elem]
    "#{elem.name}##{elem[:id]}.#{elem[:class]} with score #{candidate[:content_score]}"
  end

  debug("Top 5 candidates:")
  sorted_candidates[0...5].each do |candidate|
    debug("Candidate #{describe.call(candidate)}")
  end

  best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
  debug("Best candidate #{describe.call(best_candidate)}")

  best_candidate
end
|
98
|
+
|
99
|
+
# Fraction of an element's text that sits inside <a> tags.
#
# elem - any node responding to #text and #css.
#
# Returns a Float: link text length divided by total text length. An element
# without any text yields 0.0 rather than the NaN that 0 / 0.0 would produce,
# so callers can keep comparing and multiplying the result safely.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map { |link| link.text }.join("").length
  link_length / text_length.to_f
end
|
104
|
+
|
105
|
+
# Scores the containers of every sufficiently long <p> and <td>.
#
# min_text_length - paragraphs whose text is shorter than this are ignored.
#
# Each remaining paragraph earns points: 1, plus the number of comma-separated
# segments in its text, plus one per 100 characters (capped at 3). Its parent
# receives the full amount and its grandparent half of it. Finally every
# candidate's total is scaled by (1 - link density).
#
# Returns a Hash of element => { :content_score, :elem }.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    parent_node = elem.parent
    grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
    inner_text = elem.text

    next if inner_text.length < min_text_length

    points = 1 + inner_text.split(',').length + [(inner_text.length / 100).to_i, 3].min

    candidates[parent_node] ||= score_node(parent_node)
    candidates[parent_node][:content_score] += points

    if grand_parent_node
      candidates[grand_parent_node] ||= score_node(grand_parent_node)
      candidates[grand_parent_node][:content_score] += points / 2.0
    end
  end

  # Link-heavy containers are scaled down; containers with few links keep
  # most of their score.
  candidates.each_pair do |elem, candidate|
    candidate[:content_score] *= (1 - get_link_density(elem))
  end

  candidates
end
|
134
|
+
|
135
|
+
# Weighs an element by its class and id attributes.
#
# e - anything indexable with :class and :id.
#
# Each non-empty attribute contributes -25 when it matches
# REGEXES[:negativeRe] and +25 when it matches REGEXES[:positiveRe]; an
# attribute matching both cancels out.
#
# Returns the summed Integer weight.
def class_weight(e)
  [:class, :id].inject(0) do |weight, attribute|
    value = e[attribute]
    if value && value != ""
      weight -= 25 if value =~ REGEXES[:negativeRe]
      weight += 25 if value =~ REGEXES[:positiveRe]
    end
    weight
  end
end
|
159
|
+
|
160
|
+
# Creates the initial candidate entry for an element.
#
# elem - the element to score.
#
# The starting score is class_weight(elem), adjusted by tag name:
# div +5, blockquote +3, form -3, th -5.
#
# Returns { :content_score => score, :elem => elem }.
def score_node(elem)
  content_score = class_weight(elem)

  # `when "x" then` replaces the `when "x":` form, which Ruby 1.9 and later
  # reject as a syntax error.
  case elem.name.downcase
  when "div"        then content_score += 5
  when "blockquote" then content_score += 3
  when "form"       then content_score -= 3
  when "th"         then content_score -= 5
  end

  { :content_score => content_score, :elem => elem }
end
|
174
|
+
|
175
|
+
# Prints str to standard output, but only when options[:debug] is set.
def debug(str)
  return unless options[:debug]

  puts str
end
|
178
|
+
|
179
|
+
# Removes elements whose combined class+id string matches
# REGEXES[:unlikelyCandidatesRe], unless that string also matches
# REGEXES[:okMaybeItsACandidateRe] or the element is the <body>.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    signature = "#{elem[:class]}#{elem[:id]}"

    next unless signature =~ REGEXES[:unlikelyCandidatesRe]
    next if signature =~ REGEXES[:okMaybeItsACandidateRe]
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{signature}")
    elem.remove
  end
end
|
188
|
+
|
189
|
+
# Renames every <div> that contains none of the tags listed in
# REGEXES[:divToPElementsRe] to <p>, so that it takes part in paragraph scoring.
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    # Non-div elements are left alone. Wrapping their bare text nodes in <p>
    # tags is disabled; the original attempt is kept for reference:
    #   elem.children.each do |child|
    #     if child.text?
    #       ## debug("wrapping text node with a p")
    #       child.swap("<p>#{child.text}</p>")
    #     end
    #   end
    next unless elem.name.downcase == "div"

    # Leave divs that still hold block-level children as they are.
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
|
208
|
+
|
209
|
+
# Cleans the extracted article node and serializes it to an HTML string.
#
# node       - element holding the article; it is modified in place.
# candidates - Hash of element => { :content_score, :elem }; used to look up
#              the score of each <table>, <ul> and <div> being considered.
# options    - Hash; only :min_text_length is read from this argument. The tag
#              and attribute whitelists come from @options[:tags] (default
#              div and p) and @options[:attributes].
#
# Returns the cleaned HTML as a String.
def sanitize(node, candidates, options = {})
  # Drop headings that look like chrome or consist mostly of links.
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each do |elem|
    elem.remove
  end

  # remove empty <p> tags
  node.css("p").each do |elem|
    elem.remove if elem.content.strip.empty?
  end

  # Conditionally clean <table>s, <ul>s, and <div>s
  node.css("table, ul, div").each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      # Few commas: apply the structural heuristics below.
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Offset the <li> count so that it only exceeds the <p> count for very
      # long lists.
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)
      to_remove = false
      reason = ""

      if counts["img"] > counts["p"]
        reason = "too many images"
        to_remove = true
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        reason = "more <li>s than <p>s"
        to_remove = true
      elsif counts["input"] > (counts["p"] / 3).to_i
        reason = "less than 3x <p>s than <input>s"
        to_remove = true
      elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
        reason = "too short a content length without a single image"
        to_remove = true
      elsif weight < 25 && link_density > 0.2
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif weight >= 25 && link_density > 0.5
        reason = "too many links for its weight (#{weight})"
        to_remove = true
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        reason = "<embed>s with too short a content length, or too many <embed>s"
        to_remove = true
      end

      if to_remove
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end

  # We'll sanitize all elements using a whitelist
  base_whitelist = @options[:tags] || %w[div p]

  # Use a hash for speed (don't want to make a million calls to include?)
  whitelist = Hash.new
  base_whitelist.each { |tag| whitelist[tag] = true }

  ([node] + node.css("*")).each do |el|
    if whitelist[el.node_name]
      # Whitelisted element: keep it, but drop every attribute that is not
      # explicitly allowed.
      el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
    else
      # Otherwise, replace the element with its text. A Text node is used on
      # purpose: Node#swap parses a String argument as markup, so text such as
      # an escaped "<script>" would be turned back into a real element.
      el.swap(Nokogiri::XML::Text.new(el.text, el.document))
    end
  end

  # Get rid of duplicate whitespace, then turn &nbsp; entities into spaces.
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
end
|
293
|
+
|
294
|
+
end
|
295
|
+
end
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'nokogiri'
|
3
|
+
|
4
|
+
module Readability
|
5
|
+
class Document
|
6
|
+
|
7
|
+
# Parses the input as UTF-8 HTML and remembers the options.
#
# input   - the HTML source handed to the parser.
# options - Hash; sanitize reads :tags and :attributes from it.
def initialize(input, options = {})
  parsed = Nokogiri::HTML(input, nil, 'UTF-8')
  @options = options
  @html = parsed
end
|
11
|
+
|
12
|
+
|
13
|
+
# Returns the sanitized HTML of the highest-scoring parent of a <p> tag.
#
# When the document has no <p> with a parent there is nothing to score; an
# empty String is returned instead of failing on nil.
def content
  # Get all parent elements containing a <p> tag
  @parents = @html.css("p").map { |p| p.parent }.compact.uniq
  return "" if @parents.empty?

  best_parent = @parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0]
  sanitize(best_parent)
end
|
21
|
+
|
22
|
+
# Scores a candidate parent element.
#
# parent - element indexable with :class and :id, responding to #css and #text.
#
# The score is the number of <p> descendants plus the number of commas in the
# text, -50 for each of class/id that looks like comment/meta/footer material,
# and +25 for each of class/id that names a post, entry or article.
#
# Returns the Integer score.
def score(parent)
  penalty     = /(comment|meta|footer|footnote)/i
  class_bonus = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
  id_bonus    = /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i

  css_class = parent[:class]
  dom_id = parent[:id]

  total = parent.css("p").size + parent.text.count(",")
  total -= 50 if css_class =~ penalty
  total += 25 if css_class =~ class_bonus
  total -= 50 if dom_id =~ penalty
  total += 25 if dom_id =~ id_bonus
  total
end
|
41
|
+
|
42
|
+
# Cleans the chosen node and serializes it to an HTML string.
#
# node - the element to clean; it is modified in place.
#
# The tag whitelist comes from @options[:tags] (default div and p) and the
# attribute whitelist from @options[:attributes].
#
# Returns the cleaned HTML as a String.
def sanitize(node)
  # Get rid of divs full of non-text items
  node.css("div").each do |el|
    counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
    el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"])
  end

  # We'll sanitize all elements using a whitelist
  base_whitelist = @options[:tags] || %w[div p]

  # Use a hash for speed (don't want to make a million calls to include?)
  whitelist = {}
  base_whitelist.each { |tag| whitelist[tag] = true }

  ([node] + node.css("*")).each do |el|
    if whitelist[el.node_name]
      # Whitelisted element: keep it, but drop every attribute that is not
      # explicitly allowed.
      el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
    else
      # Otherwise, replace the element with its text. A Text node is used on
      # purpose: Node#swap parses a String argument as markup, so text such as
      # an escaped "<script>" would be turned back into a real element.
      el.swap(Nokogiri::XML::Text.new(el.text, el.document))
    end
  end

  # Get rid of duplicate whitespace, then turn &nbsp; entities into spaces.
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|