ruby-readability 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/README ADDED
@@ -0,0 +1,9 @@
1
+ This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
+
3
+ This is a Ruby port of arc90's Readability project:
4
+
5
+ http://lab.arc90.com/experiments/readability/
6
+
7
+ Given an HTML document, it extracts the main body text and cleans it up.
8
+
9
+ Ruby port by starrhorne and iterationlabs. Gemification by fizx.
data/Rakefile ADDED
@@ -0,0 +1,45 @@
1
# Build, test and documentation tasks for the ruby-readability gem.
require 'rubygems'
require 'rake'

# Packaging/release tasks. Jeweler is optional: when it cannot be loaded we
# only print a hint and carry on so the remaining tasks stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "ruby-readability"
    gem.summary = %Q{ruby-readability}
    gem.description = %Q{ruby-readability}
    gem.email = "kmaxwell@twitter.com"
    gem.homepage = "http://github.com/fizx/ruby-readability"
    gem.authors = ["Kyle Maxwell"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Spec tasks (RSpec 1.x rake integration).
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Same specs, run under rcov for coverage.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# check_dependencies is not defined in this file (presumably it comes from the
# Jeweler tasks above). Only make it a prerequisite when it actually exists,
# otherwise `rake spec` would abort on an unknown task when Jeweler is missing.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

# API documentation, titled with the contents of VERSION when that file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "ruby-readability #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/bin/readability ADDED
@@ -0,0 +1,13 @@
1
#!/usr/bin/env ruby
# Command-line front end: reads the document named by the single argument
# (an http/https/ftp URL, or a local file path) and prints the extracted
# article markup to standard output.

# $KCODE only has an effect on Ruby 1.8; later interpreters ignore it and warn.
$KCODE = 'u' if RUBY_VERSION < '1.9'
require 'rubygems'
require 'open-uri'
require File.dirname(__FILE__) + '/../lib/readability'

if ARGV.length != 1
  STDERR.puts "Usage: #{File.basename($0)} URL"
  exit 1
end

source = ARGV.first

# Never hand a command-line argument to Kernel#open: it would run a
# subprocess for arguments starting with "|". URLs go through open-uri's
# URI#open, everything else is treated as a plain file.
text =
  if source =~ %r{\A(https?|ftp)://}i
    URI.parse(source).open { |io| io.read }
  else
    File.open(source) { |io| io.read }
  end

puts Readability::Document.new(text).content
@@ -0,0 +1,295 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
module Readability
  # Pulls the main article markup out of an HTML document: paragraphs are
  # scored, the best-scoring container is selected, related siblings are
  # gathered around it, and the result is stripped down to whitelisted tags.
  #
  # Recognised options (all optional):
  #   :min_text_length - minimum paragraph length to be scored (default TEXT_LENGTH_THRESHOLD)
  #   :retry_length    - minimum article length before retrying without the
  #                      "unlikely candidate" pruning (default RETRY_LENGTH)
  #   :tags            - tag names kept by #sanitize (default %w[div p])
  #   :attributes      - attribute names kept on whitelisted tags (default none)
  #   :debug           - when truthy, progress messages are printed with puts
  class Document
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    attr_accessor :options, :html

    # input   - HTML source string.
    # options - Hash of options, see the class comment.
    def initialize(input, options = {})
      @input = input
      @options = options
      make_html
    end

    # (Re)parses the original input into @html, discarding any modifications
    # made to the previous tree.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    REGEXES = {
      :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
      :okMaybeItsACandidateRe => /and|article|body|column|main/i,
      :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
      :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i,
      :trimRe => /^\s+|\s+$/,
      :normalizeRe => /\s{2,}/,
      :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
      :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
    }

    # Returns the cleaned article markup as a String.
    #
    # remove_unlikely_candidates - when true, elements whose class/id look like
    #   page chrome are dropped first; if the resulting article is shorter than
    #   :retry_length the document is re-parsed and processed again without
    #   that pruning.
    #
    # Returns "" when the document offers nothing to extract (no scored
    # candidate and no <body>).
    def content(remove_unlikely_candidates = true)
      @html.css("script, style").each { |node| node.remove }

      remove_unlikely_candidates! if remove_unlikely_candidates
      transform_misused_divs_into_paragraphs!
      candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
      best_candidate = select_best_candidate(candidates)

      article = best_candidate[:elem] && get_article(candidates, best_candidate)
      cleaned_article = article ? sanitize(article, candidates, options) : ""

      # Length is measured after sanitizing, on what is left of the article.
      too_short = article.nil? || article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)

      if remove_unlikely_candidates && too_short
        make_html
        content(false)
      else
        cleaned_article
      end
    end

    # Builds a fresh <div> holding the best candidate plus those of its
    # siblings that look related (preambles, content split by removed ads, ...).
    #
    # candidates     - Hash of node => { :content_score, :elem } from #score_paragraphs.
    # best_candidate - the entry chosen by #select_best_candidate.
    def get_article(candidates, best_candidate)
      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
      output = Nokogiri::XML::Node.new('div', @html)

      best_candidate[:elem].parent.children.each do |sibling|
        append = sibling == best_candidate[:elem]
        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

        if sibling.name.downcase == "p"
          link_density = get_link_density(sibling)
          node_content = sibling.text
          node_length = node_content.length

          if node_length > 80 && link_density < 0.25
            append = true
          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
            append = true
          end
        end

        next unless append

        sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
        output << sibling
      end

      output
    end

    # Returns the highest-scoring candidate. With no candidates at all it falls
    # back to the <body> element with a score of 0; :elem is nil when the
    # document has no <body> either.
    def select_best_candidate(candidates)
      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

      debug("Top 5 candidates:")
      sorted_candidates[0...5].each do |candidate|
        debug("Candidate #{candidate_label(candidate)} with score #{candidate[:content_score]}")
      end

      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
      debug("Best candidate #{candidate_label(best_candidate)} with score #{best_candidate[:content_score]}")

      best_candidate
    end

    # Fraction (Float) of elem's text that sits inside <a> descendants.
    # Returns 0.0 for elements without any text instead of NaN.
    def get_link_density(elem)
      text_length = elem.text.length
      return 0.0 if text_length.zero?

      link_length = elem.css("a").map { |link| link.text }.join("").length
      link_length / text_length.to_f
    end

    # Scores the parent and grandparent of every <p>/<td> whose text is at
    # least min_text_length characters long, then scales each score by
    # (1 - link density).
    #
    # Returns a Hash of node => { :content_score, :elem }.
    def score_paragraphs(min_text_length)
      candidates = {}

      @html.css("p,td").each do |elem|
        parent_node = elem.parent
        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
        inner_text = elem.text

        # Paragraphs below the minimum length are not counted at all.
        next if inner_text.length < min_text_length

        candidates[parent_node] ||= score_node(parent_node)
        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

        # One base point, one per comma-separated chunk, and up to three more
        # for every 100 characters of text.
        content_score = 1
        content_score += inner_text.split(',').length
        content_score += [(inner_text.length / 100).to_i, 3].min

        candidates[parent_node][:content_score] += content_score
        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
      end

      # Good content should have a relatively small link density and be mostly
      # unaffected by this scaling.
      candidates.each do |elem, candidate|
        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
      end

      candidates
    end

    # Returns a weight built from e's class and id attributes: each attribute
    # contributes -25 when it matches negativeRe and +25 when it matches
    # positiveRe (both may apply to the same attribute).
    def class_weight(e)
      weight = 0

      [e[:class], e[:id]].each do |value|
        next if value.nil? || value == ""

        weight -= 25 if value =~ REGEXES[:negativeRe]
        weight += 25 if value =~ REGEXES[:positiveRe]
      end

      weight
    end

    # Builds the initial candidate entry for elem: its class/id weight plus a
    # bonus or penalty that depends on the tag name.
    def score_node(elem)
      content_score = class_weight(elem)

      case elem.name.downcase
      when "div"        then content_score += 5
      when "blockquote" then content_score += 3
      when "form"       then content_score -= 3
      when "th"         then content_score -= 5
      end

      { :content_score => content_score, :elem => elem }
    end

    # Prints str when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

    # Removes every element (except <body>) whose class+id matches
    # unlikelyCandidatesRe and does not match okMaybeItsACandidateRe.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        next unless str =~ REGEXES[:unlikelyCandidatesRe]
        next if str =~ REGEXES[:okMaybeItsACandidateRe]
        next if elem.name.downcase == 'body'

        debug("Removing unlikely candidate - #{str}")
        elem.remove
      end
    end

    # Renames <div>s that contain no other block-level markup to <p> so that
    # they take part in paragraph scoring.
    def transform_misused_divs_into_paragraphs!
      @html.css("div").each do |elem|
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]

        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # Cleans node in place and returns its HTML as a String.
    #
    # node       - the article node built by #get_article.
    # candidates - Hash from #score_paragraphs, used to look up content scores.
    # options    - Hash; :min_text_length is read from it. The tag and
    #              attribute whitelists are read from the instance options.
    def sanitize(node, candidates, options = {})
      node.css("h1, h2, h3, h4, h5, h6").each do |header|
        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
      end

      node.css("form, object, iframe, embed").each { |elem| elem.remove }

      # Remove empty <p> tags.
      node.css("p").each { |elem| elem.remove if elem.content.strip.empty? }

      # Conditionally clean <table>s, <ul>s, and <div>s.
      node.css("table, ul, div").each do |el|
        weight = class_weight(el)
        content_score = candidates[el] ? candidates[el][:content_score] : 0
        name = el.name.downcase
        label = "#{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score}"

        if weight + content_score < 0
          el.remove
          debug("Conditionally cleaned #{label} because score + content score was less than zero.")
        elsif el.text.count(",") < 10
          reason = conditional_clean_reason(el, name, weight, options)
          if reason
            debug("Conditionally cleaned #{label} because it has #{reason}.")
            el.remove
          end
        end
      end

      # Whitelist pass. A hash gives constant-time tag lookups.
      whitelist = {}
      (@options[:tags] || %w[div p]).each { |tag| whitelist[tag] = true }
      kept_attributes = @options[:attributes]

      ([node] + node.css("*")).each do |el|
        if whitelist[el.node_name]
          # Whitelisted element: strip every attribute that is not explicitly kept.
          el.attributes.each do |attr_name, _attr|
            el.delete(attr_name) unless kept_attributes && kept_attributes.include?(attr_name.to_s)
          end
        else
          # Anything else is replaced by its text. A real text node is used so
          # that the text is escaped on output instead of being re-parsed as
          # markup (which would let escaped tags slip past the whitelist).
          el.swap(Nokogiri::XML::Text.new(el.text, el.document))
        end
      end

      # Get rid of duplicate whitespace.
      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end

    private

    # "tag#id.class" description of a candidate for debug output; tolerates a
    # candidate without an element.
    def candidate_label(candidate)
      elem = candidate[:elem]
      return "(none)" unless elem

      "#{elem.name}##{elem[:id]}.#{elem[:class]}"
    end

    # Returns the reason (String) why el should be dropped by the conditional
    # cleaning step of #sanitize, or nil when it should stay.
    def conditional_clean_reason(el, name, weight, options)
      counts = %w[p img li a embed input].inject({}) { |memo, kind| memo[kind] = el.css(kind).length; memo }
      # The <li> count is offset by 100 before it is compared with the <p> count.
      counts["li"] -= 100

      content_length = el.text.strip.length # text length excluding surrounding whitespace
      link_density = get_link_density(el)
      min_length = options[:min_text_length] || TEXT_LENGTH_THRESHOLD

      if counts["img"] > counts["p"]
        "too many images"
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        "more <li>s than <p>s"
      elsif counts["input"] > (counts["p"] / 3).to_i
        "less than 3x <p>s than <input>s"
      elsif content_length < min_length && (counts["img"] == 0 || counts["img"] > 2)
        "too short a content length without a single image"
      elsif weight < 25 && link_density > 0.2
        "too many links for its weight (#{weight})"
      elsif weight >= 25 && link_density > 0.5
        "too many links for its weight (#{weight})"
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        "<embed>s with too short a content length, or too many <embed>s"
      end
    end
  end
end
@@ -0,0 +1,74 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
module Readability
  # Simple extractor: picks the parent of <p> elements with the best score and
  # returns its markup reduced to a whitelist of tags.
  #
  # Recognised options (both optional):
  #   :tags       - tag names kept by #sanitize (default %w[div p])
  #   :attributes - attribute names kept on whitelisted tags (default none)
  class Document

    # input   - HTML source string.
    # options - Hash of options, see the class comment.
    def initialize(input, options = {})
      @options = options
      @html = Nokogiri::HTML(input, nil, 'UTF-8')
    end

    # Returns the sanitized markup of the best-scoring <p> parent as a String,
    # or "" when the document contains no <p> element to start from.
    def content
      # Every distinct element that directly contains a <p>.
      @parents = @html.css("p").map { |p| p.parent }.compact.uniq
      return "" if @parents.empty?

      scored = @parents.map { |parent| [parent, score(parent)] }
      best_parent = scored.max { |a, b| a[1] <=> b[1] }.first

      sanitize(best_parent)
    end

    # Returns an Integer score for parent: class/id hints plus the number of
    # <p> descendants and the number of commas in its text.
    def score(parent)
      s = 0

      # Adjust score based on parent's "class" attribute
      s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i
      s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i

      # Adjust score based on parent id
      s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i
      s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i

      # Adjust score based on # of <p> elements inside parent
      s += parent.css("p").size

      # Adjust score based on # of commas inside parent
      s += parent.text.count(",")

      s
    end

    # Cleans node in place and returns its HTML as a String.
    def sanitize(node)
      # Get rid of divs full of non-text items (only divs with few commas are considered).
      node.css("div").each do |el|
        next unless el.text.count(",") < 10

        counts = %w[p img li a embed].inject({}) { |memo, kind| memo[kind] = el.css(kind).length; memo }
        paragraphs = counts["p"]
        non_text = paragraphs == 0 ||
                   counts["embed"] > 0 ||
                   counts["a"] > paragraphs ||
                   counts["li"] > paragraphs ||
                   counts["img"] > paragraphs
        el.remove if non_text
      end

      # Whitelist pass. A hash gives constant-time tag lookups.
      tags = @options[:tags] || %w[div p]
      whitelist = Hash[ tags.zip([true] * tags.size) ]
      kept_attributes = @options[:attributes]

      ([node] + node.css("*")).each do |el|
        if whitelist[el.node_name]
          # Whitelisted element: strip every attribute that is not explicitly kept.
          el.attributes.each do |attr_name, _attr|
            el.delete(attr_name) unless kept_attributes && kept_attributes.include?(attr_name.to_s)
          end
        else
          # Anything else is replaced by its text. A real text node is used so
          # that the text is escaped on output instead of being re-parsed as
          # markup (which would let escaped tags slip past the whitelist).
          el.swap(Nokogiri::XML::Text.new(el.text, el.document))
        end
      end

      # Get rid of duplicate whitespace
      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end

  end
end