busk-ruby-readability 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/readability.rb +294 -0
  2. metadata +55 -0
@@ -0,0 +1,294 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+
4
# Extracts the main article content from an HTML page by scoring container
# elements on their text, class/id names and link density.
module Readability
  # Wraps one HTML input and produces a cleaned-up HTML fragment for it.
  #
  # Options read by this class:
  #   :min_text_length - minimum paragraph length to be scored
  #                      (defaults to TEXT_LENGTH_THRESHOLD)
  #   :retry_length    - minimum article text length before retrying without
  #                      unlikely-candidate removal (defaults to RETRY_LENGTH)
  #   :tags            - element names kept by #sanitize (defaults to div, p)
  #   :attributes      - attribute names kept on whitelisted elements
  #   :debug           - when truthy, #debug prints to stdout
  class Document
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    REGEXES = {
      :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
      :okMaybeItsACandidateRe => /and|article|body|column|main/i,
      :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
      :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i,
      :trimRe => /^\s+|\s+$/,
      :normalizeRe => /\s{2,}/,
      :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
      :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
    }

    attr_accessor :options, :html

    # input   - the HTML source to parse.
    # options - Hash of settings (see the class comment).
    def initialize(input, options = {})
      @input = input
      @options = options
      make_html
    end

    # (Re)parses the original input into @html, discarding any mutations made
    # to the previous parse tree.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Runs the extraction pipeline and returns the cleaned article as an HTML
    # string.
    #
    # remove_unlikely_candidates - when true, elements whose class/id look like
    #   page chrome are dropped first. If the resulting article text is shorter
    #   than the retry length, the input is re-parsed and the pipeline runs
    #   once more with this flag off.
    def content(remove_unlikely_candidates = true)
      @html.css("script, style").each { |i| i.remove }

      remove_unlikely_candidates! if remove_unlikely_candidates
      transform_misused_divs_into_paragraphs!
      candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
      best_candidate = select_best_candidate(candidates)
      article = get_article(candidates, best_candidate)

      # sanitize mutates `article` in place, so the length check below sees
      # the cleaned tree.
      cleaned_article = sanitize(article, candidates, options)
      if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
        make_html
        content(false)
      else
        cleaned_article
      end
    end

    # Builds a new <div> holding the best candidate plus any of its siblings
    # that look like related content (preambles, content split by removed ads).
    #
    # candidates     - Hash of element => { :content_score, :elem }.
    # best_candidate - the entry chosen by #select_best_candidate.
    #
    # Returns the new (unattached) div node.
    def get_article(candidates, best_candidate)
      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
      output = Nokogiri::XML::Node.new('div', @html)

      best_candidate[:elem].parent.children.each do |sibling|
        append = sibling == best_candidate[:elem]
        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

        if sibling.name.downcase == "p"
          link_density = get_link_density(sibling)
          node_content = sibling.text
          node_length = node_content.length

          if node_length > 80 && link_density < 0.25
            append = true
          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
            # Short, link-free paragraph that ends a sentence.
            append = true
          end
        end

        next unless append

        sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
        output << sibling
      end

      output
    end

    # Returns the candidate with the highest :content_score, or a zero-score
    # entry for <body> when there are no candidates.
    def select_best_candidate(candidates)
      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

      debug("Top 5 canidates:")
      sorted_candidates[0...5].each do |candidate|
        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
      end

      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")

      best_candidate
    end

    # Returns the fraction (Float) of elem's text that sits inside <a> tags.
    # An element with no text has a density of 0.0; dividing by its zero
    # length would otherwise yield NaN, which breaks score sorting.
    def get_link_density(elem)
      text_length = elem.text.length
      return 0.0 if text_length.zero?

      link_length = elem.css("a").map { |i| i.text }.join("").length
      link_length / text_length.to_f
    end

    # Scores the parent and grandparent of every <p>/<td> whose text is at
    # least min_text_length characters long.
    #
    # Returns a Hash of element => { :content_score, :elem }.
    def score_paragraphs(min_text_length)
      candidates = {}
      @html.css("p,td").each do |elem|
        parent_node = elem.parent
        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
        inner_text = elem.text

        # Paragraphs shorter than the threshold don't count at all.
        next if inner_text.length < min_text_length

        candidates[parent_node] ||= score_node(parent_node)
        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

        # One base point, one per comma-separated segment, and up to three
        # more for every 100 characters of text.
        content_score = 1
        content_score += inner_text.split(',').length
        content_score += [(inner_text.length / 100).to_i, 3].min

        candidates[parent_node][:content_score] += content_score
        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
      end

      # Scale each score by link density: good content has few links and is
      # mostly unaffected.
      candidates.each do |elem, candidate|
        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
      end

      candidates
    end

    # Returns a weight for e based on its class and id attributes: -25 for
    # each one matching :negativeRe and +25 for each one matching :positiveRe.
    def class_weight(e)
      weight = 0
      [:class, :id].each do |attr|
        value = e[attr]
        next unless value && value != ""

        weight -= 25 if value =~ REGEXES[:negativeRe]
        weight += 25 if value =~ REGEXES[:positiveRe]
      end
      weight
    end

    # Returns the initial candidate entry for elem: its class weight adjusted
    # by a per-tag bonus or penalty.
    def score_node(elem)
      content_score = class_weight(elem)
      case elem.name.downcase
      when "div"
        content_score += 5
      when "blockquote"
        content_score += 3
      when "form"
        content_score -= 3
      when "th"
        content_score -= 5
      end
      { :content_score => content_score, :elem => elem }
    end

    # Prints str to stdout when options[:debug] is set.
    def debug(str)
      puts str if options[:debug]
    end

    # Removes every non-body element whose class+id matches
    # :unlikelyCandidatesRe and does not match :okMaybeItsACandidateRe.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        next unless str =~ REGEXES[:unlikelyCandidatesRe]
        next if str =~ REGEXES[:okMaybeItsACandidateRe]
        next if elem.name.downcase == 'body'

        debug("Removing unlikely candidate - #{str}")
        elem.remove
      end
    end

    # Renames <div>s that contain no block-level markup (per
    # :divToPElementsRe) to <p> so they are scored as paragraphs.
    def transform_misused_divs_into_paragraphs!
      @html.css("*").each do |elem|
        next unless elem.name.downcase == "div"
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]

        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # Cleans node in place and returns it serialized as an HTML string.
    #
    # node       - the article node built by #get_article.
    # candidates - Hash of element => { :content_score, :elem }.
    # options    - Hash; :min_text_length is read from here, while :tags and
    #              :attributes are read from the instance's @options.
    def sanitize(node, candidates, options = {})
      node.css("h1, h2, h3, h4, h5, h6").each do |header|
        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
      end

      node.css("form, iframe").each do |elem|
        elem.remove
      end

      # Conditionally clean <table>s, <ul>s, and <div>s
      node.css("table, ul, div").each do |el|
        weight = class_weight(el)
        content_score = candidates[el] ? candidates[el][:content_score] : 0
        name = el.name.downcase

        if weight + content_score < 0
          el.remove
          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
        elsif el.text.count(",") < 10
          reason = conditional_clean_reason(el, weight, options)
          if reason
            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
            el.remove
          end
        end
      end

      # Sanitize all elements using a whitelist. A hash lookup avoids
      # repeated include? scans.
      base_whitelist = @options[:tags] || %w[div p]
      whitelist = {}
      base_whitelist.each { |tag| whitelist[tag] = true }

      ([node] + node.css("*")).each do |el|
        if whitelist[el.node_name]
          # Whitelisted element: strip every attribute not explicitly allowed.
          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
        else
          # Replace the element with its text as a real text node. Swapping in
          # the raw string would re-parse it as HTML, turning escaped markup
          # (e.g. "&lt;script&gt;") into live elements that skip this loop.
          el.swap(Nokogiri::XML::Text.new(el.text, el.document))
        end
      end

      # Get rid of duplicate whitespace
      node.to_html.gsub(/[\r\n\f]+/, "\n").gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end

    private

    # Decides whether a <table>/<ul>/<div> should be dropped by #sanitize.
    #
    # Returns the reason String when el should be removed, nil otherwise.
    def conditional_clean_reason(el, weight, options)
      name = el.name.downcase
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Offset so that list items only outnumber paragraphs in very long lists.
      counts["li"] -= 100

      content_length = el.text.strip.length # text length excluding surrounding whitespace
      link_density = get_link_density(el)
      min_length = options[:min_text_length] || TEXT_LENGTH_THRESHOLD

      if counts["img"] > counts["p"] && counts["p"] > 0
        "too many images #{counts['p']}"
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        "more <li>s than <p>s"
      elsif counts["input"] > (counts["p"] / 3).to_i
        "less than 3x <p>s than <input>s"
      elsif content_length < min_length && (counts["img"] == 0 || counts["img"] > 2)
        "too short a content length without a single image"
      elsif weight < 25 && link_density > 0.2
        "too many links for its weight (#{weight})"
      elsif weight >= 25 && link_density > 0.5
        "too many links for its weight (#{weight})"
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        "<embed>s with too short a content length, or too many <embed>s"
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: busk-ruby-readability
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors: []
7
+
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-09-16 00:00:00 -03:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description:
17
+ email: spiceee@gmail.com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/readability.rb
26
+ has_rdoc: true
27
+ homepage: http://github.com/busk/ruby-readability
28
+ licenses: []
29
+
30
+ post_install_message:
31
+ rdoc_options: []
32
+
33
+ require_paths:
34
+ - lib
35
+ required_ruby_version: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: "0"
40
+ version:
41
+ required_rubygems_version: !ruby/object:Gem::Requirement
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ version: "0"
46
+ version:
47
+ requirements: []
48
+
49
+ rubyforge_project:
50
+ rubygems_version: 1.3.5
51
+ signing_key:
52
+ specification_version: 3
53
+ summary: A rewrite of original ruby-readability
54
+ test_files: []
55
+