pismo 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,316 @@
1
+ # This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
+ #
3
+ # This is a Ruby port of arc90's readability project
4
+ # http://lab.arc90.com/experiments/readability/
5
+ # Given an HTML document, it pulls out the main body text and cleans it up.
6
+ # Ruby port by starrhorne and iterationlabs
7
+ #
8
+ # Original JavaScript version:
9
+ # http://lab.arc90.com/experiments/readability/js/readability.js
10
+ # * Copyright (c) 2009 Arc90 Inc
11
+ # * Readability is licensed under the Apache License, Version 2.0.
12
+
13
+ require 'nokogiri'
14
+
15
module Readability
  # Extracts the main article content from an HTML document.
  #
  # The document is parsed with Nokogiri, obviously irrelevant elements are
  # stripped, the parents/grandparents of paragraph-like nodes are scored, and
  # the best-scoring node (plus related siblings) is returned as sanitized HTML.
  #
  # Recognised keys in the options hash (all optional), as read by the code below:
  #   :min_text_length - minimum paragraph length to be scored (default TEXT_LENGTH_THRESHOLD)
  #   :retry_length    - minimum article text length before retrying without
  #                      unlikely-candidate removal (default RETRY_LENGTH)
  #   :tags            - whitelist of tag names kept in the output (default %w[p])
  #   :attributes      - attribute names preserved on whitelisted tags
  #   :debug           - when truthy, progress messages are written with puts
  class Document
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    REGEXES = {
      :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
      :okMaybeItsACandidateRe => /and|article|body|column|main/i,
      :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
      :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i,
      :trimRe => /^\s+|\s+$/,
      :normalizeRe => /\s{2,}/,
      :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
      :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
    }

    attr_accessor :options, :html

    # input   - the raw HTML handed to Nokogiri.
    # options - hash of tuning options (see the class comment).
    def initialize(input, options = {})
      @input = input
      @options = options
      make_html
    end

    # (Re)parses the original input into @html. Called again by #content when
    # a retry needs a pristine, unmodified document.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Returns the extracted article as an HTML string.
    #
    # remove_unlikely_candidates - when true, elements whose class/id look like
    #   comments, sidebars, footers etc. are removed before scoring. If the
    #   resulting article text is shorter than the retry length, the document is
    #   re-parsed and extraction is run once more with this set to false.
    def content(remove_unlikely_candidates = true)
      @html.css("script, style").each { |i| i.remove }

      remove_unlikely_candidates! if remove_unlikely_candidates
      transform_misused_divs_into_paragraphs!
      candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
      best_candidate = select_best_candidate(candidates)
      article = get_article(candidates, best_candidate)

      cleaned_article = sanitize(article, candidates, options)
      cleaned_article.gsub!(/^\s+\n/, "\n")
      cleaned_article.gsub!(/[\ \t]+/, ' ')
      cleaned_article.gsub!(/^\s+/, '')
      cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')

      too_short = article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
      if remove_unlikely_candidates && too_short
        # The aggressive pass threw away too much; start over from the raw input.
        make_html
        content(false)
      else
        cleaned_article
      end
    end

    # Builds the article node: a fresh <div> holding the best candidate and any
    # of its siblings that look related (preambles, content split by removed
    # ads, etc.).
    #
    # candidates     - hash of node => { :content_score, :elem } from #score_paragraphs.
    # best_candidate - the entry chosen by #select_best_candidate.
    #
    # Appended siblings are moved out of the parsed document into the new <div>.
    def get_article(candidates, best_candidate)
      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
      output = Nokogiri::XML::Node.new('div', @html)

      best_candidate[:elem].parent.children.each do |sibling|
        append = false
        append = true if sibling == best_candidate[:elem]
        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

        if sibling.name.downcase == "p"
          link_density = get_link_density(sibling)
          node_content = sibling.text
          node_length = node_content.length

          if node_length > 80 && link_density < 0.25
            append = true
          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
            # Short, link-free paragraph that ends a sentence: likely real prose.
            append = true
          end
        end

        next unless append

        sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
        output << sibling
      end

      output
    end

    # Returns the candidate entry with the highest :content_score. When there
    # are no candidates, falls back to the document's <body> with a score of 0.
    def select_best_candidate(candidates)
      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

      debug("Top 5 canidates:")
      sorted_candidates[0...5].each do |candidate|
        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
      end

      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")

      best_candidate
    end

    # Returns the fraction (Float) of elem's text that sits inside <a> tags.
    #
    # An element with no text has a density of 0.0; without this guard the
    # division would produce NaN, which poisons score comparisons and sorting.
    def get_link_density(elem)
      text_length = elem.text.length
      return 0.0 if text_length.zero?

      link_length = elem.css("a").map { |i| i.text }.join("").length
      link_length / text_length.to_f
    end

    # Scores the parent and grandparent of every <p>/<td> whose text is at least
    # min_text_length characters long.
    #
    # Returns a hash of node => { :content_score => Numeric, :elem => node }.
    def score_paragraphs(min_text_length)
      candidates = {}

      @html.css("p,td").each do |elem|
        parent_node = elem.parent
        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
        inner_text = elem.text

        # Paragraphs shorter than the threshold don't count at all.
        next if inner_text.length < min_text_length

        candidates[parent_node] ||= score_node(parent_node)
        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

        # One base point, one per comma-separated segment, and up to three
        # more for every 100 characters of text.
        content_score = 1
        content_score += inner_text.split(',').length
        content_score += [(inner_text.length / 100).to_i, 3].min

        candidates[parent_node][:content_score] += content_score
        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
      end

      # Scale the final scores by link density. Good content should have a
      # relatively small link density and be mostly unaffected by this.
      candidates.each do |elem, candidate|
        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
      end

      candidates
    end

    # Returns a weight derived from e's class and id attributes: each attribute
    # contributes -25 if it matches :negativeRe and +25 if it matches
    # :positiveRe (both may apply). Missing or empty attributes contribute 0.
    def class_weight(e)
      weight = 0

      [:class, :id].each do |attribute|
        value = e[attribute]
        next unless value && value != ""

        weight -= 25 if value =~ REGEXES[:negativeRe]
        weight += 25 if value =~ REGEXES[:positiveRe]
      end

      weight
    end

    # Builds the initial candidate entry for elem: its class/id weight adjusted
    # by a per-tag bonus or penalty.
    #
    # Returns { :content_score => Integer, :elem => elem }.
    def score_node(elem)
      content_score = class_weight(elem)

      # NOTE: the trailing-colon form (`when "div":`) only parses on Ruby 1.8.
      case elem.name.downcase
      when "div"
        content_score += 5
      when "blockquote"
        content_score += 3
      when "form"
        content_score -= 3
      when "th"
        content_score -= 5
      end

      { :content_score => content_score, :elem => elem }
    end

    # Prints str when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

    # Removes every element (except <body>) whose combined class+id matches
    # :unlikelyCandidatesRe and does not match :okMaybeItsACandidateRe.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        next unless str =~ REGEXES[:unlikelyCandidatesRe]
        next unless str !~ REGEXES[:okMaybeItsACandidateRe]
        next if elem.name.downcase == 'body'

        debug("Removing unlikely candidate - #{str}")
        elem.remove
      end
    end

    # Renames <div>s that contain no block-level markup (per :divToPElementsRe)
    # to <p> so they are scored like ordinary paragraphs.
    def transform_misused_divs_into_paragraphs!
      @html.css("*").each do |elem|
        next unless elem.name.downcase == "div"
        next unless elem.inner_html !~ REGEXES[:divToPElementsRe]

        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # Cleans the article node and returns it serialized as an HTML string.
    #
    # node       - the article <div> built by #get_article (modified in place).
    # candidates - scoring hash from #score_paragraphs.
    # options    - supplies :min_text_length for the short-content check.
    #
    # NOTE(review): the tag/attribute whitelist below reads @options, not the
    # options parameter, so the two only agree when callers pass the instance
    # options (as #content does).
    def sanitize(node, candidates, options = {})
      node.css("h1, h2, h3, h4, h5, h6").each do |header|
        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
      end

      node.css("form, object, iframe, embed").each do |elem|
        elem.remove
      end

      # Remove empty <p> tags
      node.css("p").each do |elem|
        elem.remove if elem.content.strip.empty?
      end

      # Remove empty <div> tags
      node.css("div").each do |elem|
        elem.remove if elem.content.strip.empty?
      end

      # Conditionally clean <table>s, <ul>s, and <div>s
      node.css("table, ul, div").each do |el|
        weight = class_weight(el)
        content_score = candidates[el] ? candidates[el][:content_score] : 0
        name = el.name.downcase

        if weight + content_score < 0
          el.remove
          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
        elsif el.text.count(",") < 10
          # Few commas suggests this is not prose; inspect what it contains.
          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
          # Offsetting by 100 means only elements with a very large number of
          # list items can trip the "more <li>s than <p>s" rule.
          counts["li"] -= 100

          content_length = el.text.length
          link_density = get_link_density(el)
          to_remove = false
          reason = ""

          if counts["img"] > counts["p"]
            reason = "too many images"
            to_remove = true
          elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
            reason = "more <li>s than <p>s"
            to_remove = true
          elsif counts["input"] > (counts["p"] / 3).to_i
            reason = "less than 3x <p>s than <input>s"
            to_remove = true
          elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
            reason = "too short a content length without a single image"
            to_remove = true
          elsif weight < 25 && link_density > 0.2
            reason = "too many links for its weight (#{weight})"
            to_remove = true
          elsif weight >= 25 && link_density > 0.5
            reason = "too many links for its weight (#{weight})"
            to_remove = true
          elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
            reason = "<embed>s with too short a content length, or too many <embed>s"
            to_remove = true
          end

          if to_remove
            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
            el.remove
          end
        end
      end

      # We'll sanitize all elements using a whitelist
      whitelist = @options[:tags] || %w[p]

      # Use a hash for speed (don't want to make a million calls to include?)
      whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]

      ([node] + node.css("*")).each do |el|
        if whitelist[el.node_name]
          # Whitelisted element: strip every attribute not explicitly allowed.
          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
        else
          # Otherwise, replace the element with its contents.
          # NOTE(review): el.text is unescaped; if swap parses the string as
          # markup, escaped tags in the source could reappear as real elements.
          # Confirm against Nokogiri's Node#swap behaviour before relying on
          # this output being safe for untrusted input.
          el.swap(el.text)
        end
      end

      # Get rid of duplicate whitespace
      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end
  end
end