pismo 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,316 @@
1
+ # This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
2
+ #
3
+ # This is a Ruby port of arc90's readability project
4
+ # http://lab.arc90.com/experiments/readability/
5
+ # Given a html document, it pulls out the main body text and cleans it up.
6
+ # Ruby port by starrhorne and iterationlabs
7
+ #
8
+ # Original JavaScript version:
9
+ # http://lab.arc90.com/experiments/readability/js/readability.js
10
+ # * Copyright (c) 2009 Arc90 Inc
11
+ # * Readability is licensed under the Apache License, Version 2.0.
12
+
13
+ require 'nokogiri'
14
+
15
# Ruby port of arc90's Readability: given an HTML document, pull out the main
# body text and clean it up.
module Readability
  # Wraps one HTML input and extracts its most article-like content.
  #
  # Options read by the methods of this class (all optional):
  #   :min_text_length - minimum paragraph length that gets scored
  #                      (defaults to TEXT_LENGTH_THRESHOLD)
  #   :retry_length    - if the extracted text is shorter than this, extraction
  #                      is retried without removing "unlikely" elements
  #                      (defaults to RETRY_LENGTH)
  #   :tags            - element names kept in the output (defaults to %w[p])
  #   :attributes      - attribute names preserved on kept elements
  #   :debug           - when truthy, progress messages are written with puts
  class Document
    TEXT_LENGTH_THRESHOLD = 25
    RETRY_LENGTH = 250

    # Patterns matched against class/id strings and markup. Only
    # :unlikelyCandidatesRe, :okMaybeItsACandidateRe, :positiveRe, :negativeRe
    # and :divToPElementsRe are referenced inside this class; the others are
    # kept because code outside this file may read them.
    REGEXES = {
      :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
      :okMaybeItsACandidateRe => /and|article|body|column|main/i,
      :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i,
      :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
      :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
      :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
      :replaceFontsRe => /<(\/?)font[^>]*>/i,
      :trimRe => /^\s+|\s+$/,
      :normalizeRe => /\s{2,}/,
      :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
      :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
    }.freeze

    attr_accessor :options, :html

    # input   - the HTML source handed to Nokogiri.
    # options - Hash of the options listed on the class.
    def initialize(input, options = {})
      @input = input
      @options = options
      make_html
    end

    # (Re)parses the original input into @html, discarding any mutations made
    # to the previous tree.
    def make_html
      @html = Nokogiri::HTML(@input, nil, 'UTF-8')
    end

    # Runs the full extraction pipeline and returns the cleaned article markup
    # as a String.
    #
    # remove_unlikely_candidates - when true, elements whose class/id look like
    #   page chrome are dropped before scoring. If the resulting article text is
    #   shorter than the retry length, the document is re-parsed and extraction
    #   is run once more with this flag set to false.
    #
    # Returns "" when the document offers nothing to extract (no scored
    # candidate and no <body> element to fall back on).
    def content(remove_unlikely_candidates = true)
      @html.css("script, style").each { |i| i.remove }

      remove_unlikely_candidates! if remove_unlikely_candidates
      transform_misused_divs_into_paragraphs!
      candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
      best_candidate = select_best_candidate(candidates)

      # Nothing scored and no <body> to fall back on (e.g. empty input).
      return "" if best_candidate[:elem].nil?

      article = get_article(candidates, best_candidate)
      cleaned_article = sanitize(article, candidates, options)

      if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
        # Too little text survived: start over from a fresh parse, this time
        # keeping the "unlikely" elements.
        make_html
        return content(false)
      end

      cleaned_article.gsub!(/^\s+\n/, "\n")
      cleaned_article.gsub!(/[\ \t]+/, ' ')
      cleaned_article.gsub!(/^\s+/, '')
      cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
      cleaned_article
    end

    # Builds a new <div> holding the best candidate plus any of its siblings
    # that also look like content (preambles, content split by removed ads...).
    #
    # candidates     - Hash of node => { :content_score, :elem } from score_paragraphs.
    # best_candidate - the entry chosen by select_best_candidate.
    #
    # Appended siblings are moved out of the document into the returned node.
    def get_article(candidates, best_candidate)
      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
      output = Nokogiri::XML::Node.new('div', @html)

      best_candidate[:elem].parent.children.each do |sibling|
        append = sibling == best_candidate[:elem]
        append ||= candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold

        if !append && sibling.name.downcase == "p"
          link_density = get_link_density(sibling)
          node_content = sibling.text
          node_length = node_content.length

          # Long paragraphs with few links, or short link-free ones that end
          # a sentence, are accepted even without a score of their own.
          if node_length > 80 && link_density < 0.25
            append = true
          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
            append = true
          end
        end

        next unless append

        sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
        output << sibling
      end

      output
    end

    # Returns the candidate entry with the highest :content_score. When there
    # are no candidates, falls back to the <body> element with a score of 0;
    # :elem is nil in that entry if the document has no <body>.
    def select_best_candidate(candidates)
      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

      debug("Top 5 canidates:")
      sorted_candidates[0...5].each do |candidate|
        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
      end

      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
      if best_candidate[:elem]
        debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
      end

      best_candidate
    end

    # Fraction (Float) of elem's text that sits inside <a> descendants.
    # Returns 0.0 for an element without text instead of dividing by zero,
    # which would produce NaN and poison later score comparisons.
    def get_link_density(elem)
      text_length = elem.text.length
      return 0.0 if text_length.zero?

      link_length = elem.css("a").map { |i| i.text }.join("").length
      link_length / text_length.to_f
    end

    # Scores the parent and grandparent of every <p>/<td> whose text is at
    # least min_text_length characters long.
    #
    # Returns a Hash of node => { :content_score => Numeric, :elem => node }.
    def score_paragraphs(min_text_length)
      candidates = {}
      @html.css("p,td").each do |elem|
        parent_node = elem.parent
        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
        inner_text = elem.text

        # Paragraphs shorter than the threshold are not counted at all.
        next if inner_text.length < min_text_length

        candidates[parent_node] ||= score_node(parent_node)
        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node

        # One base point, points for comma-separated segments, and up to
        # three points for length (one per 100 characters).
        content_score = 1
        content_score += inner_text.split(',').length
        content_score += [(inner_text.length / 100).to_i, 3].min

        candidates[parent_node][:content_score] += content_score
        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
      end

      # Scale the final scores by link density. Good content should have a
      # relatively small link density and be mostly unaffected by this.
      candidates.each do |elem, candidate|
        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
      end

      candidates
    end

    # Weight derived from e's class and id attributes: -25 for each attribute
    # matching :negativeRe and +25 for each matching :positiveRe. Missing or
    # empty attributes contribute nothing.
    def class_weight(e)
      weight = 0
      [:class, :id].each do |attr_name|
        value = e[attr_name]
        next if value.nil? || value == ""

        weight -= 25 if value =~ REGEXES[:negativeRe]
        weight += 25 if value =~ REGEXES[:positiveRe]
      end
      weight
    end

    # Initial candidate entry for elem: its class/id weight adjusted by tag
    # name (div +5, blockquote +3, form -3, th -5).
    #
    # Returns { :content_score => Integer, :elem => elem }.
    def score_node(elem)
      content_score = class_weight(elem)
      # Newline-terminated `when` clauses; the colon-terminated form is
      # rejected by Ruby 1.9 and later.
      case elem.name.downcase
      when "div"
        content_score += 5
      when "blockquote"
        content_score += 3
      when "form"
        content_score -= 3
      when "th"
        content_score -= 5
      end
      { :content_score => content_score, :elem => elem }
    end

    # Writes str with puts when the :debug option is set.
    def debug(str)
      puts str if options[:debug]
    end

    # Removes every element (other than <body>) whose combined class and id
    # match :unlikelyCandidatesRe and do not match :okMaybeItsACandidateRe.
    def remove_unlikely_candidates!
      @html.css("*").each do |elem|
        str = "#{elem[:class]}#{elem[:id]}"
        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
          debug("Removing unlikely candidate - #{str}")
          elem.remove
        end
      end
    end

    # Renames to <p> every <div> whose inner HTML contains none of the tags
    # listed in :divToPElementsRe.
    def transform_misused_divs_into_paragraphs!
      @html.css("*").each do |elem|
        next unless elem.name.downcase == "div"
        next if elem.inner_html =~ REGEXES[:divToPElementsRe]

        debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
        elem.name = "p"
      end
    end

    # Cleans the extracted article in place and returns it as an HTML String.
    #
    # node       - the node returned by get_article.
    # candidates - the Hash returned by score_paragraphs.
    # options    - Hash; :min_text_length is read from it. The tag and
    #              attribute whitelists are read from the instance's @options.
    #
    # Steps: drop low-value headers, drop form/object/iframe/embed, drop empty
    # <p>/<div>, conditionally drop table/ul/div, then reduce everything that
    # is not whitelisted to plain text.
    def sanitize(node, candidates, options = {})
      node.css("h1, h2, h3, h4, h5, h6").each do |header|
        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
      end

      node.css("form, object, iframe, embed").each { |elem| elem.remove }

      # Remove empty <p> tags, then empty <div> tags
      node.css("p").each { |elem| elem.remove if elem.content.strip.empty? }
      node.css("div").each { |elem| elem.remove if elem.content.strip.empty? }

      # Conditionally clean <table>s, <ul>s, and <div>s
      node.css("table, ul, div").each do |el|
        weight = class_weight(el)
        content_score = candidates[el] ? candidates[el][:content_score] : 0
        name = el.name.downcase

        if weight + content_score < 0
          el.remove
          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
        elsif el.text.count(",") < 10
          reason = conditional_removal_reason(el, name, weight, options)
          if reason
            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
            el.remove
          end
        end
      end

      # We'll sanitize all elements using a whitelist. A hash is used for
      # speed (don't want to make a million calls to include?)
      whitelist = @options[:tags] || %w[p]
      whitelist = Hash[whitelist.zip([true] * whitelist.size)]
      kept_attributes = @options[:attributes]

      ([node] + node.css("*")).each do |el|
        if whitelist[el.node_name]
          # Whitelisted element: delete every attribute that is not kept.
          el.attributes.each { |a, x| el.delete(a) unless kept_attributes && kept_attributes.include?(a.to_s) }
        elsif el.parent
          # Replace the element with its text as a real text node. Passing
          # the bare string would have Nokogiri parse it as markup, letting
          # entity-escaped tags in the source come back as live elements.
          el.swap(Nokogiri::XML::Text.new(el.text, el.document))
        end
        # A non-whitelisted node without a parent (the wrapper <div> built by
        # get_article) stays as the serialization root: a parentless node
        # cannot be replaced.
      end

      # Get rid of duplicate whitespace
      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
    end

    private

    # Decides whether a table/ul/div should be dropped by sanitize.
    # Returns the reason String used in the debug message, or nil to keep el.
    def conditional_removal_reason(el, name, weight, options)
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Bias so that only very long lists outnumber the paragraphs.
      counts["li"] -= 100

      content_length = el.text.length
      link_density = get_link_density(el)

      if counts["img"] > counts["p"]
        "too many images"
      elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
        "more <li>s than <p>s"
      elsif counts["input"] > (counts["p"] / 3).to_i
        "less than 3x <p>s than <input>s"
      elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
        "too short a content length without a single image"
      elsif weight < 25 && link_density > 0.2
        "too many links for its weight (#{weight})"
      elsif weight >= 25 && link_density > 0.5
        "too many links for its weight (#{weight})"
      elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
        "<embed>s with too short a content length, or too many <embed>s"
      end
    end
  end
end