marcosinger-ruby-readability 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ .DS_Store
2
+ .gem
3
+ .bundle
4
+ Gemfile.lock
5
+ pkg/*
6
+ .idea
7
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --colour
2
+ --format s -c
3
+ --debugger
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in ruby-readability.gemspec
4
+
5
+ group :test do
6
+ gem "ruby-debug19", "0.11.6", :platform => :ruby_19
7
+ gem "fakeweb", "~> 1.3.0"
8
+ end
9
+
10
+ gemspec
data/README ADDED
@@ -0,0 +1,54 @@
1
+ Ruby Readability
2
+
3
+ Command line:
4
+ (sudo) gem install ruby-readability
5
+
6
+ Bundler:
7
+ gem "ruby-readability", :require => 'readability'
8
+
9
+ Example:
10
+
11
+ require 'rubygems'
12
+ require 'readability'
13
+ require 'open-uri'
14
+
15
+ source = open('http://lab.arc90.com/experiments/readability/').read
16
+ puts Readability::Document.new(source).content
17
+
18
+ Options:
19
+
20
+ You may provide additional options to Readability::Document.new, including:
21
+
22
+ :tags - the base whitelist of tags to sanitize, defaults to %w[div p]
23
+ :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
24
+ :attributes - whitelist of allowed attributes
25
+ :debug - provide debugging output, defaults false
26
+ :encoding - if this page is of a known encoding, you can specify it; if left
27
+ unspecified, the encoding will be guessed (only in Ruby 1.9.x)
28
+ :html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
29
+ to aid with guessing the HTML encoding
30
+
31
+ Readability comes with a command-line tool for experimentation in bin/readability.
32
+
33
+ Usage: readability [options] URL
34
+ -d, --debug Show debug output
35
+ -i, --images Keep images and links
36
+ -h, --help Show this message
37
+
38
+ Potential issues:
39
+
40
+ * If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
41
+ Version 2.7.8 of libxml2 with the following worked for me:
42
+ gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
43
+
44
+ ===
45
+
46
+ This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
47
+
48
+ This is a ruby port of arc90's readability project
49
+
50
+ http://lab.arc90.com/experiments/readability/
51
+
52
+ Given an HTML document, it pulls out the main body text and cleans it up.
53
+
54
+ Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/readability ADDED
@@ -0,0 +1,40 @@
1
#!/usr/bin/env ruby
# Command-line front end: fetches one document and prints the extracted
# article produced by Readability::Document#content.

# $KCODE only has an effect before Ruby 1.9; later versions just warn.
$KCODE = 'u' if RUBY_VERSION < '1.9'
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.dirname(__FILE__) + '/../lib/readability'

options = { :debug => false, :images => false }
options_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options] URL"

  opts.on("-d", "--debug", "Show debug output") do |v|
    options[:debug] = v
  end

  opts.on("-i", "--images", "Keep images and links") do |i|
    options[:images] = i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
options_parser.parse!

if ARGV.length != 1
  STDERR.puts options_parser
  exit 1
end

# Never hand the raw argument to Kernel#open: a value starting with "|"
# would be executed as a shell command. Remote documents go through
# open-uri's URI#open; anything else is read as a local file.
source = ARGV.first
text =
  if source =~ %r{\A(?:https?|ftp)://}i
    URI.parse(source).open { |io| io.read }
  else
    File.read(source)
  end

if options[:images]
  # Keep <img>/<a> together with their src/href attributes.
  puts Readability::Document.new(text, :tags => %w[div p img a],
                                       :attributes => %w[src href],
                                       :remove_empty_nodes => false,
                                       :debug => options[:debug]).content
else
  puts Readability::Document.new(text, :debug => options[:debug]).content
end
@@ -0,0 +1,402 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'guess_html_encoding'
4
+ require 'mini_magick'
5
+
6
+ module Readability
7
+ class Document
8
# Defaults that #initialize merges underneath the caller's options.
#
# :retry_length       - #content retries with looser heuristics when the
#                       extracted text is shorter than this.
# :min_text_length    - minimum text length used by #score_paragraphs and
#                       #clean_conditionally.
# :remove_unlikely_candidates, :weight_classes, :clean_conditionally
#                     - initial state of the three heuristics #content can
#                       switch off one by one.
# :remove_empty_nodes - whether #sanitize drops <p> elements with no text.
# :min_image_width, :min_image_height, :ignore_image_format
#                     - thresholds checked by #imageable?.
#
# The nested array is frozen as well: Hash#merge copies only the outer hash,
# so an unfrozen array would be shared (and mutable) across every document.
DEFAULT_OPTIONS = {
  :retry_length               => 250,
  :min_text_length            => 25,
  :remove_unlikely_candidates => true,
  :weight_classes             => true,
  :clean_conditionally        => true,
  :remove_empty_nodes         => true,
  :min_image_width            => 130,
  :min_image_height           => 80,
  :ignore_image_format        => ["gif"].freeze
}.freeze
19
+
20
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
21
+
22
# Builds a document from raw HTML.
#
# input   - the HTML source as a String.
# options - Hash merged over DEFAULT_OPTIONS. Besides the defaults, this
#           method reads :encoding, :html_headers and :do_not_guess_encoding.
#
# When no :encoding is given, the input is run through
# GuessHtmlEncoding.encode (unless :do_not_guess_encoding is set) and the
# resulting string encoding is recorded in @options[:encoding].
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Capability check instead of matching RUBY_VERSION against 1.9 only:
  # every Ruby with String#encoding can guess, not just the 1.9 series.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    unless @options[:do_not_guess_encoding]
      @input = GuessHtmlEncoding.encode(@input, @options[:html_headers])
    end
    @options[:encoding] = @input.encoding.to_s
  end

  # Runs of <br> become paragraph breaks and <font> tags become <span>s
  # before parsing.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')

  # Working copies of the heuristics; #content turns them off one at a time.
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  make_html
end
38
+
39
# Strips <script>/<style>, applies the pre-scoring clean-ups, then stores the
# scored candidates in @candidates and the winner in @best_candidate.
def prepare_candidates
  @html.css("script, style").each(&:remove)

  if @remove_unlikely_candidates
    remove_unlikely_candidates!
  end
  transform_misused_divs_into_paragraphs!

  min_length = options[:min_text_length]
  @candidates = score_paragraphs(min_length)
  @best_candidate = select_best_candidate(@candidates)
end
47
+
48
# (Re)parses @input into @html using the encoding stored in @options.
def make_html
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)
end
51
+
52
# Returns the URLs of the images that pass #imageable?.
#
# content - node to search; only honoured when reload is true, otherwise the
#           best candidate element is used.
# reload  - true when retrying against the whole document.
#
# Images lacking a width or height attribute are opened with MiniMagick to
# read their dimensions. If nothing qualifies inside the searched node, the
# search is repeated once against the entire document (@html).
def images(content = nil, reload = false)
  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images = []
  tested_images = []
  content = @best_candidate[:elem] unless reload

  return list_images if content.nil?

  content.css("img").map(&:attributes).each do |element|
    begin
      src = element["src"]
      if src.nil?
        # Handle the missing attribute explicitly rather than through a
        # rescued NoMethodError.
        debug("Image error: img element has no src attribute")
        next
      end
      url = src.value

      # De-duplicate BEFORE touching the network so a repeated image is
      # never downloaded a second time just to be thrown away.
      if tested_images.include?(url)
        debug("Image was tested: #{url}")
        next
      end
      tested_images.push(url)

      height = element["height"].nil? ? 0 : element["height"].value.to_i
      width = element["width"].nil? ? 0 : element["width"].value.to_i
      format = File.extname(url).gsub(".", "")
      image = { :width => width, :height => height, :format => format }
      image = MiniMagick::Image.open(url) if height.zero? || width.zero?

      if imageable?(image)
        list_images << url
      else
        debug("Image descarted: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
      end
    rescue => e
      # Best effort: one unreadable image must not abort the whole scan.
      debug("Image error: #{e}")
      next
    end
  end

  (list_images.empty? && content != @html) ? images(@html, true) : list_images
end
91
+
92
# True when the image meets the configured minimum width and height and its
# format (compared case-insensitively) is not in :ignore_image_format.
# `image` must answer #[] for :width, :height and :format.
def imageable?(image)
  return false unless image[:width] >= options[:min_image_width]
  return false unless image[:height] >= options[:min_image_height]

  ignored_formats = options[:ignore_image_format]
  !ignored_formats.include?(image[:format].downcase)
end
97
+
98
# Patterns shared by the extraction heuristics. Frozen, like
# DEFAULT_OPTIONS, so the shared table cannot be altered at runtime.
REGEXES = {
  # class/id fragments marking elements to drop before scoring ...
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  # ... unless the same class/id string also matches this.
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # class/id fragments that raise / lower an element's weight in #class_weight.
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # A <div> whose inner HTML matches this is left as a <div>.
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Two or more consecutive <br>s (rewritten to a paragraph break on input).
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  # Opening/closing <font> tags (rewritten to <span> on input).
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
}.freeze
111
+
112
# Text of the document's first <title> element, or nil when there is none.
def title
  node = @html.css("title").first
  node && node.text
end
116
+
117
# Extracts the article and returns it as sanitized HTML (a String).
#
# Passing false turns off the unlikely-candidate removal for this and later
# calls. When the extracted text is shorter than options[:retry_length], one
# heuristic is switched off (unlikely-candidate removal, then class
# weighting, then conditional cleaning), the input is re-parsed and the
# extraction runs again; once all three are off the short result is returned.
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)
  cleaned_article = sanitize(article, @candidates, options)

  long_enough = article.text.strip.length >= options[:retry_length]
  return cleaned_article if long_enough

  # Relax exactly one heuristic per retry.
  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # nothing left to relax
    return cleaned_article
  end

  make_html
  content
end
141
+
142
# Assembles the article into a fresh <div>: the best candidate plus those of
# its siblings that look related (preambles, content split apart by removed
# ads, and so on).
#
# A sibling is included when it is the best candidate itself, when its
# candidate score reaches the threshold (the larger of 10 and 20% of the
# best score), or when it is a <p> that is either long with few links or
# short, link-free and containing a sentence-ending period. Included nodes
# other than <div>/<p> are renamed to <div>.
def get_article(candidates, best_candidate)
  top_node = best_candidate[:elem]
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)

  top_node.parent.children.each do |sibling|
    scored = candidates[sibling]
    append = (sibling == top_node) ||
             (scored && scored[:content_score] >= sibling_score_threshold)

    if sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      long_prose = node_length > 80 && link_density < 0.25
      short_sentence = node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
      append = true if long_prose || short_sentence
    end

    next unless append

    sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling
  end

  output
end
173
+
174
# Returns the candidate hash with the highest :content_score, logging the
# top five through #debug. With no candidates at all, falls back to the
# document's first <body> element with a score of 0.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 canidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first
  winner ||= { :elem => @html.css("body").first, :content_score => 0 }

  node = winner[:elem]
  debug("Best candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{winner[:content_score]}")

  winner
end
187
+
188
# Fraction (Float) of elem's text length that sits inside <a> descendants.
# An element with no text has a density of 0.0; plain division would give
# NaN (0 / 0.0) there, and NaN breaks ordering and equality comparisons.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
193
+
194
# Scores the parents and grandparents of every <p>/<td> whose text is at
# least min_text_length characters long.
#
# Each qualifying element earns 1 point, plus one per comma-separated
# segment, plus up to 3 for every 100 characters. The parent receives the
# full amount and the grandparent half. Finally every candidate's score is
# scaled by (1 - link density).
#
# Returns a Hash of node => { :content_score, :elem }.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    parent_node = elem.parent
    grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
    inner_text = elem.text

    # Too little text to be worth counting.
    next if inner_text.length < min_text_length

    candidates[parent_node] ||= score_node(parent_node)
    if grand_parent_node
      candidates[grand_parent_node] ||= score_node(grand_parent_node)
    end

    comma_points = inner_text.split(',').length
    length_points = [(inner_text.length / 100).to_i, 3].min
    content_score = 1 + comma_points + length_points

    candidates[parent_node][:content_score] += content_score
    if grand_parent_node
      candidates[grand_parent_node][:content_score] += content_score / 2.0
    end
  end

  # Good content should have a relatively small link density (5% or less)
  # and be mostly unaffected by this scaling.
  candidates.each_pair do |node, candidate|
    candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(node))
  end

  candidates
end
223
+
224
# Weight derived from an element's class and id attributes: each non-empty
# attribute contributes -25 when it matches REGEXES[:negativeRe] and +25
# when it matches REGEXES[:positiveRe]. Always 0 while class weighting is
# switched off.
def class_weight(e)
  return 0 unless @weight_classes

  [:class, :id].inject(0) do |weight, attribute|
    value = e[attribute]
    next weight unless value && value != ""

    weight -= 25 if value =~ REGEXES[:negativeRe]
    weight += 25 if value =~ REGEXES[:positiveRe]
    weight
  end
end
250
+
251
# Builds the initial candidate record for elem: its class weight adjusted
# by a per-tag bonus (div +5, blockquote +3, form -3, th -5, others 0).
def score_node(elem)
  tag_bonus = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  base = class_weight(elem)
  content_score = base + tag_bonus.fetch(elem.name.downcase, 0)

  { :content_score => content_score, :elem => elem }
end
265
+
266
# Prints str to standard output, but only when options[:debug] is set.
def debug(str)
  return unless options[:debug]

  puts str
end
269
+
270
# Removes every element (other than <body>) whose concatenated class+id
# matches the "unlikely" pattern without also matching the "maybe" pattern.
def remove_unlikely_candidates!
  unlikely = REGEXES[:unlikelyCandidatesRe]
  maybe = REGEXES[:okMaybeItsACandidateRe]

  @html.css("*").each do |elem|
    str = "#{elem[:class]}#{elem[:id]}"
    next unless str =~ unlikely
    next if str =~ maybe
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{str}")
    elem.remove
  end
end
279
+
280
# Renames to <p> every <div> whose inner HTML contains none of the tags
# listed in REGEXES[:divToPElementsRe]. Other elements are left untouched
# (the original carried a disabled experiment that wrapped bare text nodes
# in <p> tags).
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next if elem.inner_html =~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
299
+
300
# Cleans the extracted article in place and returns it as an HTML String.
#
# Steps: drop headers with negative class weight or link density above
# 0.33; drop form/object/iframe/embed; optionally drop text-less <p>s (which
# also removes <p>s holding only images); run the conditional clean on
# tables, lists and divs; then reduce everything to the tag whitelist.
# Whitelisted elements keep only whitelisted attributes, every other element
# is replaced by its text (padded with spaces for block-like tags so that
# a<br>b keeps a gap). Finally runs of whitespace are collapsed.
#
# The `options` parameter is accepted but not read; settings come from
# @options.
def sanitize(node, candidates, options = {})
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    node.css("p").each do |paragraph|
      paragraph.remove if paragraph.content.strip.empty?
    end
  end

  clean_conditionally(node, candidates, "table, ul, div")

  # Lookup hashes instead of repeated Array#include? calls.
  whitelist = {}
  (@options[:tags] || %w[div p]).each { |tag| whitelist[tag] = true }

  replace_with_whitespace = {}
  %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center].each do |tag|
    replace_with_whitespace[tag] = true
  end

  allowed_attributes = @options[:attributes]

  ([node] + node.css("*")).each do |el|
    tag = el.node_name

    if whitelist[tag]
      el.attributes.each_key do |attr_name|
        keep = allowed_attributes && allowed_attributes.include?(attr_name.to_s)
        el.delete(attr_name) unless keep
      end
    else
      replacement = replace_with_whitespace[tag] ? " #{el.text} " : el.text
      el.swap(Nokogiri::XML::Text.new(replacement, el.document))
    end
  end

  # Get rid of duplicate whitespace
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ")
end
350
+
351
# Removes elements matching `selector` under `node` that look like
# boilerplate. Does nothing while conditional cleaning is switched off.
#
# An element goes immediately when its class weight plus candidate score is
# negative. Otherwise, if its text has fewer than 10 commas, it is removed
# on the first rule that applies: more images than paragraphs; more list
# items than paragraphs (outside ul/ol; the li count is offset by -100);
# too many inputs; text shorter than the minimum length with zero or more
# than two images; too many links for its weight; or embed-heavy content.
#
# node       - root node to search.
# candidates - Hash of node => { :content_score => ... }.
# selector   - CSS selector of the elements to inspect.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  # The fallback used to reference an undefined TEXT_LENGTH_THRESHOLD
  # constant (NameError whenever :min_text_length was nil); use the
  # documented default instead.
  min_length = options[:min_text_length] || DEFAULT_OPTIONS[:min_text_length]

  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)

      reason =
        if counts["img"] > counts["p"]
          "too many images"
        elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
          "more <li>s than <p>s"
        elsif counts["input"] > (counts["p"] / 3).to_i
          "less than 3x <p>s than <input>s"
        elsif content_length < min_length && (counts["img"] == 0 || counts["img"] > 2)
          "too short a content length without a single image"
        elsif weight < 25 && link_density > 0.2
          "too many links for its weight (#{weight})"
        elsif weight >= 25 && link_density > 0.5
          "too many links for its weight (#{weight})"
        elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
          "<embed>s with too short a content length, or too many <embed>s"
        end

      if reason
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end
400
+
401
+ end
402
+ end