marcosinger-ruby-readability 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ .DS_Store
2
+ .gem
3
+ .bundle
4
+ Gemfile.lock
5
+ pkg/*
6
+ .idea
7
+ .rvmrc
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --colour
2
+ --format s -c
3
+ --debugger
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
# Fetch gems over HTTPS so gem metadata and packages are not transferred in
# plaintext.
source "https://rubygems.org"

# Specify your gem's dependencies in ruby-readability.gemspec

# Gems needed only when running the test suite.
group :test do
  gem "ruby-debug19", "0.11.6", :platform => :ruby_19
  gem "fakeweb", "~> 1.3.0"
end

# Pull the runtime/development dependencies declared in the gemspec.
gemspec
data/README ADDED
@@ -0,0 +1,54 @@
1
+ Ruby Readability
2
+
3
+ Command line:
4
+ (sudo) gem install ruby-readability
5
+
6
+ Bundler:
7
+ gem "ruby-readability", :require => 'readability'
8
+
9
+ Example:
10
+
11
+ require 'rubygems'
12
+ require 'readability'
13
+ require 'open-uri'
14
+
15
+ source = open('http://lab.arc90.com/experiments/readability/').read
16
+ puts Readability::Document.new(source).content
17
+
18
+ Options:
19
+
20
+ You may provide additional options to Readability::Document.new, including:
21
+
22
+ :tags - the base whitelist of tags to sanitize, defaults to %w[div p]
23
+ :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
24
+ :attributes - whitelist of allowed attributes
25
+ :debug - provide debugging output, defaults false
26
+ :encoding - if this page is of a known encoding, you can specify it; if left
27
+ unspecified, the encoding will be guessed (only in Ruby 1.9.x)
28
+ :html_headers - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
29
+ to aid with guessing the HTML encoding
30
+
31
+ Readability comes with a command-line tool for experimentation in bin/readability.
32
+
33
+ Usage: readability [options] URL
34
+ -d, --debug Show debug output
35
+ -i, --images Keep images and links
36
+ -h, --help Show this message
37
+
38
+ Potential issues:
39
+
40
+ * If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
41
+ Version 2.7.8 of libxml2 with the following worked for me:
42
+ gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
43
+
44
+ ===
45
+
46
+ This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
47
+
48
+ This is a ruby port of arc90's readability project
49
+
50
+ http://lab.arc90.com/experiments/readability/
51
+
52
+ Given an HTML document, it pulls out the main body text and cleans it up.
53
+
54
+ Ruby port by starrhorne, libc, and iterationlabs. Original gemification by fizx.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
# Load Bundler's gem tasks and the RSpec rake task class.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

# `rake spec` runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running plain `rake` runs the specs.
task :default => [:spec]
data/bin/readability ADDED
@@ -0,0 +1,40 @@
1
#!/usr/bin/env ruby
# Command-line front end: fetches the document named by the single argument
# and prints the extracted article content to standard output.

# $KCODE only has an effect on interpreters without the Encoding class.
$KCODE = 'u' unless defined?(Encoding)
require 'rubygems'
require 'open-uri'
require 'optparse'
require File.dirname(__FILE__) + '/../lib/readability'

options = { :debug => false, :images => false }
options_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename($0)} [options] URL"

  opts.on("-d", "--debug", "Show debug output") do |v|
    options[:debug] = v
  end

  opts.on("-i", "--images", "Keep images and links") do |i|
    options[:images] = i
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end
options_parser.parse!

# Exactly one positional argument (the document location) is required.
if ARGV.length != 1
  STDERR.puts options_parser
  exit 1
end

# Do not hand the raw argument to Kernel#open: a value starting with "|"
# would be run as a command, and newer Rubies no longer open URLs through it.
# URIs that open-uri knows how to read are fetched via the parsed URI object;
# anything else is treated as a local file path.
location = ARGV.first
uri = begin
  URI.parse(location)
rescue URI::InvalidURIError
  nil
end
text = uri.respond_to?(:read) ? uri.read : File.read(location)

# --images widens the sanitizer whitelist so <img>/<a> and their src/href
# attributes survive, and keeps paragraphs that contain only images.
document_options = { :debug => options[:debug] }
if options[:images]
  document_options[:tags] = %w[div p img a]
  document_options[:attributes] = %w[src href]
  document_options[:remove_empty_nodes] = false
end

puts Readability::Document.new(text, document_options).content
@@ -0,0 +1,402 @@
1
+ require 'rubygems'
2
+ require 'nokogiri'
3
+ require 'guess_html_encoding'
4
+ require 'mini_magick'
5
+
6
+ module Readability
7
+ class Document
8
# Baseline settings; the hash passed to Document.new is merged over these.
DEFAULT_OPTIONS = {
  # Extraction is retried with looser heuristics when the article text is
  # shorter than this many characters.
  :retry_length               => 250,
  # Paragraphs shorter than this many characters are not scored.
  :min_text_length            => 25,
  # Heuristic toggles; each one may be switched off during retries.
  :remove_unlikely_candidates => true,
  :weight_classes             => true,
  :clean_conditionally        => true,
  # Drop <p> elements without text content while sanitizing.
  :remove_empty_nodes         => true,
  # Minimum dimensions and excluded formats used when selecting images.
  :min_image_width            => 130,
  :min_image_height           => 80,
  :ignore_image_format        => %w[gif]
}.freeze
19
+
20
+ attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
21
+
22
# Builds a document from raw HTML and parses it.
#
# input   - String containing the HTML markup.
# options - Hash merged over DEFAULT_OPTIONS. Keys read here: :encoding,
#           :html_headers, :do_not_guess_encoding,
#           :remove_unlikely_candidates, :weight_classes and
#           :clean_conditionally.
def initialize(input, options = {})
  @options = DEFAULT_OPTIONS.merge(options)
  @input = input

  # Detect encoding support by capability instead of matching RUBY_VERSION
  # against "1.9." only, which skipped encoding detection on every later
  # interpreter.
  if @input.respond_to?(:encoding) && !@options[:encoding]
    @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
    @options[:encoding] = @input.encoding.to_s
  end

  # Runs of <br> become paragraph breaks and <font> tags become <span>s
  # before the markup is parsed.
  @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')

  # Per-instance copies of the heuristic switches; #content turns them off
  # one at a time when it retries.
  @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
  @weight_classes = @options[:weight_classes]
  @clean_conditionally = @options[:clean_conditionally]
  @best_candidate_has_image = true
  make_html
end
38
+
39
# Removes script/style nodes, optionally prunes unlikely candidates, converts
# misused divs, then scores the paragraphs and stores the results in
# @candidates and @best_candidate.
def prepare_candidates
  @html.css("script, style").each(&:remove)
  remove_unlikely_candidates! if @remove_unlikely_candidates
  transform_misused_divs_into_paragraphs!

  minimum_length = options[:min_text_length]
  @candidates = score_paragraphs(minimum_length)
  @best_candidate = select_best_candidate(@candidates)
end
47
+
48
# Parses the stored input into @html, passing along the :encoding option.
def make_html
  encoding = @options[:encoding]
  @html = Nokogiri::HTML(@input, nil, encoding)
end
51
+
52
# Collects the URLs of images that pass #imageable?.
#
# content - node to search; only honoured when +reload+ is true, otherwise
#           the best candidate's element is used.
# reload  - true when called again to search the whole document.
#
# Returns an Array of image URL Strings. When the first pass finds nothing
# and was not already searching @html, the whole document is searched.
def images(content=nil, reload=false)
  @best_candidate_has_image = false if reload

  prepare_candidates
  list_images = []
  tested_images = []
  content = @best_candidate[:elem] unless reload

  return list_images if content.nil?
  elements = content.css("img").map(&:attributes)

  elements.each do |element|
    begin
      url = element["src"].value

      # Skip repeats before doing any work, so a duplicated <img> never
      # triggers a second image download.
      if tested_images.include?(url)
        debug("Image was tested: #{url}")
        next
      end
      tested_images.push(url)

      height_attr = element["height"]
      width_attr = element["width"]
      height = height_attr.nil? ? 0 : height_attr.value.to_i
      width = width_attr.nil? ? 0 : width_attr.value.to_i

      # Take the extension from the path only; a query string or fragment
      # ("pic.gif?v=2") would otherwise hide the real format.
      format = File.extname(url[/\A[^?#]*/]).gsub(".", "")
      image = {:width => width, :height => height, :format => format}
      # Without usable width/height attributes the image itself is opened
      # and queried for its properties.
      image = MiniMagick::Image.open(url) if height.zero? || width.zero?

      if imageable?(image)
        list_images << url
      else
        debug("Image descarted: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
      end
    rescue => e
      # Best effort: a broken <img> (no src, unreadable image, ...) is
      # logged and skipped.
      debug("Image error: #{e}")
      next
    end
  end

  (list_images.empty? && content != @html) ? images(@html, true) : list_images
end
91
+
92
# True when the image meets the configured minimum width and height and its
# format is not listed in :ignore_image_format.
#
# image - object answering [:width], [:height] and [:format].
def imageable?(image)
  return false unless image[:width] >= options[:min_image_width]
  return false unless image[:height] >= options[:min_image_height]

  !options[:ignore_image_format].include?(image[:format].downcase)
end
97
+
98
# Patterns shared by the extraction heuristics. Frozen so the shared
# constant cannot be modified at runtime.
REGEXES = {
  # class/id fragments that mark an element as unlikely to hold the article...
  :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
  # ...unless the same string also matches one of these.
  :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
  # class/id fragments that raise or lower an element's weight.
  :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
  :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
  # Opening tags whose presence keeps a <div> from being turned into a <p>.
  :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
  # Two or more consecutive <br> tags.
  :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
  # Opening or closing <font> tags; group 1 captures the optional slash.
  :replaceFontsRe => /<(\/?)font[^>]*>/i,
  :trimRe => /^\s+|\s+$/,
  :normalizeRe => /\s{2,}/,
  :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
  # YouTube/Vimeo URLs over either http or https.
  :videoRe => %r{https?://(www\.)?(youtube|vimeo)\.com}i
}.freeze
111
+
112
# Text of the first <title> element in the parsed document, or nil when the
# document has none.
def title
  title_node = @html.css("title").first
  return nil unless title_node

  title_node.text
end
116
+
117
# Extracts and sanitizes the main article.
#
# remove_unlikely_candidates - pass false to turn off unlikely-candidate
#                              removal for this and later attempts.
#
# Returns the sanitized article produced by #sanitize. When the extracted
# text is shorter than options[:retry_length], one heuristic is switched off
# (unlikely-candidate removal, then class weighting, then conditional
# cleaning), the document is re-parsed and extraction is attempted again.
def content(remove_unlikely_candidates = :default)
  @remove_unlikely_candidates = false if remove_unlikely_candidates == false

  prepare_candidates
  article = get_article(@candidates, @best_candidate)
  cleaned_article = sanitize(article, @candidates, options)

  # Long enough: done.
  return cleaned_article unless article.text.strip.length < options[:retry_length]

  # Too short: relax the next heuristic that is still enabled.
  if @remove_unlikely_candidates
    @remove_unlikely_candidates = false
  elsif @weight_classes
    @weight_classes = false
  elsif @clean_conditionally
    @clean_conditionally = false
  else
    # Every heuristic is already off; return what we have.
    return cleaned_article
  end

  # Start over from freshly parsed markup.
  make_html
  content
end
141
+
142
# Builds the article node: a new <div> holding the best candidate plus any of
# its siblings that look related (preambles, content split up by removed
# ads, and so on).
#
# candidates     - Hash of node => { :content_score, :elem }.
# best_candidate - the winning entry from that Hash.
#
# Returns the new <div> node.
def get_article(candidates, best_candidate)
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
  output = Nokogiri::XML::Node.new('div', @html)
  top_node = best_candidate[:elem]

  top_node.parent.children.each do |sibling|
    scored = candidates[sibling]

    # Keep the winner itself and any sibling that scored well enough.
    keep = (sibling == top_node)
    keep = true if scored && scored[:content_score] >= sibling_score_threshold

    # Paragraphs are also kept when they read like prose: long with few
    # links, or short, link-free and containing a sentence-ending period.
    if sibling.name.downcase == "p"
      link_density = get_link_density(sibling)
      node_content = sibling.text
      node_length = node_content.length

      long_prose = node_length > 80 && link_density < 0.25
      short_sentence = node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
      keep = true if long_prose || short_sentence
    end

    next unless keep

    # Anything that is not already a div or p is renamed to div.
    sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
    output << sibling
  end

  output
end
173
+
174
# Picks the highest-scoring candidate.
#
# candidates - Hash of node => { :content_score, :elem }.
#
# Returns the top entry, or an entry for the document's <body> with score 0
# when there are no candidates. The top five and the winner are logged via
# #debug.
def select_best_candidate(candidates)
  ranked = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }

  debug("Top 5 canidates:")
  ranked.first(5).each do |entry|
    node = entry[:elem]
    debug("Candidate #{node.name}##{node[:id]}.#{node[:class]} with score #{entry[:content_score]}")
  end

  winner = ranked.first
  winner ||= { :elem => @html.css("body").first, :content_score => 0 }

  top = winner[:elem]
  debug("Best candidate #{top.name}##{top[:id]}.#{top[:class]} with score #{winner[:content_score]}")

  winner
end
187
+
188
# Fraction of an element's text that sits inside <a> descendants.
#
# elem - node answering #text and #css.
#
# Returns a Float. An element with no text yields 0.0; dividing 0 by 0.0
# would produce NaN, which then propagates into candidate scores and breaks
# their comparison.
def get_link_density(elem)
  text_length = elem.text.length
  return 0.0 if text_length.zero?

  link_length = elem.css("a").map(&:text).join("").length
  link_length / text_length.to_f
end
193
+
194
# Scores the parents and grandparents of every <p>/<td> with enough text.
#
# min_text_length - paragraphs with fewer characters are skipped.
#
# Returns a Hash of node => { :content_score, :elem }.
def score_paragraphs(min_text_length)
  candidates = {}

  @html.css("p,td").each do |elem|
    parent = elem.parent
    grand_parent = parent.respond_to?(:parent) ? parent.parent : nil
    paragraph = elem.text

    # Too little text to be worth counting.
    next if paragraph.length < min_text_length

    # Ancestors get their base score the first time they are seen.
    candidates[parent] = score_node(parent) unless candidates[parent]
    if grand_parent && !candidates[grand_parent]
      candidates[grand_parent] = score_node(grand_parent)
    end

    # One point for the paragraph, one per comma-separated piece, and up to
    # three more for every 100 characters of text.
    points = 1
    points += paragraph.split(',').length
    points += [(paragraph.length / 100).to_i, 3].min

    # The parent receives the full amount, the grandparent half.
    candidates[parent][:content_score] += points
    candidates[grand_parent][:content_score] += points / 2.0 if grand_parent
  end

  # Scale each score by link density: content with few links is barely
  # affected, link-heavy containers are penalised.
  candidates.each_pair do |node, entry|
    entry[:content_score] = entry[:content_score] * (1 - get_link_density(node))
  end

  candidates
end
223
+
224
# Weight contributed by an element's class and id attributes.
#
# e - object answering [:class] and [:id].
#
# Returns 0 when class weighting is disabled. Otherwise each non-empty
# attribute subtracts 25 when it matches the negative pattern and adds 25
# when it matches the positive pattern; both may apply to the same value.
def class_weight(e)
  weight = 0
  return weight unless @weight_classes

  [:class, :id].each do |attribute|
    value = e[attribute]
    next unless value && value != ""

    weight -= 25 if value =~ REGEXES[:negativeRe]
    weight += 25 if value =~ REGEXES[:positiveRe]
  end

  weight
end
250
+
251
# Initial score for a candidate node: its class/id weight plus an adjustment
# for the tag name (div +5, blockquote +3, form -3, th -5, anything else 0).
#
# Returns a Hash with :content_score and :elem.
def score_node(elem)
  base = class_weight(elem)
  tag_adjustment = { "div" => 5, "blockquote" => 3, "form" => -3, "th" => -5 }
  total = base + tag_adjustment.fetch(elem.name.downcase, 0)

  { :content_score => total, :elem => elem }
end
265
+
266
# Prints +str+ to standard output when the :debug option is set.
def debug(str)
  return unless options[:debug]

  puts str
end
269
+
270
# Removes every element whose combined class+id string matches the
# "unlikely" pattern without also matching the "maybe a candidate" pattern.
# The <body> element is never removed.
def remove_unlikely_candidates!
  @html.css("*").each do |elem|
    signature = "#{elem[:class]}#{elem[:id]}"

    next unless signature =~ REGEXES[:unlikelyCandidatesRe]
    next if signature =~ REGEXES[:okMaybeItsACandidateRe]
    next if elem.name.downcase == 'body'

    debug("Removing unlikely candidate - #{signature}")
    elem.remove
  end
end
279
+
280
# Renames to <p> every <div> whose inner HTML contains none of the opening
# tags listed in REGEXES[:divToPElementsRe].
#
# Other elements are left untouched. (The previous code carried a disabled
# experiment that wrapped bare text nodes in <p> tags; it never ran.)
def transform_misused_divs_into_paragraphs!
  @html.css("*").each do |elem|
    next unless elem.name.downcase == "div"
    next unless elem.inner_html !~ REGEXES[:divToPElementsRe]

    debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p")
    elem.name = "p"
  end
end
299
+
300
# Cleans the article node and serialises it.
#
# node       - article node built by #get_article (modified in place).
# candidates - scored candidates, forwarded to #clean_conditionally.
# options    - accepted for interface compatibility; the instance's
#              @options are what is actually consulted.
#
# Returns the node's HTML as a String with runs of whitespace collapsed.
def sanitize(node, candidates, options = {})
  # Drop headers that are negatively weighted or made up mostly of links.
  node.css("h1, h2, h3, h4, h5, h6").each do |header|
    header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
  end

  node.css("form, object, iframe, embed").each(&:remove)

  if @options[:remove_empty_nodes]
    # Paragraphs without text content go away; this also removes paragraphs
    # that contain only images.
    node.css("p").each do |paragraph|
      paragraph.remove if paragraph.content.strip.empty?
    end
  end

  # Conditionally clean <table>s, <ul>s, and <div>s
  clean_conditionally(node, candidates, "table, ul, div")

  # Tag lookups are done through hashes rather than repeated include? calls.
  permitted_tags = @options[:tags] || %w[div p]
  whitelist = Hash[permitted_tags.map { |tag| [tag, true] }]

  # Block-ish tags that are replaced by their text padded with spaces, so
  # a<br>b keeps a gap between the two words.
  spaced_tags = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
  replace_with_whitespace = Hash[spaced_tags.map { |tag| [tag, true] }]

  allowed_attributes = @options[:attributes]

  ([node] + node.css("*")).each do |el|
    if whitelist[el.node_name]
      # Whitelisted element: strip every attribute that is not allowed.
      el.attributes.each_key do |attr_name|
        el.delete(attr_name) unless allowed_attributes && allowed_attributes.include?(attr_name.to_s)
      end
    else
      # Anything else is replaced by its text content.
      replacement = replace_with_whitespace[el.node_name] ? (' ' << el.text << ' ') : el.text
      el.swap(Nokogiri::XML::Text.new(replacement, el.document))
    end
  end

  # Get rid of duplicate whitespace
  node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ")
end
350
+
351
# Removes descendants of +node+ matching +selector+ that look like
# boilerplate rather than article content.
#
# node       - node whose descendants are inspected (modified in place).
# candidates - Hash of node => { :content_score, :elem }.
# selector   - CSS selector of the elements to consider.
#
# Does nothing when conditional cleaning is disabled.
def clean_conditionally(node, candidates, selector)
  return unless @clean_conditionally

  # Fall back to the default minimum when the option was explicitly set to
  # nil; the previous fallback named a constant that is not defined anywhere
  # in this class and raised NameError.
  min_text_length = options[:min_text_length] || DEFAULT_OPTIONS[:min_text_length]

  node.css(selector).each do |el|
    weight = class_weight(el)
    content_score = candidates[el] ? candidates[el][:content_score] : 0
    name = el.name.downcase

    if weight + content_score < 0
      el.remove
      debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
    elsif el.text.count(",") < 10
      # Few commas: judge the element by the kinds of descendants it holds.
      counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
      # Discount list items so that lists need far more than 100 <li>s
      # before the li-vs-p rule below can fire.
      counts["li"] -= 100

      content_length = el.text.strip.length # Count the text length excluding any surrounding whitespace
      link_density = get_link_density(el)

      # First matching rule supplies the removal reason; nil means keep.
      reason =
        if counts["img"] > counts["p"]
          "too many images"
        elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
          "more <li>s than <p>s"
        elsif counts["input"] > (counts["p"] / 3).to_i
          "less than 3x <p>s than <input>s"
        elsif content_length < min_text_length && (counts["img"] == 0 || counts["img"] > 2)
          "too short a content length without a single image"
        elsif weight < 25 && link_density > 0.2
          "too many links for its weight (#{weight})"
        elsif weight >= 25 && link_density > 0.5
          "too many links for its weight (#{weight})"
        elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
          "<embed>s with too short a content length, or too many <embed>s"
        end

      if reason
        debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
        el.remove
      end
    end
  end
end
400
+
401
+ end
402
+ end