RubyGems - marcosinger-ruby-readability - Versions diffs - 0.6.0 - Mend

marcosinger-ruby-readability 0.6.0

Files changed (29) hide show

data/.document +5 -0
data/.gitignore +7 -0
data/.rspec +3 -0
data/Gemfile +10 -0
data/README +54 -0
data/Rakefile +6 -0
data/bin/readability +40 -0
data/lib/readability.rb +402 -0
data/lib/ruby-readability.rb +1 -0
data/ruby-readability.gemspec +24 -0
data/spec/fixtures/bbc.html +2069 -0
data/spec/fixtures/cant_read.html +426 -0
data/spec/fixtures/images/dim_1416768a.jpg +0 -0
data/spec/fixtures/nytimes.html +58 -0
data/spec/fixtures/sample.html +1198 -0
data/spec/fixtures/samples/blogpost_with_links-fragments.rb +10 -0
data/spec/fixtures/samples/blogpost_with_links.html +137 -0
data/spec/fixtures/samples/channel4-1-fragments.rb +13 -0
data/spec/fixtures/samples/channel4-1.html +1330 -0
data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
data/spec/fixtures/samples/foxnews-india1.html +2058 -0
data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
data/spec/fixtures/should_not_truncate.txt +1077 -0
data/spec/fixtures/thesun.html +1122 -0
data/spec/readability_spec.rb +330 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +11 -0
metadata +176 -0

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+README.rdoc
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED Viewed

@@ -0,0 +1,7 @@
+.DS_Store
+.gem
+.bundle
+Gemfile.lock
+pkg/*
+.idea
+.rvmrc

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--colour
+--format s -c
+--debugger

data/Gemfile ADDED Viewed

@@ -0,0 +1,10 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in ruby-readability.gemspec
+group :test do
+  gem "ruby-debug19", "0.11.6", :platform => :ruby_19
+  gem "fakeweb",      "~> 1.3.0"
+end
+gemspec

data/README ADDED Viewed

@@ -0,0 +1,54 @@
+Ruby Readability
+Command line:
+  (sudo) gem install ruby-readability
+Bundler:
+  gem "ruby-readability", :require => 'readability'
+Example:
+  require 'rubygems'
+  require 'readability'
+  require 'open-uri'
+  source = open('http://lab.arc90.com/experiments/readability/').read
+  puts Readability::Document.new(source).content
+Options:
+  You may provide additional options to Readability::Document.new, including:
+    :tags               - the base whitelist of tags to sanitize, defaults to %w[div p]
+    :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
+    :attributes         - whitelist of allowed attributes
+    :debug              - provide debugging output, defaults false
+    :encoding           - if this page is of a known encoding, you can specify it; if left
+                          unspecified, the encoding will be guessed (only in Ruby 1.9.x)
+    :html_headers       - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
+                          to aid with guessing the HTML encoding
+Readability comes with a command-line tool for experimentation in bin/readability.
+  Usage: readability [options] URL
+      -d, --debug                      Show debug output
+      -i, --images                     Keep images and links
+      -h, --help                       Show this message
+Potential issues:
+* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
+  Version 2.7.8 of libxml2 with the following worked for me:
+  gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
+===
+This code is under the Apache License 2.0.  http://www.apache.org/licenses/LICENSE-2.0
+This is a ruby port of arc90's readability project
+http://lab.arc90.com/experiments/readability/
+Given an HTML document, it pulls out the main body text and cleans it up.
+Ruby port by starrhorne, libc, and iterationlabs.  Original gemification by fizx.

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/readability ADDED Viewed

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+$KCODE='u'
+require 'rubygems'
+require 'open-uri'
+require 'optparse'
+require File.dirname(__FILE__) + '/../lib/readability'
+options = { :debug => false, :images => false }
+options_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename($0)} [options] URL"
+  opts.on("-d", "--debug", "Show debug output") do |v|
+    options[:debug] = v
+  end
+  opts.on("-i", "--images", "Keep images and links") do |i|
+    options[:images] = i
+  end
+  opts.on_tail("-h", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+end
+options_parser.parse!
+if ARGV.length != 1
+  STDERR.puts options_parser
+  exit 1
+end
+text = open(ARGV.first).read
+if options[:images]
+  puts Readability::Document.new(text, :tags => %w[div p img a],
+                                       :attributes => %w[src href],
+                                       :remove_empty_nodes => false,
+                                       :debug => options[:debug]).content
+else
+  puts Readability::Document.new(text, :debug => options[:debug]).content
+end

data/lib/readability.rb ADDED Viewed

@@ -0,0 +1,402 @@
+require 'rubygems'
+require 'nokogiri'
+require 'guess_html_encoding'
+require 'mini_magick'
+module Readability
+  class Document
+    DEFAULT_OPTIONS = {
+      :retry_length               => 250,
+      :min_text_length            => 25,
+      :remove_unlikely_candidates => true,
+      :weight_classes             => true,
+      :clean_conditionally        => true,
+      :remove_empty_nodes         => true,
+      :min_image_width            => 130,
+      :min_image_height           => 80,
+      :ignore_image_format        => ["gif"]
+    }.freeze
+    attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
+    def initialize(input, options = {})
+      @options = DEFAULT_OPTIONS.merge(options)
+      @input = input
+      if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding]
+        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+        @options[:encoding] = @input.encoding.to_s
+      end
+      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
+      @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
+      @weight_classes = @options[:weight_classes]
+      @clean_conditionally = @options[:clean_conditionally]
+      @best_candidate_has_image = true
+      make_html
+    end
+    def prepare_candidates
+      @html.css("script, style").each { |i| i.remove }
+      remove_unlikely_candidates! if @remove_unlikely_candidates
+      transform_misused_divs_into_paragraphs!
+      @candidates     = score_paragraphs(options[:min_text_length])
+      @best_candidate = select_best_candidate(@candidates)
+    end
+    def make_html
+      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
+    end
+    # Collects the URLs of images that pass #imageable?.
+    #
+    # content - node to search; only honoured when +reload+ is true, otherwise the
+    #           best candidate element is searched.
+    # reload  - true when the method re-invokes itself on the whole document
+    #           because the best candidate produced no usable image.
+    #
+    # Returns an Array of URL Strings (possibly empty).
+    def images(content=nil, reload=false)
+      @best_candidate_has_image = false if reload
+      prepare_candidates
+      list_images   = []
+      tested_images = []
+      content       = @best_candidate[:elem] unless reload
+      return list_images if content.nil?
+      content.css("img").map(&:attributes).each do |element|
+        begin
+          url = element["src"].value
+          # Skip repeats before touching the network: fetching the same image
+          # again only to discard it afterwards is wasted work.
+          if tested_images.include?(url)
+            debug("Image was tested: #{url}")
+            next
+          end
+          height = element["height"].nil? ? 0 : element["height"].value.to_i
+          width  = element["width"].nil?  ? 0 : element["width"].value.to_i
+          format = File.extname(url).gsub(".", "")
+          image  = {:width => width, :height => height, :format => format}
+          # Dimensions missing from the markup: load the image to measure it.
+          image  = MiniMagick::Image.open(url) if height.zero? || width.zero?
+          # Only mark the URL as tested once it was loaded without error, so a
+          # failed fetch can still be retried by a later duplicate.
+          tested_images.push(url)
+          if imageable?(image)
+            list_images << url
+          else
+            debug("Image descarted: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
+          end
+        rescue => e
+          # Best effort: a missing src or an unreadable image must not abort the scan.
+          debug("Image error: #{e}")
+        end
+      end
+      # Nothing usable in the best candidate: fall back to the whole document once.
+      (list_images.empty? && content != @html) ? images(@html, true) : list_images
+    end
+    def imageable?(image)
+      image[:width] >= options[:min_image_width] and
+      image[:height] >= options[:min_image_height] and not
+      options[:ignore_image_format].include?(image[:format].downcase)
+    end
+    REGEXES = {
+        :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+        :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
+        :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+        :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
+        :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+        :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
+        :replaceFontsRe => /<(\/?)font[^>]*>/i,
+        :trimRe => /^\s+|\s+$/,
+        :normalizeRe => /\s{2,}/,
+        :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
+        :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
+    }
+    def title
+      title = @html.css("title").first
+      title ? title.text : nil
+    end
+    def content(remove_unlikely_candidates = :default)
+      @remove_unlikely_candidates = false if remove_unlikely_candidates == false
+      prepare_candidates
+      article = get_article(@candidates, @best_candidate)
+      cleaned_article = sanitize(article, @candidates, options)
+      if article.text.strip.length < options[:retry_length]
+        if @remove_unlikely_candidates
+          @remove_unlikely_candidates = false
+        elsif @weight_classes
+          @weight_classes = false
+        elsif @clean_conditionally
+          @clean_conditionally = false
+        else
+          # nothing we can do
+          return cleaned_article
+        end
+        make_html
+        content
+      else
+        cleaned_article
+      end
+    end
+    def get_article(candidates, best_candidate)
+      # Now that we have the top candidate, look through its siblings for content that might also be related.
+      # Things like preambles, content split by ads that we removed, etc.
+      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      output = Nokogiri::XML::Node.new('div', @html)
+      best_candidate[:elem].parent.children.each do |sibling|
+        append = false
+        append = true if sibling == best_candidate[:elem]
+        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
+        if sibling.name.downcase == "p"
+          link_density = get_link_density(sibling)
+          node_content = sibling.text
+          node_length = node_content.length
+          if node_length > 80 && link_density < 0.25
+            append = true
+          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
+            append = true
+          end
+        end
+        if append
+          sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
+          output << sibling
+        end
+      end
+      output
+    end
+    def select_best_candidate(candidates)
+      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
+      debug("Top 5 canidates:")
+      sorted_candidates[0...5].each do |candidate|
+        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
+      end
+      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
+      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
+      best_candidate
+    end
+    def get_link_density(elem)
+      link_length = elem.css("a").map(&:text).join("").length
+      text_length = elem.text.length
+      link_length / text_length.to_f
+    end
+    def score_paragraphs(min_text_length)
+      candidates = {}
+      @html.css("p,td").each do |elem|
+        parent_node = elem.parent
+        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
+        inner_text = elem.text
+        # If this paragraph is less than 25 characters, don't even count it.
+        next if inner_text.length < min_text_length
+        candidates[parent_node] ||= score_node(parent_node)
+        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
+        content_score = 1
+        content_score += inner_text.split(',').length
+        content_score += [(inner_text.length / 100).to_i, 3].min
+        candidates[parent_node][:content_score] += content_score
+        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
+      end
+      # Scale the final candidates score based on link density. Good content should have a
+      # relatively small link density (5% or less) and be mostly unaffected by this operation.
+      candidates.each do |elem, candidate|
+        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
+      end
+      candidates
+    end
+    def class_weight(e)
+      weight = 0
+      return weight unless @weight_classes
+      if e[:class] && e[:class] != ""
+        if e[:class] =~ REGEXES[:negativeRe]
+          weight -= 25
+        end
+        if e[:class] =~ REGEXES[:positiveRe]
+          weight += 25
+        end
+      end
+      if e[:id] && e[:id] != ""
+        if e[:id] =~ REGEXES[:negativeRe]
+          weight -= 25
+        end
+        if e[:id] =~ REGEXES[:positiveRe]
+          weight += 25
+        end
+      end
+      weight
+    end
+    def score_node(elem)
+      content_score = class_weight(elem)
+      case elem.name.downcase
+        when "div"
+          content_score += 5
+        when "blockquote"
+          content_score += 3
+        when "form"
+          content_score -= 3
+        when "th"
+          content_score -= 5
+      end
+      { :content_score => content_score, :elem => elem }
+    end
+    def debug(str)
+      puts str if options[:debug]
+    end
+    def remove_unlikely_candidates!
+      @html.css("*").each do |elem|
+        str = "#{elem[:class]}#{elem[:id]}"
+        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
+          debug("Removing unlikely candidate - #{str}")
+          elem.remove
+        end
+      end
+    end
+    def transform_misused_divs_into_paragraphs!
+      @html.css("*").each do |elem|
+        if elem.name.downcase == "div"
+          # transform <div>s that do not contain other block elements into <p>s
+          if elem.inner_html !~ REGEXES[:divToPElementsRe]
+            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
+            elem.name = "p"
+          end
+        else
+          # wrap text nodes in p tags
+#          elem.children.each do |child|
+#            if child.text?
+#              debug("wrapping text node with a p")
+#              child.swap("<p>#{child.text}</p>")
+#            end
+#          end
+        end
+      end
+    end
+    def sanitize(node, candidates, options = {})
+      node.css("h1, h2, h3, h4, h5, h6").each do |header|
+        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
+      end
+      node.css("form, object, iframe, embed").each do |elem|
+        elem.remove
+      end
+      if @options[:remove_empty_nodes]
+        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+        node.css("p").each do |elem|
+          elem.remove if elem.content.strip.empty?
+        end
+      end
+      # Conditionally clean <table>s, <ul>s, and <div>s
+      clean_conditionally(node, candidates, "table, ul, div")
+      # We'll sanitize all elements using a whitelist
+      base_whitelist = @options[:tags] || %w[div p]
+      # We'll add whitespace instead of block elements,
+      # so a<br>b will have a nice space between them
+      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
+      # Use a hash for speed (don't want to make a million calls to include?)
+      whitelist = Hash.new
+      base_whitelist.each {|tag| whitelist[tag] = true }
+      replace_with_whitespace = Hash.new
+      base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
+      ([node] + node.css("*")).each do |el|
+        # If element is in whitelist, delete all its attributes
+        if whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+          # Otherwise, replace the element with its contents
+        else
+          if replace_with_whitespace[el.node_name]
+            el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
+          else
+            el.swap(Nokogiri::XML::Text.new(el.text, el.document))
+          end
+        end
+      end
+      # Get rid of duplicate whitespace
+      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t  ]+/, " ")
+    end
+    def clean_conditionally(node, candidates, selector)
+      return unless @clean_conditionally
+      node.css(selector).each do |el|
+        weight = class_weight(el)
+        content_score = candidates[el] ? candidates[el][:content_score] : 0
+        name = el.name.downcase
+        if weight + content_score < 0
+          el.remove
+          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+        elsif el.text.count(",") < 10
+          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
+          counts["li"] -= 100
+          content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
+          link_density = get_link_density(el)
+          to_remove = false
+          reason = ""
+          if counts["img"] > counts["p"]
+            reason = "too many images"
+            to_remove = true
+          elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
+            reason = "more <li>s than <p>s"
+            to_remove = true
+          elsif counts["input"] > (counts["p"] / 3).to_i
+            reason = "less than 3x <p>s than <input>s"
+            to_remove = true
+          elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
+            reason = "too short a content length without a single image"
+            to_remove = true
+          elsif weight < 25 && link_density > 0.2
+            reason = "too many links for its weight (#{weight})"
+            to_remove = true
+          elsif weight >= 25 && link_density > 0.5
+            reason = "too many links for its weight (#{weight})"
+            to_remove = true
+          elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
+            reason = "<embed>s with too short a content length, or too many <embed>s"
+            to_remove = true
+          end
+          if to_remove
+            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
+            el.remove
+          end
+        end
+      end
+    end
+  end
+end