RubyGems - marcosinger-ruby-readability - Versions diffs - 0.6.0 - Mend

marcosinger-ruby-readability 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

data/.document +5 -0
data/.gitignore +7 -0
data/.rspec +3 -0
data/Gemfile +10 -0
data/README +54 -0
data/Rakefile +6 -0
data/bin/readability +40 -0
data/lib/readability.rb +402 -0
data/lib/ruby-readability.rb +1 -0
data/ruby-readability.gemspec +24 -0
data/spec/fixtures/bbc.html +2069 -0
data/spec/fixtures/cant_read.html +426 -0
data/spec/fixtures/images/dim_1416768a.jpg +0 -0
data/spec/fixtures/nytimes.html +58 -0
data/spec/fixtures/sample.html +1198 -0
data/spec/fixtures/samples/blogpost_with_links-fragments.rb +10 -0
data/spec/fixtures/samples/blogpost_with_links.html +137 -0
data/spec/fixtures/samples/channel4-1-fragments.rb +13 -0
data/spec/fixtures/samples/channel4-1.html +1330 -0
data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
data/spec/fixtures/samples/foxnews-india1.html +2058 -0
data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
data/spec/fixtures/should_not_truncate.txt +1077 -0
data/spec/fixtures/thesun.html +1122 -0
data/spec/readability_spec.rb +330 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +11 -0
metadata +176 -0

data/.document ADDED Viewed

@@ -0,0 +1,5 @@
+README.rdoc
+lib/**/*.rb
+bin/*
+features/**/*.feature
+LICENSE

data/.gitignore ADDED Viewed

@@ -0,0 +1,7 @@
+.DS_Store
+.gem
+.bundle
+Gemfile.lock
+pkg/*
+.idea
+.rvmrc

data/.rspec ADDED Viewed

@@ -0,0 +1,3 @@
+--colour
+--format s -c
+--debugger

data/Gemfile ADDED Viewed

@@ -0,0 +1,10 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in ruby-readability.gemspec
+group :test do
+  gem "ruby-debug19", "0.11.6", :platform => :ruby_19
+  gem "fakeweb",      "~> 1.3.0"
+end
+gemspec

data/README ADDED Viewed

@@ -0,0 +1,54 @@
+Ruby Readability
+Command line:
+  (sudo) gem install ruby-readability
+Bundler:
+  gem "ruby-readability", :require => 'readability'
+Example:
+  require 'rubygems'
+  require 'readability'
+  require 'open-uri'
+  source = open('http://lab.arc90.com/experiments/readability/').read
+  puts Readability::Document.new(source).content
+Options:
+  You may provide additional options to Readability::Document.new, including:
+    :tags               - the base whitelist of tags to sanitize, defaults to %w[div p]
+    :remove_empty_nodes - remove <p> tags that have no text content; also removes p tags that contain only images
+    :attributes         - whitelist of allowed attributes
+    :debug              - provide debugging output, defaults false
+    :encoding           - if this page is of a known encoding, you can specify it; if left
+                          unspecified, the encoding will be guessed (only in Ruby 1.9.x)
+    :html_headers       - in Ruby 1.9.x these will be passed to the guess_html_encoding gem
+                          to aid with guessing the HTML encoding
+Readability comes with a command-line tool for experimentation in bin/readability.
+  Usage: readability [options] URL
+      -d, --debug                      Show debug output
+      -i, --images                     Keep images and links
+      -h, --help                       Show this message
+Potential issues:
+* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.
+  Version 2.7.8 of libxml2 with the following worked for me:
+  gem install nokogiri -- --with-xml2-include=/usr/local/Cellar/libxml2/2.7.8/include/libxml2 --with-xml2-lib=/usr/local/Cellar/libxml2/2.7.8/lib --with-xslt-dir=/usr/local/Cellar/libxslt/1.1.26
+===
+This code is under the Apache License 2.0.  http://www.apache.org/licenses/LICENSE-2.0
+This is a ruby port of arc90's readability project
+http://lab.arc90.com/experiments/readability/
+Given an HTML document, it pulls out the main body text and cleans it up.
+Ruby port by starrhorne, libc, and iterationlabs.  Original gemification by fizx.

data/Rakefile ADDED Viewed

@@ -0,0 +1,6 @@
+require "bundler/gem_tasks"
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+task :default => :spec

data/bin/readability ADDED Viewed

@@ -0,0 +1,40 @@
+#!/usr/bin/env ruby
+$KCODE='u'
+require 'rubygems'
+require 'open-uri'
+require 'optparse'
+require File.dirname(__FILE__) + '/../lib/readability'
+options = { :debug => false, :images => false }
+options_parser = OptionParser.new do |opts|
+  opts.banner = "Usage: #{File.basename($0)} [options] URL"
+  opts.on("-d", "--debug", "Show debug output") do |v|
+    options[:debug] = v
+  end
+  opts.on("-i", "--images", "Keep images and links") do |i|
+    options[:images] = i
+  end
+  opts.on_tail("-h", "--help", "Show this message") do
+    puts opts
+    exit
+  end
+end
+options_parser.parse!
+if ARGV.length != 1
+  STDERR.puts options_parser
+  exit 1
+end
+text = open(ARGV.first).read
+if options[:images]
+  puts Readability::Document.new(text, :tags => %w[div p img a],
+                                       :attributes => %w[src href],
+                                       :remove_empty_nodes => false,
+                                       :debug => options[:debug]).content
+else
+  puts Readability::Document.new(text, :debug => options[:debug]).content
+end

data/lib/readability.rb ADDED Viewed

@@ -0,0 +1,402 @@
+require 'rubygems'
+require 'nokogiri'
+require 'guess_html_encoding'
+require 'mini_magick'
+module Readability
+  class Document
+    DEFAULT_OPTIONS = {
+      :retry_length               => 250,
+      :min_text_length            => 25,
+      :remove_unlikely_candidates => true,
+      :weight_classes             => true,
+      :clean_conditionally        => true,
+      :remove_empty_nodes         => true,
+      :min_image_width            => 130,
+      :min_image_height           => 80,
+      :ignore_image_format        => ["gif"]
+    }.freeze
+    attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
+    def initialize(input, options = {})
+      @options = DEFAULT_OPTIONS.merge(options)
+      @input = input
+      if RUBY_VERSION =~ /^1\.9\./ && !@options[:encoding]
+        @input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
+        @options[:encoding] = @input.encoding.to_s
+      end
+      @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
+      @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
+      @weight_classes = @options[:weight_classes]
+      @clean_conditionally = @options[:clean_conditionally]
+      @best_candidate_has_image = true
+      make_html
+    end
+    def prepare_candidates
+      @html.css("script, style").each { |i| i.remove }
+      remove_unlikely_candidates! if @remove_unlikely_candidates
+      transform_misused_divs_into_paragraphs!
+      @candidates     = score_paragraphs(options[:min_text_length])
+      @best_candidate = select_best_candidate(@candidates)
+    end
+    def make_html
+      @html = Nokogiri::HTML(@input, nil, @options[:encoding])
+    end
+    # Collects the URLs of images that pass #imageable?.
+    #
+    # content - node to search. Only honoured when reload is true; otherwise the
+    #           best candidate element is searched.
+    # reload  - true on the internal retry against the whole document.
+    #
+    # When the first pass finds nothing and was not already looking at the whole
+    # document, the search is repeated on @html. Returns an Array of URL strings.
+    def images(content=nil, reload=false)
+      @best_candidate_has_image = false if reload
+      prepare_candidates
+      list_images   = []
+      tested_images = []
+      content       = @best_candidate[:elem] unless reload
+      return list_images if content.nil?
+      content.css("img").map(&:attributes).each do |element|
+        begin
+          url = element["src"].value
+          # Skip duplicates before anything else: measuring an image may mean
+          # opening it by URL, so the same URL should never be opened twice.
+          if tested_images.include?(url)
+            debug("Image was tested: #{url}")
+            next
+          end
+          tested_images.push(url)
+          height = element["height"].nil? ? 0 : element["height"].value.to_i
+          width  = element["width"].nil?  ? 0 : element["width"].value.to_i
+          format = File.extname(url).gsub(".", "")
+          image  = {:width => width, :height => height, :format => format}
+          # Without both dimensions in the markup, inspect the image itself.
+          image  = MiniMagick::Image.open(url) if height.zero? || width.zero?
+          if imageable?(image)
+            list_images << url
+          else
+            debug("Image descarted: #{url} - height: #{image[:height]} - width: #{image[:width]} - format: #{image[:format]}")
+          end
+        rescue => e
+          # Best effort: a missing src or an unreadable image only skips that image.
+          debug("Image error: #{e}")
+        end
+      end
+      (list_images.empty? && content != @html) ? images(@html, true) : list_images
+    end
+    def imageable?(image)
+      image[:width] >= options[:min_image_width] and
+      image[:height] >= options[:min_image_height] and not
+      options[:ignore_image_format].include?(image[:format].downcase)
+    end
+    REGEXES = {
+        :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
+        :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
+        :positiveRe => /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+        :negativeRe => /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
+        :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
+        :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
+        :replaceFontsRe => /<(\/?)font[^>]*>/i,
+        :trimRe => /^\s+|\s+$/,
+        :normalizeRe => /\s{2,}/,
+        :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
+        :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
+    }
+    def title
+      title = @html.css("title").first
+      title ? title.text : nil
+    end
+    def content(remove_unlikely_candidates = :default)
+      @remove_unlikely_candidates = false if remove_unlikely_candidates == false
+      prepare_candidates
+      article = get_article(@candidates, @best_candidate)
+      cleaned_article = sanitize(article, @candidates, options)
+      if article.text.strip.length < options[:retry_length]
+        if @remove_unlikely_candidates
+          @remove_unlikely_candidates = false
+        elsif @weight_classes
+          @weight_classes = false
+        elsif @clean_conditionally
+          @clean_conditionally = false
+        else
+          # nothing we can do
+          return cleaned_article
+        end
+        make_html
+        content
+      else
+        cleaned_article
+      end
+    end
+    def get_article(candidates, best_candidate)
+      # Now that we have the top candidate, look through its siblings for content that might also be related.
+      # Things like preambles, content split by ads that we removed, etc.
+      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      output = Nokogiri::XML::Node.new('div', @html)
+      best_candidate[:elem].parent.children.each do |sibling|
+        append = false
+        append = true if sibling == best_candidate[:elem]
+        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
+        if sibling.name.downcase == "p"
+          link_density = get_link_density(sibling)
+          node_content = sibling.text
+          node_length = node_content.length
+          if node_length > 80 && link_density < 0.25
+            append = true
+          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
+            append = true
+          end
+        end
+        if append
+          sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
+          output << sibling
+        end
+      end
+      output
+    end
+    def select_best_candidate(candidates)
+      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
+      debug("Top 5 canidates:")
+      sorted_candidates[0...5].each do |candidate|
+        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
+      end
+      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
+      debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
+      best_candidate
+    end
+    def get_link_density(elem)
+      link_length = elem.css("a").map(&:text).join("").length
+      text_length = elem.text.length
+      link_length / text_length.to_f
+    end
+    def score_paragraphs(min_text_length)
+      candidates = {}
+      @html.css("p,td").each do |elem|
+        parent_node = elem.parent
+        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
+        inner_text = elem.text
+        # If this paragraph is less than 25 characters, don't even count it.
+        next if inner_text.length < min_text_length
+        candidates[parent_node] ||= score_node(parent_node)
+        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
+        content_score = 1
+        content_score += inner_text.split(',').length
+        content_score += [(inner_text.length / 100).to_i, 3].min
+        candidates[parent_node][:content_score] += content_score
+        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
+      end
+      # Scale the final candidates score based on link density. Good content should have a
+      # relatively small link density (5% or less) and be mostly unaffected by this operation.
+      candidates.each do |elem, candidate|
+        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
+      end
+      candidates
+    end
+    def class_weight(e)
+      weight = 0
+      return weight unless @weight_classes
+      if e[:class] && e[:class] != ""
+        if e[:class] =~ REGEXES[:negativeRe]
+          weight -= 25
+        end
+        if e[:class] =~ REGEXES[:positiveRe]
+          weight += 25
+        end
+      end
+      if e[:id] && e[:id] != ""
+        if e[:id] =~ REGEXES[:negativeRe]
+          weight -= 25
+        end
+        if e[:id] =~ REGEXES[:positiveRe]
+          weight += 25
+        end
+      end
+      weight
+    end
+    def score_node(elem)
+      content_score = class_weight(elem)
+      case elem.name.downcase
+        when "div"
+          content_score += 5
+        when "blockquote"
+          content_score += 3
+        when "form"
+          content_score -= 3
+        when "th"
+          content_score -= 5
+      end
+      { :content_score => content_score, :elem => elem }
+    end
+    def debug(str)
+      puts str if options[:debug]
+    end
+    def remove_unlikely_candidates!
+      @html.css("*").each do |elem|
+        str = "#{elem[:class]}#{elem[:id]}"
+        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
+          debug("Removing unlikely candidate - #{str}")
+          elem.remove
+        end
+      end
+    end
+    def transform_misused_divs_into_paragraphs!
+      @html.css("*").each do |elem|
+        if elem.name.downcase == "div"
+          # transform <div>s that do not contain other block elements into <p>s
+          if elem.inner_html !~ REGEXES[:divToPElementsRe]
+            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
+            elem.name = "p"
+          end
+        else
+          # wrap text nodes in p tags
+#          elem.children.each do |child|
+#            if child.text?
+#              debug("wrapping text node with a p")
+#              child.swap("<p>#{child.text}</p>")
+#            end
+#          end
+        end
+      end
+    end
+    def sanitize(node, candidates, options = {})
+      node.css("h1, h2, h3, h4, h5, h6").each do |header|
+        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
+      end
+      node.css("form, object, iframe, embed").each do |elem|
+        elem.remove
+      end
+      if @options[:remove_empty_nodes]
+        # remove <p> tags that have no text content - this will also remove p tags that contain only images.
+        node.css("p").each do |elem|
+          elem.remove if elem.content.strip.empty?
+        end
+      end
+      # Conditionally clean <table>s, <ul>s, and <div>s
+      clean_conditionally(node, candidates, "table, ul, div")
+      # We'll sanitize all elements using a whitelist
+      base_whitelist = @options[:tags] || %w[div p]
+      # We'll add whitespace instead of block elements,
+      # so a<br>b will have a nice space between them
+      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
+      # Use a hash for speed (don't want to make a million calls to include?)
+      whitelist = Hash.new
+      base_whitelist.each {|tag| whitelist[tag] = true }
+      replace_with_whitespace = Hash.new
+      base_replace_with_whitespace.each { |tag| replace_with_whitespace[tag] = true }
+      ([node] + node.css("*")).each do |el|
+        # If element is in whitelist, delete all its attributes
+        if whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+          # Otherwise, replace the element with its contents
+        else
+          if replace_with_whitespace[el.node_name]
+            el.swap(Nokogiri::XML::Text.new(' ' << el.text << ' ', el.document))
+          else
+            el.swap(Nokogiri::XML::Text.new(el.text, el.document))
+          end
+        end
+      end
+      # Get rid of duplicate whitespace
+      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t  ]+/, " ")
+    end
+    def clean_conditionally(node, candidates, selector)
+      return unless @clean_conditionally
+      node.css(selector).each do |el|
+        weight = class_weight(el)
+        content_score = candidates[el] ? candidates[el][:content_score] : 0
+        name = el.name.downcase
+        if weight + content_score < 0
+          el.remove
+          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+        elsif el.text.count(",") < 10
+          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
+          counts["li"] -= 100
+          content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
+          link_density = get_link_density(el)
+          to_remove = false
+          reason = ""
+          if counts["img"] > counts["p"]
+            reason = "too many images"
+            to_remove = true
+          elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
+            reason = "more <li>s than <p>s"
+            to_remove = true
+          elsif counts["input"] > (counts["p"] / 3).to_i
+            reason = "less than 3x <p>s than <input>s"
+            to_remove = true
+          elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
+            reason = "too short a content length without a single image"
+            to_remove = true
+          elsif weight < 25 && link_density > 0.2
+            reason = "too many links for its weight (#{weight})"
+            to_remove = true
+          elsif weight >= 25 && link_density > 0.5
+            reason = "too many links for its weight (#{weight})"
+            to_remove = true
+          elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
+            reason = "<embed>s with too short a content length, or too many <embed>s"
+            to_remove = true
+          end
+          if to_remove
+            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
+            el.remove
+          end
+        end
+      end
+    end
+  end
+end