RubyGems - pismo - Versions diffs - 0.5.0 → 0.6.0 - Mend

pismo 0.5.0 → 0.6.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (22)

data/LICENSE +19 -28
data/NOTICE +4 -0
data/README.markdown +37 -40
data/Rakefile +3 -2
data/VERSION +1 -1
data/bin/pismo +15 -7
data/lib/pismo/document.rb +2 -2
data/lib/pismo/internal_attributes.rb +23 -16
data/lib/pismo/reader.rb +390 -0
data/lib/pismo.rb +3 -2
data/pismo.gemspec +23 -15
data/test/corpus/bbcnews2.html +1575 -0
data/test/corpus/gmane.html +138 -0
data/test/corpus/metadata_expected.yaml +20 -5
data/test/corpus/queness.html +919 -0
data/test/corpus/reader_expected.yaml +45 -0
data/test/corpus/tweet.html +360 -0
data/test/corpus/zefrank.html +535 -0
data/test/test_corpus.rb +9 -1
metadata +89 -34
data/lib/pismo/readability.rb +0 -342
data/test/test_readability.rb +0 -152

metadata CHANGED

@@ -1,7 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: pismo
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  hash: 7
+  prerelease: false
+  segments:
+  - 0
+  - 6
+  - 0
+  version: 0.6.0
 platform: ruby
 authors:
 - Peter Cooper
@@ -9,69 +15,107 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-06-01 00:00:00 +01:00
+date: 2010-06-20 00:00:00 +01:00
 default_executable: pismo
 dependencies:
 - !ruby/object:Gem::Dependency
   name: shoulda
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :development
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
-  name: nokogiri
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  name: awesome_print
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :development
+  version_requirements: *id002
 - !ruby/object:Gem::Dependency
-  name: loofah
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  name: jeweler
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
-  name: httparty
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
+        version: "0"
   type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: sanitize
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id005
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id006
 - !ruby/object:Gem::Dependency
   name: chronic
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id007 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id007
 description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
 email: git@peterc.org
 executables:
@@ -85,6 +129,7 @@ files:
 - .document
 - .gitignore
 - LICENSE
+- NOTICE
 - README.markdown
 - Rakefile
 - VERSION
@@ -93,25 +138,30 @@ files:
 - lib/pismo/document.rb
 - lib/pismo/external_attributes.rb
 - lib/pismo/internal_attributes.rb
-- lib/pismo/readability.rb
+- lib/pismo/reader.rb
 - lib/pismo/stopwords.txt
 - pismo.gemspec
 - test/corpus/bbcnews.html
+- test/corpus/bbcnews2.html
 - test/corpus/briancray.html
 - test/corpus/cant_read.html
 - test/corpus/factor.html
+- test/corpus/gmane.html
 - test/corpus/huffington.html
 - test/corpus/metadata_expected.yaml
 - test/corpus/metadata_expected.yaml.old
+- test/corpus/queness.html
+- test/corpus/reader_expected.yaml
 - test/corpus/rubyinside.html
 - test/corpus/rww.html
 - test/corpus/spolsky.html
 - test/corpus/techcrunch.html
+- test/corpus/tweet.html
 - test/corpus/youtube.html
+- test/corpus/zefrank.html
 - test/helper.rb
 - test/test_corpus.rb
 - test/test_pismo_document.rb
-- test/test_readability.rb
 has_rdoc: true
 homepage: http://github.com/peterc/pismo
 licenses: []
@@ -122,21 +172,27 @@ rdoc_options:
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.5
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Extracts or retrieves content-related metadata from HTML pages
@@ -144,4 +200,3 @@ test_files:
 - test/helper.rb
 - test/test_corpus.rb
 - test/test_pismo_document.rb
-- test/test_readability.rb

data/lib/pismo/readability.rb DELETED

@@ -1,342 +0,0 @@
-# This code is under the Apache License 2.0.  http://www.apache.org/licenses/LICENSE-2.0
-#
-# This is a Ruby port of arc90's readability project
-# http://lab.arc90.com/experiments/readability/
-# Given a html document, it pulls out the main body text and cleans it up.
-# Ruby port by starrhorne and iterationlabs
-#
-# Original JavaScript version:
-#   http://lab.arc90.com/experiments/readability/js/readability.js
-#   * Copyright (c) 2009 Arc90 Inc
-#   * Readability is licensed under the Apache License, Version 2.0.
-#
-# Minor edits and tweaks by Peter Cooper
-require 'nokogiri'
-IS_RUBY19 = "a".respond_to?(:encoding)
-module Readability
-  class Document
-    TEXT_LENGTH_THRESHOLD = 25
-    RETRY_LENGTH = 250
-    attr_accessor :options, :html
-    def initialize(input, options = {})
-      @input = input
-      @options = options
-      make_html
-    end
-    def make_html
-      @html = Nokogiri::HTML(@input) #, nil, 'UTF-8')
-    end
-    REGEXES = {
-        :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i,
-        :okMaybeItsACandidateRe => /and|article|body|column|main/i,
-        :positiveRe => /article|body|content|entry|hentry|page|pagination|post|story|text/i,
-        :negativeRe => /combx|comment|contact|foot|box_wrap|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags/i,
-        :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
-        :replaceBrsRe => /(<br[^>]*>[ \n\r\t]*){2,}/i,
-        :replaceFontsRe => /<(\/?)font[^>]*>/i,
-        :trimRe => /^\s+|\s+$/,
-        :normalizeRe => /\s{2,}/,
-        :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
-        :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
-    }
-    def content(remove_unlikely_candidates = true)
-      @html.css("script, style").each { |i| i.remove }
-      remove_unlikely_candidates! if remove_unlikely_candidates
-      transform_misused_divs_into_paragraphs!
-      candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD)
-      best_candidate = select_best_candidate(candidates)
-      article = get_article(candidates, best_candidate)
-      cleaned_article = sanitize(article, candidates, options)
-      cleaned_article.gsub!(/^\s+\n/, "\n")
-      cleaned_article.gsub!(/[\ \t]+/, ' ')
-      cleaned_article.gsub!(/^\s+/, '')
-      cleaned_article.gsub!(/\<\!\-\-.*?\-\-\>/m, '')
-      if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH)
-        make_html
-        content(false)
-      else
-        cleaned_article
-      end
-    end
-    def get_article(candidates, best_candidate)
-      # Now that we have the top candidate, look through its siblings for content that might also be related.
-      # Things like preambles, content split by ads that we removed, etc.
-      sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
-      output = Nokogiri::XML::Node.new('div', @html)
-      return output unless best_candidate[:elem]
-      best_candidate[:elem].parent.children.each do |sibling|
-        append = false
-        append = true if sibling == best_candidate[:elem]
-        append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
-        if sibling.name.downcase == "p"
-          link_density = get_link_density(sibling)
-          node_content = sibling.text
-          node_length = node_content.length
-          if node_length > 80 && link_density < 0.25
-            append = true
-          elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/
-            append = true
-          end
-        end
-        if append
-          sibling.name = "div" unless %w[div p].include?(sibling.name.downcase)
-          output << sibling
-        end
-      end
-      output
-    end
-    def select_best_candidate(candidates)
-      sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
-      debug("Top 5 canidates:")
-      sorted_candidates[0...5].each do |candidate|
-        debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}")
-      end
-      best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 }
-      #debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}")
-      best_candidate
-    end
-    def get_link_density(elem)
-      link_length = elem.css("a").map {|i| i.text}.join("").length
-      text_length = elem.text.length
-      link_length / text_length.to_f
-    end
-    def score_paragraphs(min_text_length)
-      candidates = {}
-      @html.css("p,td").each do |elem|
-        parent_node = elem.parent
-        grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
-        inner_text = elem.text
-        # If this paragraph is less than 25 characters, don't even count it.
-        next if inner_text.length < min_text_length
-        candidates[parent_node] ||= score_node(parent_node)
-        candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node
-        content_score = 1
-        begin
-          content_score += inner_text.split(',').length
-          content_score += [(inner_text.length / 100).to_i, 3].min
-        rescue => e
-          raise e unless IS_RUBY19
-          inner_text.force_encoding('ASCII-8BIT')
-          content_score += inner_text.split(',').length
-          content_score += [(inner_text.length / 100).to_i, 3].min
-        end
-        candidates[parent_node][:content_score] += content_score
-        candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node
-      end
-      # Scale the final candidates score based on link density. Good content should have a
-      # relatively small link density (5% or less) and be mostly unaffected by this operation.
-      candidates.each do |elem, candidate|
-        candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem))
-      end
-      candidates
-    end
-    def class_weight(e)
-      weight = 0
-      if e[:class] && e[:class] != ""
-        if e[:class] =~ REGEXES[:negativeRe]
-          weight -= 25
-        end
-        if e[:class] =~ REGEXES[:positiveRe]
-          weight += 25
-        end
-      end
-      if e[:id] && e[:id] != ""
-        if e[:id] =~ REGEXES[:negativeRe]
-          weight -= 25
-        end
-        if e[:id] =~ REGEXES[:positiveRe]
-          weight += 25
-        end
-      end
-      weight
-    end
-    def score_node(elem)
-      content_score = class_weight(elem)
-      case elem.name.downcase
-        when "div"
-          content_score += 5
-        when "blockquote"
-          content_score += 3
-        when "form"
-          content_score -= 3
-        when "th"
-          content_score -= 5
-      end
-      { :content_score => content_score, :elem => elem }
-    end
-    def debug(str)
-      puts str if options[:debug]
-    end
-    def remove_unlikely_candidates!
-      @html.css("*").each do |elem|
-        str = "#{elem[:class]}#{elem[:id]}"
-        if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body'
-          debug("Removing unlikely candidate - #{str}")
-          elem.remove
-        end
-      end
-    end
-    def transform_misused_divs_into_paragraphs!
-      @html.css("*").each do |elem|
-        if elem.name.downcase == "div"
-          # transform <div>s that do not contain other block elements into <p>s
-          elem_inner_html = IS_RUBY19 ? elem.inner_html.dup.force_encoding('ASCII-8BIT') : elem.inner_html
-          if elem_inner_html !~ REGEXES[:divToPElementsRe]
-            debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p");
-            elem.name = "p"
-          end
-        else
-          # wrap text nodes in p tags
-#          elem.children.each do |child|
-#            if child.text?
-##              debug("wrapping text node with a p")
-#              child.swap("<p>#{child.text}</p>")
-#            end
-#          end
-        end
-      end
-    end
-    def sanitize(node, candidates, options = {})
-      node.css("h1, h2, h3, h4, h5, h6").each do |header|
-        header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33
-      end
-      node.css("form, object, iframe, embed").each do |elem|
-        elem.remove
-      end
-      # Remove empty <p> tags
-      node.css("p").each do |elem|
-        elem.remove if elem.content.strip.empty?
-      end
-      # Remove empty <div> tags
-      node.css("div").each do |elem|
-        elem.remove if elem.content.strip.empty?
-      end
-      # Conditionally clean <table>s, <ul>s, and <div>s
-      node.css("table, ul, div").each do |el|
-        weight = class_weight(el)
-        content_score = candidates[el] ? candidates[el][:content_score] : 0
-        name = el.name.downcase
-        if weight + content_score < 0
-          el.remove
-          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
-        elsif (IS_RUBY19 && el.text.force_encoding("ASCII-8BIT").count(",") < 10) || (!IS_RUBY19 && el.text.count(",") < 10)
-          counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
-          counts["li"] -= 100
-          content_length = el.text.length
-          link_density = get_link_density(el)
-          to_remove = false
-          reason = ""
-          if counts["img"] > counts["p"]
-            reason = "too many images"
-            to_remove = true
-          elsif counts["li"] > counts["p"] && name != "ul" && name != "ol"
-            reason = "more <li>s than <p>s"
-            to_remove = true
-          elsif counts["input"] > (counts["p"] / 3).to_i
-            reason = "less than 3x <p>s than <input>s"
-            to_remove = true
-          elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2)
-            reason = "too short a content length without a single image"
-            to_remove = true
-          elsif weight < 25 && link_density > 0.2
-            reason = "too many links for its weight (#{weight})"
-            to_remove = true
-          elsif weight >= 25 && link_density > 0.5
-            reason = "too many links for its weight (#{weight})"
-            to_remove = true
-          elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1
-            reason = "<embed>s with too short a content length, or too many <embed>s"
-            to_remove = true
-          end
-          if to_remove
-            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
-            el.remove
-          end
-        end
-      end
-      # We'll sanitize all elements using a whitelist
-      whitelist = @options[:tags] || %w[p]
-      # Use a hash for speed (don't want to make a million calls to include?)
-      whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
-      ([node] + node.css("*")).each do |el|
-        # If element is in whitelist, delete all its attributes
-        if whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
-          # Otherwise, replace the element with its contents
-        else
-          begin
-            el.swap(el.text)
-          rescue => e
-            raise e unless IS_RUBY19
-            el.swap(el.text.force_encoding("ASCII-8BIT"))
-          end
-        end
-      end
-      # Get rid of duplicate whitespace
-      begin
-        node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
-      rescue => e
-        raise e unless IS_RUBY19
-        node.to_html.force_encoding("ASCII-8BIT").gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
-      end
-    end
-  end
-end