RubyGems - ruby-readability - Versions diffs - 0.1.0 → 0.2.1 - Mend

ruby-readability 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

data/.gitignore +2 -0
data/README +1 -1
data/Rakefile +5 -5
data/VERSION +1 -1
data/lib/readability.rb +75 -36
data/ruby-readability.gemspec +71 -0
data/spec/fixtures/samples/blogpost_with_links-fragments.rb +9 -0
data/spec/fixtures/samples/blogpost_with_links.html +137 -0
data/spec/fixtures/samples/channel4-1-fragments.rb +1 -2
data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
data/spec/fixtures/samples/foxnews-india1.html +2058 -0
data/spec/readability_spec.rb +51 -2
data/spec/spec.opts +4 -0
metadata +26 -9
data/lib/readability_old.rb +0 -74

data/spec/readability_spec.rb CHANGED Viewed

@@ -101,7 +101,7 @@ describe Readability do
               <p id="some_text2">some more text</p>
             </div>
           </body>
-        </html>
+        </html><!-- " -->
       HTML
       @candidates = @doc.score_paragraphs(0)
     end
@@ -117,6 +117,37 @@ describe Readability do
     end
   end
+  describe "score_paragraphs" do
+    context "when two consequent br tags are used instead of p" do
+      before :each do
+        @doc = Readability::Document.new(<<-HTML)
+          <html>
+            <head>
+              <title>title!</title>
+            </head>
+            <body id="body">
+              <div id="post1">
+                This is the main content!<br/><br/>
+                Zebra found killed butcher with the chainsaw.<br/><br/>
+                If only I could think of an example, oh, wait.
+              </div>
+              <div id="post2">
+                This is not the content and although it's longer if you meaure it in characters,
+                it's supposed to have lower score than the previous paragraph. And it's only because
+                of the previous paragraph is not one paragraph, it's three subparagraphs
+              </div>
+            </body>
+          </html>
+        HTML
+        @candidates = @doc.score_paragraphs(0)
+      end
+      it "should assign the higher score to the first paragraph in this particular example" do
+        @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
+      end
+    end
+  end
   describe "the cant_read.html fixture" do
     it "should work on the cant_read.html fixture with some allowed tags" do
       allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
@@ -147,7 +178,25 @@ describe Readability do
       @doc.content.should_not match("sidebar")
     end
   end
+  describe "inserting space for block elements" do
+    before do
+      @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
+        <html><head><title>title!</title></head>
+          <body>
+            <div>
+              <p>a<br>b<hr>c<address>d</address>f/p>
+            </div>
+          </body>
+        </html>
+      HTML
+    end
+    it "should not return the sidebar" do
+      @doc.content.should_not match("a b c d f")
+    end
+  end
   describe "outputs good stuff for known documents" do
     before do
       @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")

data/spec/spec.opts ADDED Viewed

@@ -0,0 +1,4 @@
+--colour
+--format s -c
+--loadby mtime
+--reverse

metadata CHANGED Viewed

@@ -1,29 +1,35 @@
 --- !ruby/object:Gem::Specification
 name: ruby-readability
 version: !ruby/object:Gem::Version
+  hash: 21
   prerelease: false
   segments:
   - 0
+  - 2
   - 1
-  - 0
-  version: 0.1.0
+  version: 0.2.1
 platform: ruby
 authors:
+- Andrew Cantino
+- starrhorne
+- libc
 - Kyle Maxwell
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-08-30 00:00:00 -07:00
+date: 2010-11-07 00:00:00 -07:00
 default_executable: readability
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        hash: 13
         segments:
         - 1
         - 2
@@ -31,8 +37,8 @@ dependencies:
         version: 1.2.9
   type: :development
   version_requirements: *id001
-description: ruby-readability
-email: kmaxwell@twitter.com
+description: Port of arc90's readability project to ruby
+email: andrew@iterationlabs.com
 executables:
 - readability
 extensions: []
@@ -47,18 +53,23 @@ files:
 - VERSION
 - bin/readability
 - lib/readability.rb
-- lib/readability_old.rb
+- ruby-readability.gemspec
 - spec/fixtures/cant_read.html
 - spec/fixtures/sample.html
+- spec/fixtures/samples/blogpost_with_links-fragments.rb
+- spec/fixtures/samples/blogpost_with_links.html
 - spec/fixtures/samples/channel4-1-fragments.rb
 - spec/fixtures/samples/channel4-1.html
+- spec/fixtures/samples/foxnews-india1-fragments.rb
+- spec/fixtures/samples/foxnews-india1.html
 - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
 - spec/fixtures/samples/globemail-ottawa-cuts.html
 - spec/fixtures/should_not_truncate.txt
 - spec/readability_spec.rb
+- spec/spec.opts
 - spec/spec_helper.rb
 has_rdoc: true
-homepage: http://github.com/fizx/ruby-readability
+homepage: http://github.com/iterationlabs/ruby-readability
 licenses: []
 post_install_message:
@@ -67,28 +78,34 @@ rdoc_options:
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
       segments:
       - 0
       version: "0"
 requirements: []
 rubyforge_project:
-rubygems_version: 1.3.6
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: ruby-readability
+summary: Port of arc90's readability project to ruby
 test_files:
+- spec/fixtures/samples/blogpost_with_links-fragments.rb
 - spec/fixtures/samples/channel4-1-fragments.rb
+- spec/fixtures/samples/foxnews-india1-fragments.rb
 - spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
 - spec/readability_spec.rb
 - spec/spec_helper.rb

data/lib/readability_old.rb DELETED Viewed

@@ -1,74 +0,0 @@
-require 'rubygems'
-require 'nokogiri'
-module Readability
-  class Document
-    def initialize(input, options = {})
-      @options = options
-      @html = Nokogiri::HTML(input, nil, 'UTF-8')
-    end
-    def content
-      # Get all parent elements containing a <p> tag
-      @parents = @html.css("p").map { |p| p.parent }.compact.uniq
-      sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0])
-    end
-    def score(parent)
-      s = 0
-      # Adjust score based on parent's "class" attribute
-      s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i
-      s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i
-      # Adjust score based on parent id
-      s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i
-      s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i
-      # Adjust score based on # of <p> elements inside parent
-      s += parent.css("p").size
-      # Adjust score based on # of commas inside parent
-      s += parent.text.count ","
-      s
-    end
-    def sanitize(node)
-      # Get rid of divs full of non-text items
-      node.css("div").each do |el|
-        counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
-        el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"])
-      end
-      # We'll sanitize all elements using a whitelist
-      whitelist = @options[:tags] || %w[div p]
-      # Use a hash for speed (don't want to make a million calls to include?)
-      whitelist = Hash[ whitelist.zip([true] * whitelist.size) ]
-      ([node] + node.css("*")).each do |el|
-        # If element is in whitelist, delete all its attributes
-        if whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
-        # Otherwise, replace the element with its contents
-        else
-          el.swap(el.text)
-        end
-      end
-      # Get rid of duplicate whitespace
-      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
-    end
-  end
-end