RubyGems - marcosinger-ruby-readability - Versions diffs - 0.6.0 - Mend

marcosinger-ruby-readability 0.6.0

Files changed (29) hide show

data/.document +5 -0
data/.gitignore +7 -0
data/.rspec +3 -0
data/Gemfile +10 -0
data/README +54 -0
data/Rakefile +6 -0
data/bin/readability +40 -0
data/lib/readability.rb +402 -0
data/lib/ruby-readability.rb +1 -0
data/ruby-readability.gemspec +24 -0
data/spec/fixtures/bbc.html +2069 -0
data/spec/fixtures/cant_read.html +426 -0
data/spec/fixtures/images/dim_1416768a.jpg +0 -0
data/spec/fixtures/nytimes.html +58 -0
data/spec/fixtures/sample.html +1198 -0
data/spec/fixtures/samples/blogpost_with_links-fragments.rb +10 -0
data/spec/fixtures/samples/blogpost_with_links.html +137 -0
data/spec/fixtures/samples/channel4-1-fragments.rb +13 -0
data/spec/fixtures/samples/channel4-1.html +1330 -0
data/spec/fixtures/samples/foxnews-india1-fragments.rb +13 -0
data/spec/fixtures/samples/foxnews-india1.html +2058 -0
data/spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb +31 -0
data/spec/fixtures/samples/globemail-ottawa-cuts.html +2410 -0
data/spec/fixtures/should_not_truncate.txt +1077 -0
data/spec/fixtures/thesun.html +1122 -0
data/spec/readability_spec.rb +330 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +11 -0
metadata +176 -0

data/spec/readability_spec.rb ADDED Viewed

@@ -0,0 +1,330 @@
+# encoding: UTF-8
+require 'spec_helper'
+describe Readability do
+  before do
+    @simple_html_fixture = <<-HTML
+      <html>
+        <head>
+          <title>title!</title>
+        </head>
+        <body class='comment'>
+          <div>
+            <p class='comment'>a comment</p>
+            <div class='comment' id='body'>real content</div>
+            <div id="contains_blockquote"><blockquote>something in a table</blockquote></div>
+          </div>
+        </body>
+      </html>
+    HTML
+  end
+  describe "images" do
+    before do
+      # bbc     => http://www.bbc.co.uk/news/magazine-15959067
+      # nytimes => http://opinionator.blogs.nytimes.com/2011/12/01/health-care-for-a-changing-work-force/
+      # thesum  => http://www.thesun.co.uk/sol/homepage/sport/football/3973265/Manchester-United-news-Dimitar-Berbatov-and-Carling-Cup-flops-warned.html
+      @bbc      = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
+      @nytimes  = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
+      @thesum   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
+    end
+    it "should show one image, but outside of the best candidate" do
+      @doc = Readability::Document.new(@thesum)
+      @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
+      @doc.best_candidate_has_image.should == false
+    end
+    it "should show one image inside of the best candidate" do
+      @doc = Readability::Document.new(@nytimes)
+      @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
+      @doc.best_candidate_has_image.should == true
+    end
+    describe "no images" do
+      it "shouldn't show images" do
+        @doc = Readability::Document.new(@bbc, :min_image_height => 400)
+        @doc.images.should == []
+        @doc.best_candidate_has_image.should == false
+      end
+    end
+    describe "poll of images" do
+      it "should show some images inside of the best candidate" do
+        @doc = Readability::Document.new(@bbc)
+        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
+        @doc.best_candidate_has_image.should == true
+      end
+      it "should show some images inside of the best candidate, include gif format" do
+        @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
+        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
+        @doc.best_candidate_has_image.should == true
+      end
+      describe "width, height and format" do
+        it "should show some images inside of the best candidate, but with width most equal to 400px" do
+          @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
+          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
+          @doc.best_candidate_has_image.should == true
+        end
+        it "should show some images inside of the best candidate, but with width most equal to 304px" do
+          @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
+          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
+          @doc.best_candidate_has_image.should == true
+        end
+        it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
+          @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
+          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
+          @doc.best_candidate_has_image.should == true
+        end
+        it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
+          @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
+          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
+          @doc.best_candidate_has_image.should == true
+        end
+      end
+    end
+  end
+  describe "transformMisusedDivsIntoParagraphs" do
+    before do
+      @doc = Readability::Document.new(@simple_html_fixture)
+      @doc.transform_misused_divs_into_paragraphs!
+    end
+    it "should transform divs containing no block elements into <p>s" do
+      @doc.html.css("#body").first.name.should == "p"
+    end
+    it "should not transform divs that contain block elements" do
+      @doc.html.css("#contains_blockquote").first.name.should == "div"
+    end
+  end
+  describe "score_node" do
+    before do
+      @doc = Readability::Document.new(<<-HTML)
+        <html>
+          <body>
+            <div id='elem1'>
+              <p>some content</p>
+            </div>
+            <th id='elem2'>
+              <p>some other content</p>
+            </th>
+          </body>
+        </html>
+      HTML
+      @elem1 = @doc.html.css("#elem1").first
+      @elem2 = @doc.html.css("#elem2").first
+    end
+    it "should like <div>s more than <th>s" do
+      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
+    end
+    it "should like classes like text more than classes like comment" do
+      @elem2.name = "div"
+      @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
+      @elem1['class'] = "text"
+      @elem2['class'] = "comment"
+      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
+    end
+  end
+  describe "remove_unlikely_candidates!" do
+    before do
+      @doc = Readability::Document.new(@simple_html_fixture)
+      @doc.remove_unlikely_candidates!
+    end
+    it "should remove things that have class comment" do
+      @doc.html.inner_html.should_not =~ /a comment/
+    end
+    it "should not remove body tags" do
+      @doc.html.inner_html.should =~ /<\/body>/
+    end
+    it "should not remove things with class comment and id body" do
+      @doc.html.inner_html.should =~ /real content/
+    end
+  end
+  describe "score_paragraphs" do
+    before(:each) do
+      @doc = Readability::Document.new(<<-HTML)
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body id="body">
+            <div id="div1">
+              <div id="div2>
+                <p id="some_comment">a comment</p>
+              </div>
+              <p id="some_text">some text</p>
+            </div>
+            <div id="div3">
+              <p id="some_text2">some more text</p>
+            </div>
+          </body>
+        </html><!-- " -->
+      HTML
+      @candidates = @doc.score_paragraphs(0)
+    end
+    it "should score elements in the document" do
+      @candidates.values.length.should == 3
+    end
+    it "should prefer the body in this particular example" do
+      @candidates.values.sort { |a, b|
+        b[:content_score] <=> a[:content_score]
+      }.first[:elem][:id].should == "body"
+    end
+    context "when two consequent br tags are used instead of p" do
+      it "should assign the higher score to the first paragraph in this particular example" do
+        @doc = Readability::Document.new(<<-HTML)
+          <html>
+            <head>
+              <title>title!</title>
+            </head>
+            <body id="body">
+              <div id="post1">
+                This is the main content!<br/><br/>
+                Zebra found killed butcher with the chainsaw.<br/><br/>
+                If only I could think of an example, oh, wait.
+              </div>
+              <div id="post2">
+                This is not the content and although it's longer if you meaure it in characters,
+                it's supposed to have lower score than the previous paragraph. And it's only because
+                of the previous paragraph is not one paragraph, it's three subparagraphs
+              </div>
+            </body>
+          </html>
+        HTML
+        @candidates = @doc.score_paragraphs(0)
+        @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
+      end
+    end
+  end
+  describe "the cant_read.html fixture" do
+    it "should work on the cant_read.html fixture with some allowed tags" do
+      allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
+      allowed_attributes = %w[href]
+      html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
+      Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
+    end
+  end
+  describe "general functionality" do
+    before do
+      @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div></body>",
+                                       :min_text_length => 0, :retry_length => 1)
+    end
+    it "should return the main page content" do
+      @doc.content.should match("Some content")
+    end
+    it "should return the page title if present" do
+      @doc.title.should match("title!")
+      doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
+                                       :min_text_length => 0, :retry_length => 1)
+      doc.title.should be_nil
+    end
+  end
+  describe "ignoring sidebars" do
+    before do
+      @doc = Readability::Document.new("<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>",
+                                       :min_text_length => 0, :retry_length => 1)
+    end
+    it "should not return the sidebar" do
+      @doc.content.should_not match("sidebar")
+    end
+  end
+  describe "inserting space for block elements" do
+    before do
+      @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
+        <html><head><title>title!</title></head>
+          <body>
+            <div>
+              <p>a<br>b<hr>c<address>d</address>f/p>
+            </div>
+          </body>
+        </html>
+      HTML
+    end
+    it "should not return the sidebar" do
+      @doc.content.should_not match("a b c d f")
+    end
+  end
+  describe "outputs good stuff for known documents" do
+    before do
+      @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
+      @samples = @html_files.map {|filename| File.basename(filename, '.html') }
+    end
+    it "should output expected fragments of text" do
+      checks = 0
+      @samples.each do |sample|
+        html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
+        doc = Readability::Document.new(html).content
+        load "fixtures/samples/#{sample}-fragments.rb"
+        #puts "testing #{sample}..."
+        $required_fragments.each do |required_text|
+          doc.should include(required_text)
+          checks += 1
+        end
+        $excluded_fragments.each do |text_to_avoid|
+          doc.should_not include(text_to_avoid)
+          checks += 1
+        end
+      end
+      #puts "Performed #{checks} checks."
+    end
+  end
+  describe "encoding guessing" do
+    if RUBY_VERSION =~ /^1\.9\./
+      context "with ruby 1.9.2" do
+        it "should correctly guess and enforce HTML encoding" do
+          doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
+          content = doc.content
+          content.encoding.to_s.should == "ISO-8859-1"
+          content.should be_valid_encoding
+        end
+        it "should allow encoding guessing to be skipped" do
+          do_not_allow(GuessHtmlEncoding).encode
+          doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
+          doc.content
+        end
+        it "should allow encoding guessing to be overridden" do
+          do_not_allow(GuessHtmlEncoding).encode
+          doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
+          doc.content
+        end
+      end
+    end
+  end
+end

data/spec/spec.opts ADDED Viewed

@@ -0,0 +1,4 @@
+--colour
+--format s -c
+--loadby mtime
+--reverse

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,11 @@
+require 'rubygems'
+require 'readability'
+require 'rr'
+require 'fakeweb'
+# Use rr as the mocking framework for the whole suite.
+RSpec.configure { |config| config.mock_with :rr }
+# Block real HTTP requests and serve the one image URL the specs expect from a
+# local fixture file instead.
+FakeWeb.allow_net_connect = false
+sun_image_path = File.join(File.dirname(__FILE__), "fixtures", "images", "dim_1416768a.jpg")
+FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", :body => File.read(sun_image_path))

metadata ADDED Viewed

@@ -0,0 +1,176 @@
+--- !ruby/object:Gem::Specification
+name: marcosinger-ruby-readability
+version: !ruby/object:Gem::Version
+  hash: 7
+  prerelease:
+  segments:
+  - 0
+  - 6
+  - 0
+  version: 0.6.0
+platform: ruby
+authors:
+- Andrew Cantino
+- starrhorne
+- libc
+- Kyle Maxwell
+- Marco Singer
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-12-19 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 15
+        segments:
+        - 2
+        - 6
+        version: "2.6"
+  type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rr
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 15
+        segments:
+        - 1
+        - 0
+        version: "1.0"
+  type: :development
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 1
+        - 4
+        - 2
+        version: 1.4.2
+  type: :runtime
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: guess_html_encoding
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        hash: 27
+        segments:
+        - 0
+        - 0
+        - 2
+        version: 0.0.2
+  type: :runtime
+  version_requirements: *id004
+description: Port of arc90's readability project to ruby
+email:
+- andrew@iterationlabs.com
+- markaum@gmail.com
+executables:
+- readability
+extensions: []
+extra_rdoc_files: []
+files:
+- .document
+- .gitignore
+- .rspec
+- Gemfile
+- README
+- Rakefile
+- bin/readability
+- lib/readability.rb
+- lib/ruby-readability.rb
+- ruby-readability.gemspec
+- spec/fixtures/bbc.html
+- spec/fixtures/cant_read.html
+- spec/fixtures/images/dim_1416768a.jpg
+- spec/fixtures/nytimes.html
+- spec/fixtures/sample.html
+- spec/fixtures/samples/blogpost_with_links-fragments.rb
+- spec/fixtures/samples/blogpost_with_links.html
+- spec/fixtures/samples/channel4-1-fragments.rb
+- spec/fixtures/samples/channel4-1.html
+- spec/fixtures/samples/foxnews-india1-fragments.rb
+- spec/fixtures/samples/foxnews-india1.html
+- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
+- spec/fixtures/samples/globemail-ottawa-cuts.html
+- spec/fixtures/should_not_truncate.txt
+- spec/fixtures/thesun.html
+- spec/readability_spec.rb
+- spec/spec.opts
+- spec/spec_helper.rb
+homepage: http://github.com/iterationlabs/ruby-readability
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
+      version: "0"
+requirements: []
+rubyforge_project: ruby-readability
+rubygems_version: 1.8.10
+signing_key:
+specification_version: 3
+summary: Port of arc90's readability project to ruby
+test_files:
+- spec/fixtures/bbc.html
+- spec/fixtures/cant_read.html
+- spec/fixtures/images/dim_1416768a.jpg
+- spec/fixtures/nytimes.html
+- spec/fixtures/sample.html
+- spec/fixtures/samples/blogpost_with_links-fragments.rb
+- spec/fixtures/samples/blogpost_with_links.html
+- spec/fixtures/samples/channel4-1-fragments.rb
+- spec/fixtures/samples/channel4-1.html
+- spec/fixtures/samples/foxnews-india1-fragments.rb
+- spec/fixtures/samples/foxnews-india1.html
+- spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
+- spec/fixtures/samples/globemail-ottawa-cuts.html
+- spec/fixtures/should_not_truncate.txt
+- spec/fixtures/thesun.html
+- spec/readability_spec.rb
+- spec/spec.opts
+- spec/spec_helper.rb