RubyGems - ruby-readability - Versions diffs - 0.7.0 → 0.7.1 - Mend

ruby-readability 0.7.0 → 0.7.1

Files changed (14) hide show

checksums.yaml +5 -5
data/.github/workflows/ruby.yml +25 -0
data/.rspec +1 -1
data/README.md +2 -5
data/lib/readability.rb +9 -4
data/ruby-readability.gemspec +1 -4
data/spec/fixtures/codinghorror.html +189 -0
data/spec/fixtures/images/Confusion_of_Tongues.png +0 -0
data/spec/fixtures/images/JohnPinhole.jpg +0 -0
data/spec/fixtures/nested_images.html +11 -0
data/spec/readability_spec.rb +254 -99
data/spec/spec_helper.rb +0 -6
metadata +28 -35
data/.travis.yml +0 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
-  data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
+SHA256:
+  metadata.gz: 906a25fd00e8fc221c84aa41fedf38bbd3045aa0e4a543ff16a1d494e59c3a92
+  data.tar.gz: bf28e458f7fb7f87a49ea71f16e736191c53130b91bdf2203cf260e6dce99aee
 SHA512:
-  metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
-  data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
+  metadata.gz: e2d262b6c4f0d7a2146718d3e16c0dd8973b217a9fe0ba850d03a456c68b7bd4355cbdd0a78454b09f6f50717c87ac8da524d42d99e78e0f362830c554376fdd
+  data.tar.gz: 6306f195c8d40842c0a4ed8ab2cfab1648fc562b03ba3137a0fd8c68ecb7a3668357c83abefd2b76bcac06efc961cdd042be10f44760aa102e34cdce2fe5d6d4

data/.github/workflows/ruby.yml ADDED Viewed

@@ -0,0 +1,25 @@
+name: Ruby
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        ruby-version: ['2.7']
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Ruby
+      uses: ruby/setup-ruby@v1
+      with:
+        ruby-version: ${{ matrix.ruby-version }}
+        bundler-cache: true # runs 'bundle install' and caches installed gems automatically
+    - name: Run tests
+      run: bundle exec rspec

data/.rspec CHANGED Viewed

@@ -1,2 +1,2 @@
 --colour
---format s -c
+--format documentation -c

data/README.md CHANGED Viewed

@@ -7,7 +7,7 @@ webpage. It is a Ruby port of arc90's readability project.
 Build Status
 ------------
-[![Build Status](https://travis-ci.org/cantino/ruby-readability.png)](https://travis-ci.org/cantino/ruby-readability)
+[![Ruby](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml/badge.svg?branch=master)](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml)
 Install
 -------
@@ -78,6 +78,7 @@ feature requires that the `fastimage` gem be installed.
 Related Projects
 ----------------
+* [readability.cr](https://github.com/joenas/readability.cr) - Port of ruby-readability's port of arc90's readability project to Crystal
 * [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
 Potential Issues
@@ -102,7 +103,3 @@ License
 This code is under the Apache License 2.0. See <http://www.apache.org/licenses/LICENSE-2.0>.
 Ruby port by cantino, starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
-[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/cantino/ruby-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")

data/lib/readability.rb CHANGED Viewed

@@ -17,7 +17,9 @@ module Readability
       :min_image_height           => 80,
       :ignore_image_format        => [],
       :blacklist                  => nil,
-      :whitelist                  => nil
+      :whitelist                  => nil,
+      :elements_to_score          => ["p", "td", "pre"],
+      :likely_siblings            => ["p"]
     }.freeze
     REGEXES = {
@@ -260,13 +262,14 @@ module Readability
       # Things like preambles, content split by ads that we removed, etc.
       sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
+      downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
       output = Nokogiri::XML::Node.new('div', @html)
       best_candidate[:elem].parent.children.each do |sibling|
         append = false
         append = true if sibling == best_candidate[:elem]
         append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
-        if sibling.name.downcase == "p"
+        if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
           node_content = sibling.text
           node_length = node_content.length
@@ -310,7 +313,7 @@ module Readability
     def score_paragraphs(min_text_length)
       candidates = {}
-      @html.css("p,td").each do |elem|
+      @html.css(options[:elements_to_score].join(',')).each do |elem|
         parent_node = elem.parent
         grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
         inner_text = elem.text
@@ -423,6 +426,8 @@ module Readability
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
+      all_whitelisted = base_whitelist.include?("*")
       # We'll add whitespace instead of block elements,
       # so a<br>b will have a nice space between them
       base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
@@ -435,7 +440,7 @@ module Readability
       ([node] + node.css("*")).each do |el|
         # If element is in whitelist, delete all its attributes
-        if whitelist[el.node_name]
+        if all_whitelisted || whitelist[el.node_name]
           el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
           # Otherwise, replace the element with its contents

data/ruby-readability.gemspec CHANGED Viewed

@@ -3,15 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
 Gem::Specification.new do |s|
   s.name        = "ruby-readability"
-  s.version     = '0.7.0'
+  s.version     = '0.7.1'
   s.authors     = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
   s.email       = ["andrew@iterationlabs.com"]
   s.homepage    = "http://github.com/cantino/ruby-readability"
   s.summary     = %q{Port of arc90's readability project to ruby}
   s.description = %q{Port of arc90's readability project to ruby}
-  s.rubyforge_project = "ruby-readability"
   s.files         = `git ls-files`.split("\n")
   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
@@ -19,7 +17,6 @@ Gem::Specification.new do |s|
   s.add_development_dependency "rspec", ">= 2.8"
   s.add_development_dependency "rspec-expectations", ">= 2.8"
-  s.add_development_dependency "rr", ">= 1.0"
   s.add_dependency 'nokogiri', '>= 1.6.0'
   s.add_dependency 'guess_html_encoding', '>= 0.0.4'
 end

data/spec/fixtures/codinghorror.html ADDED Viewed

@@ -0,0 +1,189 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8"/>
+<title>Standard Flavored Markdown</title>
+<meta name="description" content=""/>
+<meta name="HandheldFriendly" content="True"/>
+<meta name="MobileOptimized" content="320"/>
+<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+<link rel="shortcut icon" href="/assets/images/favicon.ico?v=8684b6a35e">
+<link rel="apple-touch-icon" href="/assets/images/codinghorror-app-icon.png?v=8684b6a35e">
+<meta name="google-site-verification" content="sl0m9SU_4V0JcvjWlOX4dUFBR6VS2P4tlxjJMo0gphU"/>
+<link rel="stylesheet" type="text/css" href="/assets/css/screen.css?v=8684b6a35e"/>
+<link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Open+Sans:400italic,700italic,400,700"/>
+<link rel="alternate" type="application/rss+xml" title="Coding Horror" href="http://feeds.feedburner.com/codinghorror">
+</head>
+<body class="post-template">
+<header class="site-head">
+<div class="site-head-content">
+<a class="blog-logo" href="http://blog.codinghorror.com"><img src="/assets/images/codinghorror-app-icon.png?v=8684b6a35e" alt="Coding Horror Logo" width="158" height="158"/></a>
+<h1 class="blog-title"><a href="http://blog.codinghorror.com">Coding Horror</a></h1>
+<h2 class="blog-description">programming and human factors</h2>
+<div class="site-search">
+<script>
+                  (function() {
+                    var cx = '016956275695630057531:lqveu9tah7y';
+                    var gcse = document.createElement('script');
+                    gcse.type = 'text/javascript';
+                    gcse.async = true;
+                    gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + '//www.google.com/cse/cse.js?cx=' + cx;
+                    var s = document.getElementsByTagName('script')[0];
+                    s.parentNode.insertBefore(gcse, s);
+                  })();
+                </script>
+<gcse:search></gcse:search>
+</div>
+</div>
+</header>
+<div class="wrap clearfix">
+<div class="clearfix"></div>
+<main class="content" role="main">
+<article class="post">
+<span class="post-meta"><time datetime="2014-09-03">03 Sep 2014</time> </span>
+<h1 class="post-title">Standard Flavored Markdown</h1>
+<section class="post-content">
+<p>In 2009 I <a href="http://blog.codinghorror.com/responsible-open-source-code-parenting/">lamented the state of Markdown</a>:</p>
+<blockquote>
+<p>Right now we have the worst of both worlds. Lack of leadership from the top, and a bunch of fragmented, poorly coordinated community efforts to advance Markdown, none of which are officially canon. This isn't merely incovenient for anyone trying to find accurate information about Markdown; it's actually harming the project's future. </p>
+</blockquote>
+<p>In late 2012, David Greenspan from <a href="https://www.meteor.com/">Meteor</a> approached me and proposed we move forward, and <a href="http://blog.codinghorror.com/the-future-of-markdown/">a project crystallized</a>:</p>
+<blockquote>
+<p>I propose that Stack Exchange, GitHub, Meteor, Reddit, and any other company with lots of traffic and a strategic investment in Markdown, all work together to <strong>come up with an official Markdown specification, and standard test suites to validate Markdown implementations</strong>. We've all been working at cross purposes for too long, accidentally fragmenting Markdown while popularizing it.</p>
+</blockquote>
+<p>We formed a small private working group with key representatives from GitHub, from Reddit, from Stack Exchange, from the open source community. We spent months hashing out the details and agreeing on the necessary changes to turn Markdown into a language you can parse without feeling like you just walked through a sewer &ndash; while preserving the simple, clear, ASCII email inspired spirit of Markdown.</p>
+<p>We really struggled with this at <a href="http://www.discourse.org">Discourse</a>, which is also based on Markdown, but an even more complex dialect than the one we built at Stack Overflow. In Discourse, you can mix <em>three</em> forms of markup interchangeably:</p>
+<ul>
+<li>Markdown</li>
+<li>HTML (safe subset)</li>
+<li>BBCode (subset)</li>
+</ul>
+<p>Discourse is primarily a JavaScript app, so naturally we needed a nice, compliant implementation of Markdown in JavaScript. Surely such a thing exists, yes? Nope. Even in 2012, we found <em>zero</em> JavaScript implementations of Markdown that could pass the only Markdown test suite I know of, <a href="https://github.com/michelf/mdtest/">MDTest</a>. It isn't authoritative, it's a community created initiative that embodies its own decisions about rendering ambiguities in Markdown, but it's all we've got. We contributed many <a href="https://github.com/evilstreak/markdown-js/commits/master">upstream fixes to markdown.js</a> to make it pass MDTest &ndash; but it still only passes in our locally extended version.</p>
+<p>As an open source project ourselves, we're perfectly happy contributing upstream code to improve it for everyone. But it's an indictment of the state of the Markdown ecosystem that any remotely popular implementation wasn't already testing itself against a formal spec and test suite. But who can blame them, because <i>it didn't exist!</i></p>
+<p>Well, now it does.</p>
+<p>It took a while, but I'm pleased to announce that <a href="http://standardmarkdown.com"><strong>Standard Markdown</strong></a> is now finally ready for public review.</p>
+<p><strong><a href="http://standardmarkdown.com">standardmarkdown.com</a></strong></p>
+<p>It's a spec, including embedded examples, and implementations in portable C and JavaScript. We strived mightily to stay true to the spirit of Markdown in writing it. The primary author, John MacFarlane, <a href="http://spec.standardmarkdown.com">explains in the introduction to the spec</a>:</p>
+<blockquote>
+<p>Because Gruber’s syntax description leaves many aspects of the syntax undetermined, writing a precise spec requires making a large number of decisions, many of them somewhat arbitrary. In making them, I have appealed to existing conventions and considerations of simplicity, readability, expressive power, and consistency. I have tried to ensure that “normal” documents in the many incompatible existing implementations of markdown will render, as far as possible, as their authors intended. And I have tried to make the rules for different elements work together harmoniously. In places where different decisions could have been made (for example, the rules governing list indentation), I have explained the rationale for my choices. In a few cases, I have departed slightly from the canonical syntax description, in ways that I think further the goals of markdown as stated in that description.</p>
+</blockquote>
+<p>Part of my contribution to the project is to host the discussion / mailing list for Standard Markdown in a Discourse instance. </p>
+<p><strong><a href="http://talk.standardmarkdown.com">talk.standardmarkdown.com</a></strong></p>
+<p>Fortunately, Discourse itself <a href="http://blog.discourse.org/2014/08/introducing-discourse-1-0/">just reached version 1.0</a>. If the only thing Standard Markdown does is help save a few users from the continuing horror that is mailing list web UI, we all win.</p>
+<p>What I'm most excited about is that we got a massive contribution from the one person who, in my mind, was the most perfect person in the world to work on this project: <a href="http://johnmacfarlane.net/">John MacFarlane</a>. He took our feedback and wrote the entire Standard Markdown spec and both implementations.</p>
+<p><a href="http://johnmacfarlane.net/"><img src="/content/images/2014/Sep/JohnPinhole.jpg" alt="" title=""/></a></p>
+<p>A lot of people know of John through his <a href="http://johnmacfarlane.net/pandoc/">Pandoc</a> project, which is amazing in its own right, but I found out about him because he built <a href="http://johnmacfarlane.net/babelmark2/faq.html">Babelmark</a>. I learned to refer to Babelmark extensively while working on Stack Overflow and MarkdownSharp, a C# implementation of Markdown.</p>
+<p>Here's how crazy Markdown is: to decide what the "correct" behavior is, you provide sample Markdown input to 20+ different Markdown parsers &hellip; and then pray that some consensus emerges in all their output. That's what Babelmark does.</p>
+<p>Consider this simple Markdown example:</p>
+<pre><code># Hello there
+This is a paragraph.
+- one
+- two
+- three
+- four
+1. pirate
+2. ninja
+3. zombie
+</code></pre>
+<p>Just for that, I count <a href="http://johnmacfarlane.net/babelmark2/?text=%23+Hello+there%0A%0AThis+is+a+paragraph.%0A%0A-+one%0A-+two%0A-+three%0A-+four%0A%0A1.+pirate%0A2.+ninja%0A3.+zombie"><em>fifteen</em> different rendered outputs</a> from 22 different Markdown parsers.</p>
+<p><a href="http://en.wikipedia.org/wiki/Tower_of_Babel"><img src="/content/images/2014/Sep/Confusion_of_Tongues.png" alt="" title=""/></a></p>
+<p>In Markdown, we <em>literally</em> built a <a href="http://en.wikipedia.org/wiki/Tower_of_Babel">Tower of Babel</a>. </p>
+<p>Have I mentioned that it's a good idea for a language to have a formal specification and test suites? Maybe now you can see why that is.</p>
+<p>Oh, and in his spare time, John is also the chair of the department of philosophy at the University of California, Berkeley. <em>No big deal.</em> While I don't mean to minimize the contributions of anyone to the Standard Markdown project, we all owe a special thanks to John.</p>
+<p>Markdown is indeed everywhere. And that's a good thing. But it needs to be sane, parseable, and standard. That's the goal of <a href="http://standardmarkdown.com/">Standard Markdown</a> &mdash; but we need your help to get there. If you use Markdown on a website, <strong>ask what it would take for that site to become compatible with Standard Markdown</strong>; when you see the word "Markdown" you have the right to expect consistent rendering across all the websites you visit. If you implement Markdown, <a href="http://spec.standardmarkdown.com">take a look at the spec</a>, try to <strong>make your parser compatible with Standard Markdown</strong>, and <a href="http://talk.standardmarkdown.com">discuss improvements or refinements</a> to the spec.</p>
+<p><span style="color:red;">Update:</span> The project was renamed <a href="http://commonmark.org">CommonMark</a>. See <a href="http://blog.codinghorror.com/standard-markdown-is-now-common-markdown/">my subsequent blog post</a>.</p>
+<table>
+<tr><td class="welovecodinghorror">
+[advertisement] How are you showing off your awesome? Create a <a href="http://careers.stackoverflow.com/cv" rel="nofollow">Stack Overflow Careers profile</a> and show off all of your hard work from Stack Overflow, Github, and virtually every other coding site. Who knows, you might even get recruited for a great <a href="http://careers.stackoverflow.com/jobs" rel="nofollow">new position</a>!
+</td></tr>
+</table>
+</section>
+<footer class="post-footer">
+<section class="author">
+<h4>Written by Jeff Atwood</h4>
+<p>Indoor enthusiast. Co-founder of Stack Exchange and Discourse. Disclaimer: I have no idea what I&#x27;m talking about. Find me here: <a href="http://twitter.com/codinghorror">http://twitter.com/codinghorror</a></p>
+</section>
+</footer>
+<div id="nrelate_related_placeholder"></div> <script async id="nrelate_loader_script" type="text/javascript" src="http://static.nrelate.com/common_js/0.52.1/loader.min.js"></script>
+</article>
+<div id="discourse-comments"></div>
+<script type="text/javascript">
+      var discourseUrl = "http://discourse.codinghorror.com/",
+          discourseEmbedUrl = 'http://blog.codinghorror.com/standard-flavored-markdown/';
+      (function() {
+        var d = document.createElement('script'); d.type = 'text/javascript'; d.async = true;
+          d.src = discourseUrl + 'javascripts/embed.js';
+        (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(d);
+      })();
+    </script>
+</main>
+<aside class="sidebar">
+<div id="carbonads-container"><div class="carbonad"><div id="azcarbon"></div><script type="text/javascript">var z = document.createElement("script"); z.type = "text/javascript"; z.async = true; z.src = "http://engine.carbonads.com/z/56742/azcarbon_2_1_0_VERT"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(z, s);</script></div></div>
+<div id="hireme" class="hireme codinghorror" style="min-height: 220px; margin-bottom: 15px;"></div>
+<script>
+        setTimeout(function () {
+            var a = document.createElement("script");
+            var b = document.getElementsByTagName('script')[0];
+            a.src = "http://careers.stackoverflow.com/ad/js";
+            a.async = true;
+            a.type = "text/javascript";
+            b.parentNode.insertBefore(a, b);
+        }, 5);
+    </script>
+<div class="welovecodinghorror" style="margin-bottom:15px">
+[ad] Enjoy the blog? Read <b><a href="http://www.hyperink.com/Effective-Programming-More-Than-Writing-Code-b1559">Effective Programming: More than Writing Code</a></b> and <b><a href="http://www.hyperink.com/How-To-Stop-Sucking-And-Be-Awesome-Instead-b9A74B5CBA6">How to Stop Sucking and Be Awesome Instead</a></b> on your Kindle, iPad, Nook, or as a PDF.
+</div>
+<h3>Resources</h3>
+<ul>
+<li><a href="/about-me/">About Me</a></li>
+<li><a href="http://twitter.com/codinghorror">@codinghorror</a></li>
+<li><a href="http://www.discourse.org/">discourse.org</a></li>
+<li><a href="http://stackexchange.com/">stackexchange.com</a></li>
+<li><a href="/recommended-reading-for-developers/">Recommended Reading</a></li>
+</ul>
+<ul>
+<li><a href="http://feeds.feedburner.com/codinghorror" class="icon-feed">&nbsp;Subscribe in a reader</a></li>
+<li><a href="http://feedburner.google.com/fb/a/mailverify?uri=codinghorror&amp;loc=en_US" class="icon-email">&nbsp;Subscribe via email</a></li>
+</ul>
+<p>Coding Horror has been continuously published since 2004</p>
+<ul>
+<li><img src="http://feeds.feedburner.com/~fc/codinghorror?bg=EEEEEE&amp;fg=111111&amp;anim=0" height="26" width="88" style="border:0" alt="Count of RSS readers"></li>
+<li><a href="http://my.statcounter.com/project/standard/stats.php?project_id=2600027&amp;guest=1">Traffic Stats</a></li>
+</ul>
+<footer class="site-footer">
+<section class="copyright">Copyright <a rel="author" href="https://profiles.google.com/codinghorror1">Jeff Atwood</a> &copy; 2014<br/>
+Logo image &copy; 1993 Steven C. McConnell <br/>
+Proudly published with <a class="icon-ghost" href="http://ghost.org">Ghost</a></section>
+</footer></aside>
+</div>
+<script src="/public/jquery.min.js?v=8684b6a35e"></script>
+<script type="text/javascript" src="/assets/js/jquery.fitvids.js?v=8684b6a35e"></script>
+<script type="text/javascript" src="/assets/js/index.js?v=8684b6a35e"></script>
+<script async src="http://www.statcounter.com/counter/counter.js"></script>
+<noscript><a href="http://www.statcounter.com/"><img src="http://c26.statcounter.com/counter.php?sc_project=2600027&amp;java=0&amp;security=dcff5548&amp;invisible=0" alt="web metrics"></a> </noscript>
+<script>
+    document.write(unescape("%3Cscript src='" + (document.location.protocol == "https:" ? "https://sb" : "http://b") + ".scorecardresearch.com/beacon.js'%3E%3C/script%3E"));
+    </script>
+<script>
+    COMSCORE.beacon({
+      c1: 2,
+      c2: "6035669",
+      c3: "",
+      c4: "http://www.codinghorror.com/blog/",
+      c5: "",
+      c6: "",
+      c15: ""
+    });
+    </script>
+<noscript>
+<img src="http://b.scorecardresearch.com/b?c1=2&amp;c2=6035669&amp;c3=&amp;c4=http%3A%2F%2Fwww.codinghorror.com%2Fblog%2F&amp;c5=&amp;c6=&amp;c15=&amp;cv=1.3&amp;cj=1" style="display:none" width="0" height="0" alt=""/>
+</noscript>
+<img src="/view.gif?page=/standard-flavored-markdown/" alt="" style="display:none" hidden />
+</body>
+</html>

data/spec/fixtures/images/Confusion_of_Tongues.png ADDED Viewed

Binary file

data/spec/fixtures/images/JohnPinhole.jpg ADDED Viewed

Binary file

data/spec/fixtures/nested_images.html ADDED Viewed

@@ -0,0 +1,11 @@
+<html>
+  <body>
+    <article>
+      <section>
+        <figure>
+          <img src="http://example.com/image.jpeg" />
+        </figure>
+      </section>
+    </article>
+  </body>
+</html>

data/spec/readability_spec.rb CHANGED Viewed

@@ -19,7 +19,7 @@ describe Readability do
         </body>
       </html>
     HTML
     @simple_html_with_img_no_text = <<-HTML
     <html>
       <head>
@@ -32,7 +32,7 @@ describe Readability do
       </body>
       </html>
     HTML
     @simple_html_with_img_in_noscript = <<-HTML
     <html>
       <head>
@@ -40,8 +40,8 @@ describe Readability do
       </head>
       <body class='main'>
         <div class="article-img">
-        <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
-        height="317" alt="test" class="lazy"
+        <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
+        height="317" alt="test" class="lazy"
         data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
         <noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
         </div>
@@ -54,30 +54,65 @@ describe Readability do
     before do
       @bbc      = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
       @nytimes  = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
-      @thesum   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
+      @thesun   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
+      @ch       = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
+      @nested   = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")
       FakeWeb::Registry.instance.clean_registry
       FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
                            :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
       FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
                            :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703711a.gif"))
-      FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
+      FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
                            :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
+      # Register images for codinghorror
+      FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
+                           :body => File.read(File.dirname(__FILE__) + "/fixtures/images/JohnPinhole.jpg"))
+      FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png',
+                           :body => File.read(File.dirname(__FILE__) + "/fixtures/images/Confusion_of_Tongues.png"))
     end
     it "should show one image, but outside of the best candidate" do
-      @doc = Readability::Document.new(@thesum)
-      @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"]
-      @doc.best_candidate_has_image.should == false
+      @doc = Readability::Document.new(@thesun)
+      expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"])
+      expect(@doc.best_candidate_has_image).to eq(false)
     end
     it "should show one image inside of the best candidate" do
       @doc = Readability::Document.new(@nytimes)
-      @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
-      @doc.best_candidate_has_image.should == true
+      expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
+      expect(@doc.best_candidate_has_image).to eq(true)
+    end
+    it "should expand relative image url" do
+      url = 'http://blog.codinghorror.com/standard-flavored-markdown/'
+      @doc = Readability::Document.new(@ch, tags: %w[div p img a],
+                                            attributes: %w[src href],
+                                            remove_empty_nodes: false)
+      @doc.images_with_fqdn_uris!(url)
+      expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg')
+      expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png')
+      expect(@doc.images).to match_array([
+        'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
+        'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png'
+      ])
+    end
+    it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
+      @doc = Readability::Document.new(@nested, attributes: ["src"])
+      expect(@doc.images).to be_empty
+      @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
+      expect(@doc.images).to be_empty
+      @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
+      expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
     end
     it "should not try to download local images" do
@@ -93,69 +128,69 @@ describe Readability do
           </body>
         </html>
       HTML
-      do_not_allow(@doc).load_image(anything)
-      @doc.images.should == []
+      expect(@doc).not_to receive(:get_image_size)
+      expect(@doc.images).to eq([])
     end
     describe "no images" do
       it "shouldn't show images" do
         @doc = Readability::Document.new(@bbc, :min_image_height => 600)
-        @doc.images.should == []
-        @doc.best_candidate_has_image.should == false
+        expect(@doc.images).to eq([])
+        expect(@doc.best_candidate_has_image).to eq(false)
       end
     end
     describe "poll of images" do
       it "should show some images inside of the best candidate" do
         @doc = Readability::Document.new(@bbc)
-        @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
+        expect(@doc.images).to match_array(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
                                "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
                                "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
-                               "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
-        @doc.best_candidate_has_image.should == true
+                               "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
+        expect(@doc.best_candidate_has_image).to eq(true)
       end
       it "should show some images inside of the best candidate, include gif format" do
         @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
-        @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
-        @doc.best_candidate_has_image.should == true
+        expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
+        expect(@doc.best_candidate_has_image).to eq(true)
       end
       describe "width, height and format" do
         it "should show some images inside of the best candidate, but with width most equal to 400px" do
           @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
-          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
-          @doc.best_candidate_has_image.should == true
+          expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"])
+          expect(@doc.best_candidate_has_image).to eq(true)
         end
         it "should show some images inside of the best candidate, but with width most equal to 304px" do
           @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
-          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
-          @doc.best_candidate_has_image.should == true
+          expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
+          expect(@doc.best_candidate_has_image).to eq(true)
         end
         it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
           @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
-          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
-          @doc.best_candidate_has_image.should == true
+          expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
+          expect(@doc.best_candidate_has_image).to eq(true)
         end
         it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
           @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
-          @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
-          @doc.best_candidate_has_image.should == true
+          expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
+          expect(@doc.best_candidate_has_image).to eq(true)
         end
         it "should not miss an image if it exists by itself in a div without text" do
           @doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
-          @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
+          expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
         end
         it "should not double count an image between script and noscript" do
           @doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
-          @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
+          expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
         end
       end
     end
   end
@@ -167,11 +202,11 @@ describe Readability do
     end
     it "should transform divs containing no block elements into <p>s" do
-      @doc.html.css("#body").first.name.should == "p"
+      expect(@doc.html.css("#body").first.name).to eq("p")
     end
     it "should not transform divs that contain block elements" do
-      @doc.html.css("#contains_blockquote").first.name.should == "div"
+      expect(@doc.html.css("#contains_blockquote").first.name).to eq("div")
     end
   end
@@ -185,9 +220,9 @@ describe Readability do
           <body></body>
         </html>
       HTML
-      doc.author.should eql("Austin Fonacier")
+      expect(doc.author).to eql("Austin Fonacier")
     end
     it "should pick up readability's recommended author format" do
       doc = Readability::Document.new(<<-HTML)
         <html>
@@ -200,9 +235,9 @@ describe Readability do
           </body>
         </html>
       HTML
-      doc.author.should eql("Austin Fonacier")
+      expect(doc.author).to eql("Austin Fonacier")
     end
     it "should pick up vcard fn" do
       doc = Readability::Document.new(<<-HTML)
         <html>
@@ -216,9 +251,9 @@ describe Readability do
           </body>
         </html>
       HTML
-      doc.author.should eql("Austin Fonacier")
+      expect(doc.author).to eql("Austin Fonacier")
     end
     it "should pick up <a rel='author'>" do
       doc = Readability::Document.new(<<-HTML)
         <html>
@@ -228,9 +263,9 @@ describe Readability do
           </body>
         </html>
       HTML
-      doc.author.should eql("Danny Banks (rel)")
+      expect(doc.author).to eql("Danny Banks (rel)")
     end
     it "should pick up <div id='author'>" do
       doc = Readability::Document.new(<<-HTML)
         <html>
@@ -240,7 +275,7 @@ describe Readability do
           </body>
         </html>
       HTML
-      doc.author.should eql("Austin Fonacier (author)")
+      expect(doc.author).to eql("Austin Fonacier (author)")
     end
   end
@@ -263,15 +298,15 @@ describe Readability do
     end
     it "should like <div>s more than <th>s" do
-      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
+      expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
     end
     it "should like classes like text more than classes like comment" do
       @elem2.name = "div"
-      @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
+      expect(@doc.score_node(@elem1)[:content_score]).to eq(@doc.score_node(@elem2)[:content_score])
       @elem1['class'] = "text"
       @elem2['class'] = "comment"
-      @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
+      expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
     end
   end
@@ -282,15 +317,15 @@ describe Readability do
     end
     it "should remove things that have class comment" do
-      @doc.html.inner_html.should_not =~ /a comment/
+      expect(@doc.html.inner_html).not_to match(/a comment/)
     end
     it "should not remove body tags" do
-      @doc.html.inner_html.should =~ /<\/body>/
+      expect(@doc.html.inner_html).to match(/<\/body>/)
     end
     it "should not remove things with class comment and id body" do
-      @doc.html.inner_html.should =~ /real content/
+      expect(@doc.html.inner_html).to match(/real content/)
     end
   end
@@ -318,13 +353,13 @@ describe Readability do
     end
     it "should score elements in the document" do
-      @candidates.values.length.should == 3
+      expect(@candidates.values.length).to eq(3)
     end
     it "should prefer the body in this particular example" do
-      @candidates.values.sort { |a, b|
+      expect(@candidates.values.sort { |a, b|
         b[:content_score] <=> a[:content_score]
-      }.first[:elem][:id].should == "body"
+      }.first[:elem][:id]).to eq("body")
     end
     context "when two consequent br tags are used instead of p" do
@@ -349,9 +384,129 @@ describe Readability do
           </html>
         HTML
         @candidates = @doc.score_paragraphs(0)
-        @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
+        expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
       end
     end
+    it "does not include short paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Too short</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Too short")
+    end
+    it "includes long paragraphs as related siblings in the output" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This paragraph is longer")
+    end
+    it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              <p>Paragraph 2</p>
+            </section>
+            <section>
+              <p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).not_to include("Although this paragraph")
+    end
+    it "does include non-paragraph tags in the output if their content score is high enough" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This should be included in the output because the content is score is high enough.<p>
+              <p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("This should be included")
+    end
+    it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <section>
+              <p>Paragraph 1</p>
+              #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+            </section>
+            <section>
+              <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
+              <p>The likely_siblings now include the section tag so it should be included in the output.</p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("should be included")
+    end
   end
   describe "the cant_read.html fixture" do
@@ -359,7 +514,7 @@ describe Readability do
       allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
       allowed_attributes = %w[href]
       html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
-      Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
+      expect(Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content).to match(/Can you talk a little about how you developed the looks for the/)
     end
   end
@@ -370,15 +525,15 @@ describe Readability do
     end
     it "should return the main page content" do
-      @doc.content.should match("Some content")
+      expect(@doc.content).to match("Some content")
     end
     it "should return the page title if present" do
-      @doc.title.should match("title!")
+      expect(@doc.title).to match("title!")
       doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
                                        :min_text_length => 0, :retry_length => 1)
-      doc.title.should be_nil
+      expect(doc.title).to be_nil
     end
   end
@@ -389,7 +544,7 @@ describe Readability do
     end
     it "should not return the sidebar" do
-      @doc.content.should_not match("sidebar")
+      expect(@doc.content).not_to match("sidebar")
     end
   end
@@ -407,7 +562,7 @@ describe Readability do
     end
     it "should not return the sidebar" do
-      @doc.content.should_not match("a b c d f")
+      expect(@doc.content).not_to match("a b c d f")
     end
   end
@@ -427,12 +582,12 @@ describe Readability do
         #puts "testing #{sample}..."
         $required_fragments.each do |required_text|
-          doc.should include(required_text)
+          expect(doc).to include(required_text)
           checks += 1
         end
         $excluded_fragments.each do |text_to_avoid|
-          doc.should_not include(text_to_avoid)
+          expect(doc).not_to include(text_to_avoid)
           checks += 1
         end
       end
@@ -446,18 +601,18 @@ describe Readability do
         it "should correctly guess and enforce HTML encoding" do
           doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
           content = doc.content
-          content.encoding.to_s.should == "ISO-8859-1"
-          content.should be_valid_encoding
+          expect(content.encoding.to_s).to eq("ISO-8859-1")
+          expect(content).to be_valid_encoding
         end
         it "should allow encoding guessing to be skipped" do
-          do_not_allow(GuessHtmlEncoding).encode
+          expect(GuessHtmlEncoding).to_not receive(:encode)
           doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
           doc.content
         end
         it "should allow encoding guessing to be overridden" do
-          do_not_allow(GuessHtmlEncoding).encode
+          expect(GuessHtmlEncoding).to_not receive(:encode)
           doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
           doc.content
         end
@@ -469,54 +624,54 @@ describe Readability do
     it "should strip the html comments tag" do
       doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
       content = doc.content
-      content.should include("hi!")
-      content.should_not include("bye")
+      expect(content).to include("hi!")
+      expect(content).not_to include("bye")
     end
     it "should not error with empty content" do
-      Readability::Document.new('').content.should == '<div><div></div></div>'
+      expect(Readability::Document.new('').content).to eq('<div><div></div></div>')
     end
     it "should not error with a document with no <body>" do
-      Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
+      expect(Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content).to eq('<div><div></div></div>')
     end
   end
   describe "No side-effects" do
     before do
       @bbc      = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
       @nytimes  = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
-      @thesum   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
+      @thesun   = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
     end
     it "should not have any side-effects when calling content() and then images()" do
-      @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
+      @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
       :do_not_guess_encoding => true)
-      @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
+      expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
       @doc.content
-      @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
+      expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
     end
     it "should not have any side-effects when calling content() multiple times" do
-       @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
+       @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
         :do_not_guess_encoding => true)
-       @doc.content.should ==  @doc.content
+       expect(@doc.content).to eq(@doc.content)
     end
     it "should not have any side-effects when calling content and images multiple times" do
-       @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
+       @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
         :do_not_guess_encoding => true)
-       @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
-       @doc.content.should ==  @doc.content
-       @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
+       expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
+       expect(@doc.content).to eq(@doc.content)
+       expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
     end
   end
   describe "Code blocks" do
     before do
       @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
-      @content  = Readability::Document.new(@code,
+      @content  = Readability::Document.new(@code,
                                         :tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
                                         :attributes => %w[src href],
                                         :remove_empty_nodes => false).content
@@ -524,29 +679,29 @@ describe Readability do
     end
     it "preserve the code blocks" do
-      @doc.css("code pre").text.should == "\nroot\n  indented\n    "
+      expect(@doc.css("code pre").text).to eq("\nroot\n  indented\n    ")
     end
     it "preserve backwards code blocks" do
-      @doc.css("pre code").text.should == "\nsecond\n  indented\n    "
+      expect(@doc.css("pre code").text).to eq("\nsecond\n  indented\n    ")
     end
   end
   describe "remove all tags" do
     it "should work for an incomplete piece of HTML" do
       doc = Readability::Document.new('<div>test</div', :tags => [])
-      doc.content.should == 'test'
+      expect(doc.content).to eq('test')
     end
     it "should work for a HTML document" do
       doc = Readability::Document.new('<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>',
                                       :tags => [])
-      doc.content.should == 'test'
+      expect(doc.content).to eq('test')
     end
     it "should work for a plain text" do
       doc = Readability::Document.new('test', :tags => [])
-      doc.content.should == 'test'
+      expect(doc.content).to eq('test')
     end
   end
@@ -563,8 +718,8 @@ describe Readability do
       doc = Readability::Document.new(boing_boing)
       content = doc.content
-      (content !~ /Bees and Bombs/).should == true
-      content.should =~ /ADVERTISE/
+      expect(content !~ /Bees and Bombs/).to eq(true)
+      expect(content).to match(/ADVERTISE/)
     end
     it "should apply whitelist" do
@@ -572,13 +727,13 @@ describe Readability do
       doc = Readability::Document.new(boing_boing,
                                       whitelist: ".post-content")
       content = doc.content
-      content.should =~ /Bees and Bombs/
+      expect(content).to match(/Bees and Bombs/)
     end
     it "should apply blacklist" do
       doc = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock")
       content = doc.content
-      (content !~ /ADVERTISE/).should == true
+      expect(content !~ /ADVERTISE/).to eq(true)
     end
   end

data/spec/spec_helper.rb CHANGED Viewed

@@ -1,11 +1,5 @@
 require 'rubygems'
 require 'readability'
-require 'rr'
 require 'fakeweb'
 FakeWeb.allow_net_connect = false
-RSpec.configure do |config|
-  config.mock_with :rr
-end

metadata CHANGED Viewed

@@ -1,86 +1,72 @@
 --- !ruby/object:Gem::Specification
 name: ruby-readability
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  version: 0.7.1
 platform: ruby
 authors:
 - Andrew Cantino
 - starrhorne
 - libc
 - Kyle Maxwell
-autorequire:
+autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-08-17 00:00:00.000000000 Z
+date: 2024-06-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '2.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '2.8'
 - !ruby/object:Gem::Dependency
   name: rspec-expectations
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '2.8'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: '2.8'
-- !ruby/object:Gem::Dependency
-  name: rr
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: '1.0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '>='
-      - !ruby/object:Gem::Version
-        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.0
 - !ruby/object:Gem::Dependency
   name: guess_html_encoding
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.0.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '>='
+    - - ">="
       - !ruby/object:Gem::Version
         version: 0.0.4
 description: Port of arc90's readability project to ruby
@@ -91,10 +77,10 @@ executables:
 extensions: []
 extra_rdoc_files: []
 files:
-- .gitignore
-- .rspec
-- .travis.yml
-- .yardopts
+- ".github/workflows/ruby.yml"
+- ".gitignore"
+- ".rspec"
+- ".yardopts"
 - Gemfile
 - Guardfile
 - LICENSE
@@ -108,9 +94,13 @@ files:
 - spec/fixtures/boing_boing.html
 - spec/fixtures/cant_read.html
 - spec/fixtures/code.html
+- spec/fixtures/codinghorror.html
+- spec/fixtures/images/Confusion_of_Tongues.png
+- spec/fixtures/images/JohnPinhole.jpg
 - spec/fixtures/images/dim_1416768a.jpg
 - spec/fixtures/images/sign_up_emails_682__703711a.gif
 - spec/fixtures/images/sign_up_emails_682__703712a.gif
+- spec/fixtures/nested_images.html
 - spec/fixtures/nytimes.html
 - spec/fixtures/sample.html
 - spec/fixtures/samples/blogpost_with_links-fragments.rb
@@ -129,24 +119,23 @@ files:
 homepage: http://github.com/cantino/ruby-readability
 licenses: []
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project: ruby-readability
-rubygems_version: 2.0.3
-signing_key:
+rubygems_version: 3.5.10
+signing_key:
 specification_version: 4
 summary: Port of arc90's readability project to ruby
 test_files:
@@ -154,9 +143,13 @@ test_files:
 - spec/fixtures/boing_boing.html
 - spec/fixtures/cant_read.html
 - spec/fixtures/code.html
+- spec/fixtures/codinghorror.html
+- spec/fixtures/images/Confusion_of_Tongues.png
+- spec/fixtures/images/JohnPinhole.jpg
 - spec/fixtures/images/dim_1416768a.jpg
 - spec/fixtures/images/sign_up_emails_682__703711a.gif
 - spec/fixtures/images/sign_up_emails_682__703712a.gif
+- spec/fixtures/nested_images.html
 - spec/fixtures/nytimes.html
 - spec/fixtures/sample.html
 - spec/fixtures/samples/blogpost_with_links-fragments.rb

data/.travis.yml DELETED Viewed

@@ -1,6 +0,0 @@
-language: ruby
-rvm:
-  - 2.1.1
-  - 2.0.0
-  - 1.9.3
-script: "bundle exec rspec"