ruby-readability 0.7.0 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: b9f4f443e32b774c8c2b14856c78e7c593c6ef41
4
- data.tar.gz: 3f6916bfc9b1c88c3c45f5e839fe0e2a4b882ab5
2
+ SHA256:
3
+ metadata.gz: 906a25fd00e8fc221c84aa41fedf38bbd3045aa0e4a543ff16a1d494e59c3a92
4
+ data.tar.gz: bf28e458f7fb7f87a49ea71f16e736191c53130b91bdf2203cf260e6dce99aee
5
5
  SHA512:
6
- metadata.gz: fdf2bb73b0ff4db4617c34996e72f23465d33d90a7631eaaa979235fd8f1f8c529dcf39f7930dc447df72e35e640726b0a3567e3cf0abdafb1ab88e46eb4e3ac
7
- data.tar.gz: e75ebfeb153e89fbe52e94e0eab2f33865b32c75ed89e5411387d2cfa6a2f92d0671ecc000229d1ac3cf2027d18e7b7050053c32ab44dca05c8f9a35b20a1194
6
+ metadata.gz: e2d262b6c4f0d7a2146718d3e16c0dd8973b217a9fe0ba850d03a456c68b7bd4355cbdd0a78454b09f6f50717c87ac8da524d42d99e78e0f362830c554376fdd
7
+ data.tar.gz: 6306f195c8d40842c0a4ed8ab2cfab1648fc562b03ba3137a0fd8c68ecb7a3668357c83abefd2b76bcac06efc961cdd042be10f44760aa102e34cdce2fe5d6d4
@@ -0,0 +1,25 @@
1
+ name: Ruby
2
+
3
+ on:
4
+ push:
5
+ branches: [ master ]
6
+ pull_request:
7
+ branches: [ master ]
8
+
9
+ jobs:
10
+ test:
11
+
12
+ runs-on: ubuntu-latest
13
+ strategy:
14
+ matrix:
15
+ ruby-version: ['2.7']
16
+
17
+ steps:
18
+ - uses: actions/checkout@v2
19
+ - name: Set up Ruby
20
+ uses: ruby/setup-ruby@v1
21
+ with:
22
+ ruby-version: ${{ matrix.ruby-version }}
23
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
24
+ - name: Run tests
25
+ run: bundle exec rspec
data/.rspec CHANGED
@@ -1,2 +1,2 @@
1
1
  --colour
2
- --format s -c
2
+ --format documentation -c
data/README.md CHANGED
@@ -7,7 +7,7 @@ webpage. It is a Ruby port of arc90's readability project.
7
7
  Build Status
8
8
  ------------
9
9
 
10
- [![Build Status](https://travis-ci.org/cantino/ruby-readability.png)](https://travis-ci.org/cantino/ruby-readability)
10
+ [![Ruby](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml/badge.svg?branch=master)](https://github.com/cantino/ruby-readability/actions/workflows/ruby.yml)
11
11
 
12
12
  Install
13
13
  -------
@@ -78,6 +78,7 @@ feature requires that the `fastimage` gem be installed.
78
78
  Related Projects
79
79
  ----------------
80
80
 
81
+ * [readability.cr](https://github.com/joenas/readability.cr) - Port of ruby-readability's port of arc90's readability project to Crystal
81
82
  * [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
82
83
 
83
84
  Potential Issues
@@ -102,7 +103,3 @@ License
102
103
  This code is under the Apache License 2.0. See <http://www.apache.org/licenses/LICENSE-2.0>.
103
104
 
104
105
  Ruby port by cantino, starrhorne, libc, and iterationlabs. Special thanks to fizx and marcosinger.
105
-
106
-
107
- [![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/cantino/ruby-readability/trend.png)](https://bitdeli.com/free "Bitdeli Badge")
108
-
data/lib/readability.rb CHANGED
@@ -17,7 +17,9 @@ module Readability
17
17
  :min_image_height => 80,
18
18
  :ignore_image_format => [],
19
19
  :blacklist => nil,
20
- :whitelist => nil
20
+ :whitelist => nil,
21
+ :elements_to_score => ["p", "td", "pre"],
22
+ :likely_siblings => ["p"]
21
23
  }.freeze
22
24
 
23
25
  REGEXES = {
@@ -260,13 +262,14 @@ module Readability
260
262
  # Things like preambles, content split by ads that we removed, etc.
261
263
 
262
264
  sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
265
+ downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
263
266
  output = Nokogiri::XML::Node.new('div', @html)
264
267
  best_candidate[:elem].parent.children.each do |sibling|
265
268
  append = false
266
269
  append = true if sibling == best_candidate[:elem]
267
270
  append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
268
271
 
269
- if sibling.name.downcase == "p"
272
+ if downcased_likely_siblings.include?(sibling.name.downcase)
270
273
  link_density = get_link_density(sibling)
271
274
  node_content = sibling.text
272
275
  node_length = node_content.length
@@ -310,7 +313,7 @@ module Readability
310
313
 
311
314
  def score_paragraphs(min_text_length)
312
315
  candidates = {}
313
- @html.css("p,td").each do |elem|
316
+ @html.css(options[:elements_to_score].join(',')).each do |elem|
314
317
  parent_node = elem.parent
315
318
  grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil
316
319
  inner_text = elem.text
@@ -423,6 +426,8 @@ module Readability
423
426
 
424
427
  # We'll sanitize all elements using a whitelist
425
428
  base_whitelist = @options[:tags] || %w[div p]
429
+ all_whitelisted = base_whitelist.include?("*")
430
+
426
431
  # We'll add whitespace instead of block elements,
427
432
  # so a<br>b will have a nice space between them
428
433
  base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
@@ -435,7 +440,7 @@ module Readability
435
440
 
436
441
  ([node] + node.css("*")).each do |el|
437
442
  # If element is in whitelist, delete all its attributes
438
- if whitelist[el.node_name]
443
+ if all_whitelisted || whitelist[el.node_name]
439
444
  el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
440
445
 
441
446
  # Otherwise, replace the element with its contents
@@ -3,15 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "ruby-readability"
6
- s.version = '0.7.0'
6
+ s.version = '0.7.1'
7
7
  s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
8
8
  s.email = ["andrew@iterationlabs.com"]
9
9
  s.homepage = "http://github.com/cantino/ruby-readability"
10
10
  s.summary = %q{Port of arc90's readability project to ruby}
11
11
  s.description = %q{Port of arc90's readability project to ruby}
12
12
 
13
- s.rubyforge_project = "ruby-readability"
14
-
15
13
  s.files = `git ls-files`.split("\n")
16
14
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
17
15
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
@@ -19,7 +17,6 @@ Gem::Specification.new do |s|
19
17
 
20
18
  s.add_development_dependency "rspec", ">= 2.8"
21
19
  s.add_development_dependency "rspec-expectations", ">= 2.8"
22
- s.add_development_dependency "rr", ">= 1.0"
23
20
  s.add_dependency 'nokogiri', '>= 1.6.0'
24
21
  s.add_dependency 'guess_html_encoding', '>= 0.0.4'
25
22
  end
@@ -0,0 +1,189 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8"/>
5
+ <title>Standard Flavored Markdown</title>
6
+ <meta name="description" content=""/>
7
+ <meta name="HandheldFriendly" content="True"/>
8
+ <meta name="MobileOptimized" content="320"/>
9
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
10
+ <link rel="shortcut icon" href="/assets/images/favicon.ico?v=8684b6a35e">
11
+ <link rel="apple-touch-icon" href="/assets/images/codinghorror-app-icon.png?v=8684b6a35e">
12
+ <meta name="google-site-verification" content="sl0m9SU_4V0JcvjWlOX4dUFBR6VS2P4tlxjJMo0gphU"/>
13
+ <link rel="stylesheet" type="text/css" href="/assets/css/screen.css?v=8684b6a35e"/>
14
+ <link rel="stylesheet" type="text/css" href="//fonts.googleapis.com/css?family=Open+Sans:400italic,700italic,400,700"/>
15
+ <link rel="alternate" type="application/rss+xml" title="Coding Horror" href="http://feeds.feedburner.com/codinghorror">
16
+ </head>
17
+ <body class="post-template">
18
+ <header class="site-head">
19
+ <div class="site-head-content">
20
+ <a class="blog-logo" href="http://blog.codinghorror.com"><img src="/assets/images/codinghorror-app-icon.png?v=8684b6a35e" alt="Coding Horror Logo" width="158" height="158"/></a>
21
+ <h1 class="blog-title"><a href="http://blog.codinghorror.com">Coding Horror</a></h1>
22
+ <h2 class="blog-description">programming and human factors</h2>
23
+ <div class="site-search">
24
+ <script>
25
+ (function() {
26
+ var cx = '016956275695630057531:lqveu9tah7y';
27
+ var gcse = document.createElement('script');
28
+ gcse.type = 'text/javascript';
29
+ gcse.async = true;
30
+ gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') + '//www.google.com/cse/cse.js?cx=' + cx;
31
+ var s = document.getElementsByTagName('script')[0];
32
+ s.parentNode.insertBefore(gcse, s);
33
+ })();
34
+ </script>
35
+ <gcse:search></gcse:search>
36
+ </div>
37
+ </div>
38
+ </header>
39
+ <div class="wrap clearfix">
40
+ <div class="clearfix"></div>
41
+ <main class="content" role="main">
42
+ <article class="post">
43
+ <span class="post-meta"><time datetime="2014-09-03">03 Sep 2014</time> </span>
44
+ <h1 class="post-title">Standard Flavored Markdown</h1>
45
+ <section class="post-content">
46
+ <p>In 2009 I <a href="http://blog.codinghorror.com/responsible-open-source-code-parenting/">lamented the state of Markdown</a>:</p>
47
+ <blockquote>
48
+ <p>Right now we have the worst of both worlds. Lack of leadership from the top, and a bunch of fragmented, poorly coordinated community efforts to advance Markdown, none of which are officially canon. This isn't merely incovenient for anyone trying to find accurate information about Markdown; it's actually harming the project's future. </p>
49
+ </blockquote>
50
+ <p>In late 2012, David Greenspan from <a href="https://www.meteor.com/">Meteor</a> approached me and proposed we move forward, and <a href="http://blog.codinghorror.com/the-future-of-markdown/">a project crystallized</a>:</p>
51
+ <blockquote>
52
+ <p>I propose that Stack Exchange, GitHub, Meteor, Reddit, and any other company with lots of traffic and a strategic investment in Markdown, all work together to <strong>come up with an official Markdown specification, and standard test suites to validate Markdown implementations</strong>. We've all been working at cross purposes for too long, accidentally fragmenting Markdown while popularizing it.</p>
53
+ </blockquote>
54
+ <p>We formed a small private working group with key representatives from GitHub, from Reddit, from Stack Exchange, from the open source community. We spent months hashing out the details and agreeing on the necessary changes to turn Markdown into a language you can parse without feeling like you just walked through a sewer &ndash; while preserving the simple, clear, ASCII email inspired spirit of Markdown.</p>
55
+ <p>We really struggled with this at <a href="http://www.discourse.org">Discourse</a>, which is also based on Markdown, but an even more complex dialect than the one we built at Stack Overflow. In Discourse, you can mix <em>three</em> forms of markup interchangeably:</p>
56
+ <ul>
57
+ <li>Markdown</li>
58
+ <li>HTML (safe subset)</li>
59
+ <li>BBCode (subset)</li>
60
+ </ul>
61
+ <p>Discourse is primarily a JavaScript app, so naturally we needed a nice, compliant implementation of Markdown in JavaScript. Surely such a thing exists, yes? Nope. Even in 2012, we found <em>zero</em> JavaScript implementations of Markdown that could pass the only Markdown test suite I know of, <a href="https://github.com/michelf/mdtest/">MDTest</a>. It isn't authoritative, it's a community created initiative that embodies its own decisions about rendering ambiguities in Markdown, but it's all we've got. We contributed many <a href="https://github.com/evilstreak/markdown-js/commits/master">upstream fixes to markdown.js</a> to make it pass MDTest &ndash; but it still only passes in our locally extended version.</p>
62
+ <p>As an open source project ourselves, we're perfectly happy contributing upstream code to improve it for everyone. But it's an indictment of the state of the Markdown ecosystem that any remotely popular implementation wasn't already testing itself against a formal spec and test suite. But who can blame them, because <i>it didn't exist!</i></p>
63
+ <p>Well, now it does.</p>
64
+ <p>It took a while, but I'm pleased to announce that <a href="http://standardmarkdown.com"><strong>Standard Markdown</strong></a> is now finally ready for public review.</p>
65
+ <p><strong><a href="http://standardmarkdown.com">standardmarkdown.com</a></strong></p>
66
+ <p>It's a spec, including embedded examples, and implementations in portable C and JavaScript. We strived mightily to stay true to the spirit of Markdown in writing it. The primary author, John MacFarlane, <a href="http://spec.standardmarkdown.com">explains in the introduction to the spec</a>:</p>
67
+ <blockquote>
68
+ <p>Because Gruber’s syntax description leaves many aspects of the syntax undetermined, writing a precise spec requires making a large number of decisions, many of them somewhat arbitrary. In making them, I have appealed to existing conventions and considerations of simplicity, readability, expressive power, and consistency. I have tried to ensure that “normal” documents in the many incompatible existing implementations of markdown will render, as far as possible, as their authors intended. And I have tried to make the rules for different elements work together harmoniously. In places where different decisions could have been made (for example, the rules governing list indentation), I have explained the rationale for my choices. In a few cases, I have departed slightly from the canonical syntax description, in ways that I think further the goals of markdown as stated in that description.</p>
69
+ </blockquote>
70
+ <p>Part of my contribution to the project is to host the discussion / mailing list for Standard Markdown in a Discourse instance. </p>
71
+ <p><strong><a href="http://talk.standardmarkdown.com">talk.standardmarkdown.com</a></strong></p>
72
+ <p>Fortunately, Discourse itself <a href="http://blog.discourse.org/2014/08/introducing-discourse-1-0/">just reached version 1.0</a>. If the only thing Standard Markdown does is help save a few users from the continuing horror that is mailing list web UI, we all win.</p>
73
+ <p>What I'm most excited about is that we got a massive contribution from the one person who, in my mind, was the most perfect person in the world to work on this project: <a href="http://johnmacfarlane.net/">John MacFarlane</a>. He took our feedback and wrote the entire Standard Markdown spec and both implementations.</p>
74
+ <p><a href="http://johnmacfarlane.net/"><img src="/content/images/2014/Sep/JohnPinhole.jpg" alt="" title=""/></a></p>
75
+ <p>A lot of people know of John through his <a href="http://johnmacfarlane.net/pandoc/">Pandoc</a> project, which is amazing in its own right, but I found out about him because he built <a href="http://johnmacfarlane.net/babelmark2/faq.html">Babelmark</a>. I learned to refer to Babelmark extensively while working on Stack Overflow and MarkdownSharp, a C# implementation of Markdown.</p>
76
+ <p>Here's how crazy Markdown is: to decide what the "correct" behavior is, you provide sample Markdown input to 20+ different Markdown parsers &hellip; and then pray that some consensus emerges in all their output. That's what Babelmark does.</p>
77
+ <p>Consider this simple Markdown example:</p>
78
+ <pre><code># Hello there
79
+
80
+ This is a paragraph.
81
+
82
+ - one
83
+ - two
84
+ - three
85
+ - four
86
+
87
+ 1. pirate
88
+ 2. ninja
89
+ 3. zombie
90
+ </code></pre>
91
+ <p>Just for that, I count <a href="http://johnmacfarlane.net/babelmark2/?text=%23+Hello+there%0A%0AThis+is+a+paragraph.%0A%0A-+one%0A-+two%0A-+three%0A-+four%0A%0A1.+pirate%0A2.+ninja%0A3.+zombie"><em>fifteen</em> different rendered outputs</a> from 22 different Markdown parsers.</p>
92
+ <p><a href="http://en.wikipedia.org/wiki/Tower_of_Babel"><img src="/content/images/2014/Sep/Confusion_of_Tongues.png" alt="" title=""/></a></p>
93
+ <p>In Markdown, we <em>literally</em> built a <a href="http://en.wikipedia.org/wiki/Tower_of_Babel">Tower of Babel</a>. </p>
94
+ <p>Have I mentioned that it's a good idea for a language to have a formal specification and test suites? Maybe now you can see why that is.</p>
95
+ <p>Oh, and in his spare time, John is also the chair of the department of philosophy at the University of California, Berkeley. <em>No big deal.</em> While I don't mean to minimize the contributions of anyone to the Standard Markdown project, we all owe a special thanks to John.</p>
96
+ <p>Markdown is indeed everywhere. And that's a good thing. But it needs to be sane, parseable, and standard. That's the goal of <a href="http://standardmarkdown.com/">Standard Markdown</a> &mdash; but we need your help to get there. If you use Markdown on a website, <strong>ask what it would take for that site to become compatible with Standard Markdown</strong>; when you see the word "Markdown" you have the right to expect consistent rendering across all the websites you visit. If you implement Markdown, <a href="http://spec.standardmarkdown.com">take a look at the spec</a>, try to <strong>make your parser compatible with Standard Markdown</strong>, and <a href="http://talk.standardmarkdown.com">discuss improvements or refinements</a> to the spec.</p>
97
+ <p><span style="color:red;">Update:</span> The project was renamed <a href="http://commonmark.org">CommonMark</a>. See <a href="http://blog.codinghorror.com/standard-markdown-is-now-common-markdown/">my subsequent blog post</a>.</p>
98
+ <table>
99
+ <tr><td class="welovecodinghorror">
100
+ [advertisement] How are you showing off your awesome? Create a <a href="http://careers.stackoverflow.com/cv" rel="nofollow">Stack Overflow Careers profile</a> and show off all of your hard work from Stack Overflow, Github, and virtually every other coding site. Who knows, you might even get recruited for a great <a href="http://careers.stackoverflow.com/jobs" rel="nofollow">new position</a>!
101
+ </td></tr>
102
+ </table>
103
+ </section>
104
+ <footer class="post-footer">
105
+ <section class="author">
106
+ <h4>Written by Jeff Atwood</h4>
107
+ <p>Indoor enthusiast. Co-founder of Stack Exchange and Discourse. Disclaimer: I have no idea what I&#x27;m talking about. Find me here: <a href="http://twitter.com/codinghorror">http://twitter.com/codinghorror</a></p>
108
+ </section>
109
+ </footer>
110
+ <div id="nrelate_related_placeholder"></div> <script async id="nrelate_loader_script" type="text/javascript" src="http://static.nrelate.com/common_js/0.52.1/loader.min.js"></script>
111
+ </article>
112
+ <div id="discourse-comments"></div>
113
+ <script type="text/javascript">
114
+ var discourseUrl = "http://discourse.codinghorror.com/",
115
+ discourseEmbedUrl = 'http://blog.codinghorror.com/standard-flavored-markdown/';
116
+
117
+ (function() {
118
+ var d = document.createElement('script'); d.type = 'text/javascript'; d.async = true;
119
+ d.src = discourseUrl + 'javascripts/embed.js';
120
+ (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(d);
121
+ })();
122
+ </script>
123
+ </main>
124
+ <aside class="sidebar">
125
+
126
+ <div id="carbonads-container"><div class="carbonad"><div id="azcarbon"></div><script type="text/javascript">var z = document.createElement("script"); z.type = "text/javascript"; z.async = true; z.src = "http://engine.carbonads.com/z/56742/azcarbon_2_1_0_VERT"; var s = document.getElementsByTagName("script")[0]; s.parentNode.insertBefore(z, s);</script></div></div>
127
+ <div id="hireme" class="hireme codinghorror" style="min-height: 220px; margin-bottom: 15px;"></div>
128
+ <script>
129
+ setTimeout(function () {
130
+ var a = document.createElement("script");
131
+ var b = document.getElementsByTagName('script')[0];
132
+ a.src = "http://careers.stackoverflow.com/ad/js";
133
+ a.async = true;
134
+ a.type = "text/javascript";
135
+ b.parentNode.insertBefore(a, b);
136
+ }, 5);
137
+ </script>
138
+ <div class="welovecodinghorror" style="margin-bottom:15px">
139
+ [ad] Enjoy the blog? Read <b><a href="http://www.hyperink.com/Effective-Programming-More-Than-Writing-Code-b1559">Effective Programming: More than Writing Code</a></b> and <b><a href="http://www.hyperink.com/How-To-Stop-Sucking-And-Be-Awesome-Instead-b9A74B5CBA6">How to Stop Sucking and Be Awesome Instead</a></b> on your Kindle, iPad, Nook, or as a PDF.
140
+ </div>
141
+ <h3>Resources</h3>
142
+ <ul>
143
+ <li><a href="/about-me/">About Me</a></li>
144
+ <li><a href="http://twitter.com/codinghorror">@codinghorror</a></li>
145
+ <li><a href="http://www.discourse.org/">discourse.org</a></li>
146
+ <li><a href="http://stackexchange.com/">stackexchange.com</a></li>
147
+ <li><a href="/recommended-reading-for-developers/">Recommended Reading</a></li>
148
+ </ul>
149
+ <ul>
150
+ <li><a href="http://feeds.feedburner.com/codinghorror" class="icon-feed">&nbsp;Subscribe in a reader</a></li>
151
+ <li><a href="http://feedburner.google.com/fb/a/mailverify?uri=codinghorror&amp;loc=en_US" class="icon-email">&nbsp;Subscribe via email</a></li>
152
+ </ul>
153
+ <p>Coding Horror has been continuously published since 2004</p>
154
+ <ul>
155
+ <li><img src="http://feeds.feedburner.com/~fc/codinghorror?bg=EEEEEE&amp;fg=111111&amp;anim=0" height="26" width="88" style="border:0" alt="Count of RSS readers"></li>
156
+ <li><a href="http://my.statcounter.com/project/standard/stats.php?project_id=2600027&amp;guest=1">Traffic Stats</a></li>
157
+ </ul>
158
+ <footer class="site-footer">
159
+ <section class="copyright">Copyright <a rel="author" href="https://profiles.google.com/codinghorror1">Jeff Atwood</a> &copy; 2014<br/>
160
+ Logo image &copy; 1993 Steven C. McConnell <br/>
161
+ Proudly published with <a class="icon-ghost" href="http://ghost.org">Ghost</a></section>
162
+ </footer></aside>
163
+ </div>
164
+ <script src="/public/jquery.min.js?v=8684b6a35e"></script>
165
+ <script type="text/javascript" src="/assets/js/jquery.fitvids.js?v=8684b6a35e"></script>
166
+ <script type="text/javascript" src="/assets/js/index.js?v=8684b6a35e"></script>
167
+ <script async src="http://www.statcounter.com/counter/counter.js"></script>
168
+ <noscript><a href="http://www.statcounter.com/"><img src="http://c26.statcounter.com/counter.php?sc_project=2600027&amp;java=0&amp;security=dcff5548&amp;invisible=0" alt="web metrics"></a> </noscript>
169
+
170
+ <script>
171
+ document.write(unescape("%3Cscript src='" + (document.location.protocol == "https:" ? "https://sb" : "http://b") + ".scorecardresearch.com/beacon.js'%3E%3C/script%3E"));
172
+ </script>
173
+ <script>
174
+ COMSCORE.beacon({
175
+ c1: 2,
176
+ c2: "6035669",
177
+ c3: "",
178
+ c4: "http://www.codinghorror.com/blog/",
179
+ c5: "",
180
+ c6: "",
181
+ c15: ""
182
+ });
183
+ </script>
184
+ <noscript>
185
+ <img src="http://b.scorecardresearch.com/b?c1=2&amp;c2=6035669&amp;c3=&amp;c4=http%3A%2F%2Fwww.codinghorror.com%2Fblog%2F&amp;c5=&amp;c6=&amp;c15=&amp;cv=1.3&amp;cj=1" style="display:none" width="0" height="0" alt=""/>
186
+ </noscript>
187
+ <img src="/view.gif?page=/standard-flavored-markdown/" alt="" style="display:none" hidden />
188
+ </body>
189
+ </html>
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <body>
3
+ <article>
4
+ <section>
5
+ <figure>
6
+ <img src="http://example.com/image.jpeg" />
7
+ </figure>
8
+ </section>
9
+ </article>
10
+ </body>
11
+ </html>
@@ -19,7 +19,7 @@ describe Readability do
19
19
  </body>
20
20
  </html>
21
21
  HTML
22
-
22
+
23
23
  @simple_html_with_img_no_text = <<-HTML
24
24
  <html>
25
25
  <head>
@@ -32,7 +32,7 @@ describe Readability do
32
32
  </body>
33
33
  </html>
34
34
  HTML
35
-
35
+
36
36
  @simple_html_with_img_in_noscript = <<-HTML
37
37
  <html>
38
38
  <head>
@@ -40,8 +40,8 @@ describe Readability do
40
40
  </head>
41
41
  <body class='main'>
42
42
  <div class="article-img">
43
- <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
- height="317" alt="test" class="lazy"
43
+ <img src="http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif" width="660"
44
+ height="317" alt="test" class="lazy"
45
45
  data-original="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg">
46
46
  <noscript><img src="http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"></noscript>
47
47
  </div>
@@ -54,30 +54,65 @@ describe Readability do
54
54
  before do
55
55
  @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
56
56
  @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
57
- @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
57
+ @thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
58
+ @ch = File.read(File.dirname(__FILE__) + "/fixtures/codinghorror.html")
59
+ @nested = File.read(File.dirname(__FILE__) + "/fixtures/nested_images.html")
58
60
 
59
61
  FakeWeb::Registry.instance.clean_registry
62
+
60
63
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg",
61
64
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/dim_1416768a.jpg"))
62
-
65
+
63
66
  FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif",
64
67
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703711a.gif"))
65
-
66
- FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
68
+
69
+ FakeWeb.register_uri(:get, "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif",
67
70
  :body => File.read(File.dirname(__FILE__) + "/fixtures/images/sign_up_emails_682__703712a.gif"))
68
-
71
+
72
+ # Register images for codinghorror
73
+ FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
74
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/JohnPinhole.jpg"))
75
+ FakeWeb.register_uri(:get, 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png',
76
+ :body => File.read(File.dirname(__FILE__) + "/fixtures/images/Confusion_of_Tongues.png"))
69
77
  end
70
78
 
71
79
  it "should show one image, but outside of the best candidate" do
72
- @doc = Readability::Document.new(@thesum)
73
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"]
74
- @doc.best_candidate_has_image.should == false
80
+ @doc = Readability::Document.new(@thesun)
81
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703712a.gif"])
82
+ expect(@doc.best_candidate_has_image).to eq(false)
75
83
  end
76
84
 
77
85
  it "should show one image inside of the best candidate" do
78
86
  @doc = Readability::Document.new(@nytimes)
79
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
80
- @doc.best_candidate_has_image.should == true
87
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
88
+ expect(@doc.best_candidate_has_image).to eq(true)
89
+ end
90
+
91
+ it "should expand relative image url" do
92
+ url = 'http://blog.codinghorror.com/standard-flavored-markdown/'
93
+ @doc = Readability::Document.new(@ch, tags: %w[div p img a],
94
+ attributes: %w[src href],
95
+ remove_empty_nodes: false)
96
+ @doc.images_with_fqdn_uris!(url)
97
+
98
+ expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg')
99
+ expect(@doc.content).to include('http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png')
100
+
101
+ expect(@doc.images).to match_array([
102
+ 'http://blog.codinghorror.com/content/images/2014/Sep/JohnPinhole.jpg',
103
+ 'http://blog.codinghorror.com/content/images/2014/Sep/Confusion_of_Tongues.png'
104
+ ])
105
+ end
106
+
107
+ it "should be able to preserve deeply nested image tags in the article's content by whitelisting all tags" do
108
+ @doc = Readability::Document.new(@nested, attributes: ["src"])
109
+ expect(@doc.images).to be_empty
110
+
111
+ @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["figure", "image"])
112
+ expect(@doc.images).to be_empty
113
+
114
+ @doc = Readability::Document.new(@nested, attributes: ["src"], tags: ["*"])
115
+ expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
81
116
  end
82
117
 
83
118
  it "should not try to download local images" do
@@ -93,69 +128,69 @@ describe Readability do
93
128
  </body>
94
129
  </html>
95
130
  HTML
96
- do_not_allow(@doc).load_image(anything)
97
- @doc.images.should == []
131
+ expect(@doc).not_to receive(:get_image_size)
132
+ expect(@doc.images).to eq([])
98
133
  end
99
134
 
100
135
  describe "no images" do
101
136
  it "shouldn't show images" do
102
137
  @doc = Readability::Document.new(@bbc, :min_image_height => 600)
103
- @doc.images.should == []
104
- @doc.best_candidate_has_image.should == false
138
+ expect(@doc.images).to eq([])
139
+ expect(@doc.best_candidate_has_image).to eq(false)
105
140
  end
106
141
  end
107
142
 
108
143
  describe "poll of images" do
109
144
  it "should show some images inside of the best candidate" do
110
145
  @doc = Readability::Document.new(@bbc)
111
- @doc.images.should =~ ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
146
+ expect(@doc.images).to match_array(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg",
112
147
  "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg",
113
148
  "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif",
114
- "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
115
- @doc.best_candidate_has_image.should == true
149
+ "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
150
+ expect(@doc.best_candidate_has_image).to eq(true)
116
151
  end
117
152
 
118
153
  it "should show some images inside of the best candidate, include gif format" do
119
154
  @doc = Readability::Document.new(@bbc, :ignore_image_format => [])
120
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
121
- @doc.best_candidate_has_image.should == true
155
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027786_john_capes229_rnsm.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
156
+ expect(@doc.best_candidate_has_image).to eq(true)
122
157
  end
123
158
 
124
159
  describe "width, height and format" do
125
160
  it "should show some images inside of the best candidate, but with width most equal to 400px" do
126
161
  @doc = Readability::Document.new(@bbc, :min_image_width => 400, :ignore_image_format => [])
127
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"]
128
- @doc.best_candidate_has_image.should == true
162
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg"])
163
+ expect(@doc.best_candidate_has_image).to eq(true)
129
164
  end
130
165
 
131
166
  it "should show some images inside of the best candidate, but with width most equal to 304px" do
132
167
  @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => [])
133
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"]
134
- @doc.best_candidate_has_image.should == true
168
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57027000/jpg/_57027794_perseus_getty.jpg", "http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif", "http://news.bbcimg.co.uk/media/images/57055000/jpg/_57055063_perseus_thoctarides.jpg"])
169
+ expect(@doc.best_candidate_has_image).to eq(true)
135
170
  end
136
171
 
137
172
  it "should show some images inside of the best candidate, but with width most equal to 304px and ignoring JPG format" do
138
173
  @doc = Readability::Document.new(@bbc, :min_image_width => 304, :ignore_image_format => ["jpg"])
139
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
140
- @doc.best_candidate_has_image.should == true
174
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
175
+ expect(@doc.best_candidate_has_image).to eq(true)
141
176
  end
142
177
 
143
178
  it "should show some images inside of the best candidate, but with height most equal to 400px, no ignoring no format" do
144
179
  @doc = Readability::Document.new(@bbc, :min_image_height => 400, :ignore_image_format => [])
145
- @doc.images.should == ["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"]
146
- @doc.best_candidate_has_image.should == true
180
+ expect(@doc.images).to eq(["http://news.bbcimg.co.uk/media/images/57060000/gif/_57060487_sub_escapes304x416.gif"])
181
+ expect(@doc.best_candidate_has_image).to eq(true)
147
182
  end
148
-
183
+
149
184
  it "should not miss an image if it exists by itself in a div without text" do
150
185
  @doc = Readability::Document.new(@simple_html_with_img_no_text,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
151
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
186
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
152
187
  end
153
-
188
+
154
189
  it "should not double count an image between script and noscript" do
155
190
  @doc = Readability::Document.new(@simple_html_with_img_in_noscript,:tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false, :do_not_guess_encoding => true)
156
- @doc.images.should == ["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"]
191
+ expect(@doc.images).to eq(["http://img.thesun.co.uk/multimedia/archive/00703/sign_up_emails_682__703711a.gif", "http://img.thesun.co.uk/multimedia/archive/01416/dim_1416768a.jpg"])
157
192
  end
158
-
193
+
159
194
  end
160
195
  end
161
196
  end
@@ -167,11 +202,11 @@ describe Readability do
167
202
  end
168
203
 
169
204
  it "should transform divs containing no block elements into <p>s" do
170
- @doc.html.css("#body").first.name.should == "p"
205
+ expect(@doc.html.css("#body").first.name).to eq("p")
171
206
  end
172
207
 
173
208
  it "should not transform divs that contain block elements" do
174
- @doc.html.css("#contains_blockquote").first.name.should == "div"
209
+ expect(@doc.html.css("#contains_blockquote").first.name).to eq("div")
175
210
  end
176
211
  end
177
212
 
@@ -185,9 +220,9 @@ describe Readability do
185
220
  <body></body>
186
221
  </html>
187
222
  HTML
188
- doc.author.should eql("Austin Fonacier")
223
+ expect(doc.author).to eql("Austin Fonacier")
189
224
  end
190
-
225
+
191
226
  it "should pick up readability's recommended author format" do
192
227
  doc = Readability::Document.new(<<-HTML)
193
228
  <html>
@@ -200,9 +235,9 @@ describe Readability do
200
235
  </body>
201
236
  </html>
202
237
  HTML
203
- doc.author.should eql("Austin Fonacier")
238
+ expect(doc.author).to eql("Austin Fonacier")
204
239
  end
205
-
240
+
206
241
  it "should pick up vcard fn" do
207
242
  doc = Readability::Document.new(<<-HTML)
208
243
  <html>
@@ -216,9 +251,9 @@ describe Readability do
216
251
  </body>
217
252
  </html>
218
253
  HTML
219
- doc.author.should eql("Austin Fonacier")
254
+ expect(doc.author).to eql("Austin Fonacier")
220
255
  end
221
-
256
+
222
257
  it "should pick up <a rel='author'>" do
223
258
  doc = Readability::Document.new(<<-HTML)
224
259
  <html>
@@ -228,9 +263,9 @@ describe Readability do
228
263
  </body>
229
264
  </html>
230
265
  HTML
231
- doc.author.should eql("Danny Banks (rel)")
266
+ expect(doc.author).to eql("Danny Banks (rel)")
232
267
  end
233
-
268
+
234
269
  it "should pick up <div id='author'>" do
235
270
  doc = Readability::Document.new(<<-HTML)
236
271
  <html>
@@ -240,7 +275,7 @@ describe Readability do
240
275
  </body>
241
276
  </html>
242
277
  HTML
243
- doc.author.should eql("Austin Fonacier (author)")
278
+ expect(doc.author).to eql("Austin Fonacier (author)")
244
279
  end
245
280
  end
246
281
 
@@ -263,15 +298,15 @@ describe Readability do
263
298
  end
264
299
 
265
300
  it "should like <div>s more than <th>s" do
266
- @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
301
+ expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
267
302
  end
268
303
 
269
304
  it "should like classes like text more than classes like comment" do
270
305
  @elem2.name = "div"
271
- @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score]
306
+ expect(@doc.score_node(@elem1)[:content_score]).to eq(@doc.score_node(@elem2)[:content_score])
272
307
  @elem1['class'] = "text"
273
308
  @elem2['class'] = "comment"
274
- @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score]
309
+ expect(@doc.score_node(@elem1)[:content_score]).to be > @doc.score_node(@elem2)[:content_score]
275
310
  end
276
311
  end
277
312
 
@@ -282,15 +317,15 @@ describe Readability do
282
317
  end
283
318
 
284
319
  it "should remove things that have class comment" do
285
- @doc.html.inner_html.should_not =~ /a comment/
320
+ expect(@doc.html.inner_html).not_to match(/a comment/)
286
321
  end
287
322
 
288
323
  it "should not remove body tags" do
289
- @doc.html.inner_html.should =~ /<\/body>/
324
+ expect(@doc.html.inner_html).to match(/<\/body>/)
290
325
  end
291
326
 
292
327
  it "should not remove things with class comment and id body" do
293
- @doc.html.inner_html.should =~ /real content/
328
+ expect(@doc.html.inner_html).to match(/real content/)
294
329
  end
295
330
  end
296
331
 
@@ -318,13 +353,13 @@ describe Readability do
318
353
  end
319
354
 
320
355
  it "should score elements in the document" do
321
- @candidates.values.length.should == 3
356
+ expect(@candidates.values.length).to eq(3)
322
357
  end
323
358
 
324
359
  it "should prefer the body in this particular example" do
325
- @candidates.values.sort { |a, b|
360
+ expect(@candidates.values.sort { |a, b|
326
361
  b[:content_score] <=> a[:content_score]
327
- }.first[:elem][:id].should == "body"
362
+ }.first[:elem][:id]).to eq("body")
328
363
  end
329
364
 
330
365
  context "when two consequent br tags are used instead of p" do
@@ -349,9 +384,129 @@ describe Readability do
349
384
  </html>
350
385
  HTML
351
386
  @candidates = @doc.score_paragraphs(0)
352
- @candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
387
+ expect(@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id]).to eq('post1')
353
388
  end
354
389
  end
390
+
391
+ it "does not include short paragraphs as related siblings in the output" do
392
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
393
+ <html>
394
+ <head>
395
+ <title>title!</title>
396
+ </head>
397
+ <body>
398
+ <section>
399
+ <p>Paragraph 1</p>
400
+ <p>Paragraph 2</p>
401
+ </section>
402
+ <section>
403
+ <p>Too short</p>
404
+ </section>
405
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
406
+ </body>
407
+ </html>
408
+ HTML
409
+
410
+ expect(@doc.content).to include("Paragraph 1")
411
+ expect(@doc.content).to include("Paragraph 2")
412
+ expect(@doc.content).not_to include("Too short")
413
+ end
414
+
415
+ it "includes long paragraphs as related siblings in the output" do
416
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
417
+ <html>
418
+ <head>
419
+ <title>title!</title>
420
+ </head>
421
+ <body>
422
+ <section>
423
+ <p>Paragraph 1</p>
424
+ <p>Paragraph 2</p>
425
+ </section>
426
+ <p>This paragraph is longer than 80 characters so should be included as a sibling in the output.</p>
427
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
428
+ </body>
429
+ </html>
430
+ HTML
431
+
432
+ expect(@doc.content).to include("Paragraph 1")
433
+ expect(@doc.content).to include("Paragraph 2")
434
+ expect(@doc.content).to include("This paragraph is longer")
435
+ end
436
+
437
+ it "does not include non-paragraph tags in the output, even when longer than 80 characters" do
438
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
439
+ <html>
440
+ <head>
441
+ <title>title!</title>
442
+ </head>
443
+ <body>
444
+ <section>
445
+ <p>Paragraph 1</p>
446
+ <p>Paragraph 2</p>
447
+ </section>
448
+ <section>
449
+ <p>Although this paragraph is longer than 80 characters, the sibling is the section so it should not be included.</p>
450
+ </section>
451
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
452
+ </body>
453
+ </html>
454
+ HTML
455
+
456
+ expect(@doc.content).to include("Paragraph 1")
457
+ expect(@doc.content).to include("Paragraph 2")
458
+ expect(@doc.content).not_to include("Although this paragraph")
459
+ end
460
+
461
+ it "does include non-paragraph tags in the output if their content score is high enough" do
462
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"])
463
+ <html>
464
+ <head>
465
+ <title>title!</title>
466
+ </head>
467
+ <body>
468
+ <section>
469
+ <p>Paragraph 1</p>
470
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
471
+ </section>
472
+ <section>
473
+ <p>This should be included in the output because the content is score is high enough.<p>
474
+ <p>The, inclusion, of, lots, of, commas, increases, the, score, of, an, element.</p>
475
+ </section>
476
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
477
+ </body>
478
+ </html>
479
+ HTML
480
+
481
+ expect(@doc.content).to include("Paragraph 1")
482
+ expect(@doc.content).to include("Paragraph 2")
483
+ expect(@doc.content).to include("This should be included")
484
+ end
485
+
486
+ it "can optionally include other related siblings in the output if they meet the 80 character threshold" do
487
+ @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"])
488
+ <html>
489
+ <head>
490
+ <title>title!</title>
491
+ </head>
492
+ <body>
493
+ <section>
494
+ <p>Paragraph 1</p>
495
+ #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
496
+ </section>
497
+ <section>
498
+ <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
499
+ <p>The likely_siblings now include the section tag so it should be included in the output.</p>
500
+ </section>
501
+ #{'<a href="/">This link lowers the body score.</a>' * 5}
502
+ </body>
503
+ </html>
504
+ HTML
505
+
506
+ expect(@doc.content).to include("Paragraph 1")
507
+ expect(@doc.content).to include("Paragraph 2")
508
+ expect(@doc.content).to include("should be included")
509
+ end
355
510
  end
356
511
 
357
512
  describe "the cant_read.html fixture" do
@@ -359,7 +514,7 @@ describe Readability do
359
514
  allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a]
360
515
  allowed_attributes = %w[href]
361
516
  html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html")
362
- Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/)
517
+ expect(Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content).to match(/Can you talk a little about how you developed the looks for the/)
363
518
  end
364
519
  end
365
520
 
@@ -370,15 +525,15 @@ describe Readability do
370
525
  end
371
526
 
372
527
  it "should return the main page content" do
373
- @doc.content.should match("Some content")
528
+ expect(@doc.content).to match("Some content")
374
529
  end
375
530
 
376
531
  it "should return the page title if present" do
377
- @doc.title.should match("title!")
532
+ expect(@doc.title).to match("title!")
378
533
 
379
534
  doc = Readability::Document.new("<html><head></head><body><div><p>Some content</p></div></body>",
380
535
  :min_text_length => 0, :retry_length => 1)
381
- doc.title.should be_nil
536
+ expect(doc.title).to be_nil
382
537
  end
383
538
  end
384
539
 
@@ -389,7 +544,7 @@ describe Readability do
389
544
  end
390
545
 
391
546
  it "should not return the sidebar" do
392
- @doc.content.should_not match("sidebar")
547
+ expect(@doc.content).not_to match("sidebar")
393
548
  end
394
549
  end
395
550
 
@@ -407,7 +562,7 @@ describe Readability do
407
562
  end
408
563
 
409
564
  it "should not return the sidebar" do
410
- @doc.content.should_not match("a b c d f")
565
+ expect(@doc.content).not_to match("a b c d f")
411
566
  end
412
567
  end
413
568
 
@@ -427,12 +582,12 @@ describe Readability do
427
582
  #puts "testing #{sample}..."
428
583
 
429
584
  $required_fragments.each do |required_text|
430
- doc.should include(required_text)
585
+ expect(doc).to include(required_text)
431
586
  checks += 1
432
587
  end
433
588
 
434
589
  $excluded_fragments.each do |text_to_avoid|
435
- doc.should_not include(text_to_avoid)
590
+ expect(doc).not_to include(text_to_avoid)
436
591
  checks += 1
437
592
  end
438
593
  end
@@ -446,18 +601,18 @@ describe Readability do
446
601
  it "should correctly guess and enforce HTML encoding" do
447
602
  doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!</div></body></html>")
448
603
  content = doc.content
449
- content.encoding.to_s.should == "ISO-8859-1"
450
- content.should be_valid_encoding
604
+ expect(content.encoding.to_s).to eq("ISO-8859-1")
605
+ expect(content).to be_valid_encoding
451
606
  end
452
607
 
453
608
  it "should allow encoding guessing to be skipped" do
454
- do_not_allow(GuessHtmlEncoding).encode
609
+ expect(GuessHtmlEncoding).to_not receive(:encode)
455
610
  doc = Readability::Document.new(@simple_html_fixture, :do_not_guess_encoding => true)
456
611
  doc.content
457
612
  end
458
613
 
459
614
  it "should allow encoding guessing to be overridden" do
460
- do_not_allow(GuessHtmlEncoding).encode
615
+ expect(GuessHtmlEncoding).to_not receive(:encode)
461
616
  doc = Readability::Document.new(@simple_html_fixture, :encoding => "UTF-8")
462
617
  doc.content
463
618
  end
@@ -469,54 +624,54 @@ describe Readability do
469
624
  it "should strip the html comments tag" do
470
625
  doc = Readability::Document.new("<html><head><meta http-equiv='content-type' content='text/html; charset=LATIN1'></head><body><div>hi!<!-- bye~ --></div></body></html>")
471
626
  content = doc.content
472
- content.should include("hi!")
473
- content.should_not include("bye")
627
+ expect(content).to include("hi!")
628
+ expect(content).not_to include("bye")
474
629
  end
475
630
 
476
631
  it "should not error with empty content" do
477
- Readability::Document.new('').content.should == '<div><div></div></div>'
632
+ expect(Readability::Document.new('').content).to eq('<div><div></div></div>')
478
633
  end
479
634
 
480
635
  it "should not error with a document with no <body>" do
481
- Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content.should == '<div><div></div></div>'
636
+ expect(Readability::Document.new('<html><head><meta http-equiv="refresh" content="0;URL=http://example.com"></head></html>').content).to eq('<div><div></div></div>')
482
637
  end
483
638
  end
484
-
639
+
485
640
  describe "No side-effects" do
486
641
  before do
487
642
  @bbc = File.read(File.dirname(__FILE__) + "/fixtures/bbc.html")
488
643
  @nytimes = File.read(File.dirname(__FILE__) + "/fixtures/nytimes.html")
489
- @thesum = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
644
+ @thesun = File.read(File.dirname(__FILE__) + "/fixtures/thesun.html")
490
645
  end
491
-
646
+
492
647
  it "should not have any side-effects when calling content() and then images()" do
493
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
648
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
494
649
  :do_not_guess_encoding => true)
495
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
650
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
496
651
  @doc.content
497
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
652
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
498
653
  end
499
-
654
+
500
655
  it "should not have any side-effects when calling content() multiple times" do
501
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
656
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
502
657
  :do_not_guess_encoding => true)
503
- @doc.content.should == @doc.content
658
+ expect(@doc.content).to eq(@doc.content)
504
659
  end
505
-
660
+
506
661
  it "should not have any side-effects when calling content and images multiple times" do
507
- @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
662
+ @doc=Readability::Document.new(@nytimes, :tags => %w[div p img a], :attributes => %w[src href], :remove_empty_nodes => false,
508
663
  :do_not_guess_encoding => true)
509
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
510
- @doc.content.should == @doc.content
511
- @doc.images.should == ["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"]
664
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
665
+ expect(@doc.content).to eq(@doc.content)
666
+ expect(@doc.images).to eq(["http://graphics8.nytimes.com/images/2011/12/02/opinion/02fixes-freelancersunion/02fixes-freelancersunion-blog427.jpg"])
512
667
  end
513
-
668
+
514
669
  end
515
-
670
+
516
671
  describe "Code blocks" do
517
672
  before do
518
673
  @code = File.read(File.dirname(__FILE__) + "/fixtures/code.html")
519
- @content = Readability::Document.new(@code,
674
+ @content = Readability::Document.new(@code,
520
675
  :tags => %w[div p img a ul ol li h1 h2 h3 h4 h5 h6 blockquote strong em b code pre],
521
676
  :attributes => %w[src href],
522
677
  :remove_empty_nodes => false).content
@@ -524,29 +679,29 @@ describe Readability do
524
679
  end
525
680
 
526
681
  it "preserve the code blocks" do
527
- @doc.css("code pre").text.should == "\nroot\n indented\n "
682
+ expect(@doc.css("code pre").text).to eq("\nroot\n indented\n ")
528
683
  end
529
684
 
530
685
  it "preserve backwards code blocks" do
531
- @doc.css("pre code").text.should == "\nsecond\n indented\n "
686
+ expect(@doc.css("pre code").text).to eq("\nsecond\n indented\n ")
532
687
  end
533
688
  end
534
689
 
535
690
  describe "remove all tags" do
536
691
  it "should work for an incomplete piece of HTML" do
537
692
  doc = Readability::Document.new('<div>test</div', :tags => [])
538
- doc.content.should == 'test'
693
+ expect(doc.content).to eq('test')
539
694
  end
540
695
 
541
696
  it "should work for a HTML document" do
542
697
  doc = Readability::Document.new('<html><head><title>title!</title></head><body><div><p>test</p></div></body></html>',
543
698
  :tags => [])
544
- doc.content.should == 'test'
699
+ expect(doc.content).to eq('test')
545
700
  end
546
701
 
547
702
  it "should work for a plain text" do
548
703
  doc = Readability::Document.new('test', :tags => [])
549
- doc.content.should == 'test'
704
+ expect(doc.content).to eq('test')
550
705
  end
551
706
  end
552
707
 
@@ -563,8 +718,8 @@ describe Readability do
563
718
  doc = Readability::Document.new(boing_boing)
564
719
 
565
720
  content = doc.content
566
- (content !~ /Bees and Bombs/).should == true
567
- content.should =~ /ADVERTISE/
721
+ expect(content !~ /Bees and Bombs/).to eq(true)
722
+ expect(content).to match(/ADVERTISE/)
568
723
  end
569
724
 
570
725
  it "should apply whitelist" do
@@ -572,13 +727,13 @@ describe Readability do
572
727
  doc = Readability::Document.new(boing_boing,
573
728
  whitelist: ".post-content")
574
729
  content = doc.content
575
- content.should =~ /Bees and Bombs/
730
+ expect(content).to match(/Bees and Bombs/)
576
731
  end
577
732
 
578
733
  it "should apply blacklist" do
579
734
  doc = Readability::Document.new(boing_boing, blacklist: "#sidebar_adblock")
580
735
  content = doc.content
581
- (content !~ /ADVERTISE/).should == true
736
+ expect(content !~ /ADVERTISE/).to eq(true)
582
737
 
583
738
  end
584
739
  end
data/spec/spec_helper.rb CHANGED
@@ -1,11 +1,5 @@
1
1
  require 'rubygems'
2
2
  require 'readability'
3
- require 'rr'
4
3
  require 'fakeweb'
5
4
 
6
5
  FakeWeb.allow_net_connect = false
7
-
8
- RSpec.configure do |config|
9
- config.mock_with :rr
10
- end
11
-
metadata CHANGED
@@ -1,86 +1,72 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ruby-readability
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Cantino
8
8
  - starrhorne
9
9
  - libc
10
10
  - Kyle Maxwell
11
- autorequire:
11
+ autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2014-08-17 00:00:00.000000000 Z
14
+ date: 2024-06-11 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: rspec
18
18
  requirement: !ruby/object:Gem::Requirement
19
19
  requirements:
20
- - - '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: '2.8'
23
23
  type: :development
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - '>='
27
+ - - ">="
28
28
  - !ruby/object:Gem::Version
29
29
  version: '2.8'
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rspec-expectations
32
32
  requirement: !ruby/object:Gem::Requirement
33
33
  requirements:
34
- - - '>='
34
+ - - ">="
35
35
  - !ruby/object:Gem::Version
36
36
  version: '2.8'
37
37
  type: :development
38
38
  prerelease: false
39
39
  version_requirements: !ruby/object:Gem::Requirement
40
40
  requirements:
41
- - - '>='
41
+ - - ">="
42
42
  - !ruby/object:Gem::Version
43
43
  version: '2.8'
44
- - !ruby/object:Gem::Dependency
45
- name: rr
46
- requirement: !ruby/object:Gem::Requirement
47
- requirements:
48
- - - '>='
49
- - !ruby/object:Gem::Version
50
- version: '1.0'
51
- type: :development
52
- prerelease: false
53
- version_requirements: !ruby/object:Gem::Requirement
54
- requirements:
55
- - - '>='
56
- - !ruby/object:Gem::Version
57
- version: '1.0'
58
44
  - !ruby/object:Gem::Dependency
59
45
  name: nokogiri
60
46
  requirement: !ruby/object:Gem::Requirement
61
47
  requirements:
62
- - - '>='
48
+ - - ">="
63
49
  - !ruby/object:Gem::Version
64
50
  version: 1.6.0
65
51
  type: :runtime
66
52
  prerelease: false
67
53
  version_requirements: !ruby/object:Gem::Requirement
68
54
  requirements:
69
- - - '>='
55
+ - - ">="
70
56
  - !ruby/object:Gem::Version
71
57
  version: 1.6.0
72
58
  - !ruby/object:Gem::Dependency
73
59
  name: guess_html_encoding
74
60
  requirement: !ruby/object:Gem::Requirement
75
61
  requirements:
76
- - - '>='
62
+ - - ">="
77
63
  - !ruby/object:Gem::Version
78
64
  version: 0.0.4
79
65
  type: :runtime
80
66
  prerelease: false
81
67
  version_requirements: !ruby/object:Gem::Requirement
82
68
  requirements:
83
- - - '>='
69
+ - - ">="
84
70
  - !ruby/object:Gem::Version
85
71
  version: 0.0.4
86
72
  description: Port of arc90's readability project to ruby
@@ -91,10 +77,10 @@ executables:
91
77
  extensions: []
92
78
  extra_rdoc_files: []
93
79
  files:
94
- - .gitignore
95
- - .rspec
96
- - .travis.yml
97
- - .yardopts
80
+ - ".github/workflows/ruby.yml"
81
+ - ".gitignore"
82
+ - ".rspec"
83
+ - ".yardopts"
98
84
  - Gemfile
99
85
  - Guardfile
100
86
  - LICENSE
@@ -108,9 +94,13 @@ files:
108
94
  - spec/fixtures/boing_boing.html
109
95
  - spec/fixtures/cant_read.html
110
96
  - spec/fixtures/code.html
97
+ - spec/fixtures/codinghorror.html
98
+ - spec/fixtures/images/Confusion_of_Tongues.png
99
+ - spec/fixtures/images/JohnPinhole.jpg
111
100
  - spec/fixtures/images/dim_1416768a.jpg
112
101
  - spec/fixtures/images/sign_up_emails_682__703711a.gif
113
102
  - spec/fixtures/images/sign_up_emails_682__703712a.gif
103
+ - spec/fixtures/nested_images.html
114
104
  - spec/fixtures/nytimes.html
115
105
  - spec/fixtures/sample.html
116
106
  - spec/fixtures/samples/blogpost_with_links-fragments.rb
@@ -129,24 +119,23 @@ files:
129
119
  homepage: http://github.com/cantino/ruby-readability
130
120
  licenses: []
131
121
  metadata: {}
132
- post_install_message:
122
+ post_install_message:
133
123
  rdoc_options: []
134
124
  require_paths:
135
125
  - lib
136
126
  required_ruby_version: !ruby/object:Gem::Requirement
137
127
  requirements:
138
- - - '>='
128
+ - - ">="
139
129
  - !ruby/object:Gem::Version
140
130
  version: '0'
141
131
  required_rubygems_version: !ruby/object:Gem::Requirement
142
132
  requirements:
143
- - - '>='
133
+ - - ">="
144
134
  - !ruby/object:Gem::Version
145
135
  version: '0'
146
136
  requirements: []
147
- rubyforge_project: ruby-readability
148
- rubygems_version: 2.0.3
149
- signing_key:
137
+ rubygems_version: 3.5.10
138
+ signing_key:
150
139
  specification_version: 4
151
140
  summary: Port of arc90's readability project to ruby
152
141
  test_files:
@@ -154,9 +143,13 @@ test_files:
154
143
  - spec/fixtures/boing_boing.html
155
144
  - spec/fixtures/cant_read.html
156
145
  - spec/fixtures/code.html
146
+ - spec/fixtures/codinghorror.html
147
+ - spec/fixtures/images/Confusion_of_Tongues.png
148
+ - spec/fixtures/images/JohnPinhole.jpg
157
149
  - spec/fixtures/images/dim_1416768a.jpg
158
150
  - spec/fixtures/images/sign_up_emails_682__703711a.gif
159
151
  - spec/fixtures/images/sign_up_emails_682__703712a.gif
152
+ - spec/fixtures/nested_images.html
160
153
  - spec/fixtures/nytimes.html
161
154
  - spec/fixtures/sample.html
162
155
  - spec/fixtures/samples/blogpost_with_links-fragments.rb
data/.travis.yml DELETED
@@ -1,6 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - 2.1.1
4
- - 2.0.0
5
- - 1.9.3
6
- script: "bundle exec rspec"