RubyGems - ruby-readability - Versions diffs - 0.7.1 → 0.7.2 - Mend

ruby-readability 0.7.1 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 906a25fd00e8fc221c84aa41fedf38bbd3045aa0e4a543ff16a1d494e59c3a92
-  data.tar.gz: bf28e458f7fb7f87a49ea71f16e736191c53130b91bdf2203cf260e6dce99aee
+  metadata.gz: f83eb55e4c0c4c30ad54e8e7104d68da8a5eb2b4d9cc76b45255055d89bf4b5c
+  data.tar.gz: 4d003c39b589477449bedd34634c5482dd503e94bfe24b9a5c29ea94f9b49f83
 SHA512:
-  metadata.gz: e2d262b6c4f0d7a2146718d3e16c0dd8973b217a9fe0ba850d03a456c68b7bd4355cbdd0a78454b09f6f50717c87ac8da524d42d99e78e0f362830c554376fdd
-  data.tar.gz: 6306f195c8d40842c0a4ed8ab2cfab1648fc562b03ba3137a0fd8c68ecb7a3668357c83abefd2b76bcac06efc961cdd042be10f44760aa102e34cdce2fe5d6d4
+  metadata.gz: e799e831297b18b381c3b1caad19531f99fe084f640afbddd1cf91e75fe234d3af4618f07e02a0c6214824726e3afe79accbb8ea5f0d66d9117b13112d22e8ef
+  data.tar.gz: 404d3a1bc702f3bd609e8c3ba8e37d6f023b2a3c126c278e7463a3dfee1cc5bf683f6c0c75cfabbb14e477f582b33cc8204d8682f33ed9a235b6fac8e90d9ad2

data/README.md CHANGED Viewed

@@ -41,7 +41,7 @@ You may provide options to `Readability::Document.new`, including:
 * `:remove_empty_nodes`: remove `<p>` tags that have no text content; also
   removes `<p>` tags that contain only images;
 * `:attributes`: whitelist of allowed attributes;
-* `:debug`: provide debugging output, defaults false;
+* `:debug`: provide debugging output, defaults false; supports setting a Proc;
 * `:encoding`: if the page is of a known encoding, you can specify it; if left
    unspecified, the encoding will be guessed (only in Ruby 1.9.x). If you wish
    to disable guessing, supply `:do_not_guess_encoding => true`;

data/lib/readability.rb CHANGED Viewed

@@ -19,9 +19,10 @@ module Readability
       :blacklist                  => nil,
       :whitelist                  => nil,
       :elements_to_score          => ["p", "td", "pre"],
-      :likely_siblings            => ["p"]
+      :likely_siblings            => ["p"],
+      :ignore_redundant_nesting   => false
     }.freeze
     REGEXES = {
         :unlikelyCandidatesRe => /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup/i,
         :okMaybeItsACandidateRe => /and|article|body|column|main|shadow/i,
@@ -35,7 +36,7 @@ module Readability
         :killBreaksRe => /(<br\s*\/?>(\s|&nbsp;?)*){1,}/,
         :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i
     }
     attr_accessor :options, :html, :best_candidate, :candidates, :best_candidate_has_image
     def initialize(input, options = {})
@@ -50,7 +51,7 @@ module Readability
       @input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
       @remove_unlikely_candidates = @options[:remove_unlikely_candidates]
       @weight_classes = @options[:weight_classes]
-      @clean_conditionally = @options[:clean_conditionally]
+      @clean_conditionally = !!@options[:clean_conditionally]
       @best_candidate_has_image = true
       make_html
       handle_exclusions!(@options[:whitelist], @options[:blacklist])
@@ -145,11 +146,11 @@ module Readability
       (list_images.empty? and content != @html) ? images(@html, true) : list_images
     end
     def images_with_fqdn_uris!(source_uri)
       images_with_fqdn_uris(@html, source_uri)
     end
     def images_with_fqdn_uris(document = @html.dup, source_uri)
       uri = URI.parse(source_uri)
       host = uri.host
@@ -161,7 +162,7 @@ module Readability
       images = []
       document.css("img").each do |elem|
         begin
-          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
+          elem['src'] = URI.join(base,elem['src']).to_s if URI.parse(elem['src']).host == nil
           images << elem['src'].to_s
         rescue URI::InvalidURIError => exc
           elem.remove
@@ -264,14 +265,25 @@ module Readability
       sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max
       downcased_likely_siblings = options[:likely_siblings].map(&:downcase)
       output = Nokogiri::XML::Node.new('div', @html)
-      best_candidate[:elem].parent.children.each do |sibling|
+      # If the best candidate is the only element in its parent then we will never find any siblings. Therefore,
+      # find the closest ancestor that has siblings (if :ignore_redundant_nesting is true). This improves the
+      # related content detection, but could lead to false positives. Not supported in arc90's readability.
+      node =
+        if options[:ignore_redundant_nesting]
+          closest_node_with_siblings(best_candidate[:elem])
+        else
+          best_candidate[:elem] # This is the default behaviour for consistency with arc90's readability.
+        end
+      node.parent.children.each do |sibling|
         append = false
-        append = true if sibling == best_candidate[:elem]
+        append = true if sibling == node
         append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold
         if downcased_likely_siblings.include?(sibling.name.downcase)
           link_density = get_link_density(sibling)
-          node_content = sibling.text
+          node_content = sibling.text.strip
           node_length = node_content.length
           append = if node_length > 80 && link_density < 0.25
@@ -291,6 +303,23 @@ module Readability
       output
     end
+    def closest_node_with_siblings(element)
+      node = element
+      until node.node_name == 'body'
+        siblings = node.parent.children
+        non_empty = siblings.reject { |sibling| sibling.text? && sibling.text.strip.empty? }
+        if non_empty.size > 1
+          return node
+        else
+          node = node.parent
+        end
+      end
+      node
+    end
     def select_best_candidate(candidates)
       sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] }
@@ -372,7 +401,11 @@ module Readability
     end
     def debug(str)
-      puts str if options[:debug]
+      if options[:debug].respond_to?(:call)
+        options[:debug].call(str)
+      elsif options[:debug]
+        puts str
+      end
     end
     def remove_unlikely_candidates!
@@ -426,7 +459,8 @@ module Readability
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
-      all_whitelisted = base_whitelist.include?("*")
+      all_tags_whitelisted = base_whitelist.include?("*")
+      all_attr_whitelisted = @options[:attributes] && @options[:attributes].include?("*")
       # We'll add whitespace instead of block elements,
       # so a<br>b will have a nice space between them
@@ -440,8 +474,8 @@ module Readability
       ([node] + node.css("*")).each do |el|
         # If element is in whitelist, delete all its attributes
-        if all_whitelisted || whitelist[el.node_name]
-          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) }
+        if all_tags_whitelisted || whitelist[el.node_name]
+          el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } unless all_attr_whitelisted
           # Otherwise, replace the element with its contents
         else
@@ -470,30 +504,43 @@ module Readability
     def clean_conditionally(node, candidates, selector)
       return unless @clean_conditionally
       node.css(selector).each do |el|
         weight = class_weight(el)
         content_score = candidates[el] ? candidates[el][:content_score] : 0
         name = el.name.downcase
+        remove = false
+        message = nil
         if weight + content_score < 0
-          el.remove
-          debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.")
+          remove = true
+          message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero."
         elsif el.text.count(",") < 10
           counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m }
           counts["li"] -= 100
           # For every img under a noscript tag discount one from the count to avoid double counting
           counts["img"] -= el.css("noscript").css("img").length
           content_length = el.text.strip.length  # Count the text length excluding any surrounding whitespace
           link_density = get_link_density(el)
           reason = clean_conditionally_reason?(name, counts, content_length, options, weight, link_density)
           if reason
-            debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.")
-            el.remove
+            message = "Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}."
+            remove = true
           end
         end
+        if options[:clean_conditionally].respond_to?(:call)
+          context = { remove: remove, message: message, weight: weight, content_score: content_score, el: el }
+          remove = options[:clean_conditionally].call(context) # Allow the user to override the decision for whether to remove the element.
+        end
+        if remove
+          debug(message || "Conditionally cleaned by user-specified function.")
+          el.remove
+        end
       end
     end

data/ruby-readability.gemspec CHANGED Viewed

@@ -3,7 +3,7 @@ $:.push File.expand_path("../lib", __FILE__)
 Gem::Specification.new do |s|
   s.name        = "ruby-readability"
-  s.version     = '0.7.1'
+  s.version     = '0.7.2'
   s.authors     = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
   s.email       = ["andrew@iterationlabs.com"]
   s.homepage    = "http://github.com/cantino/ruby-readability"

data/spec/readability_spec.rb CHANGED Viewed

@@ -115,6 +115,11 @@ describe Readability do
       expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
     end
+    it "should be able to whitelist all attributes" do
+      @doc = Readability::Document.new(@nested, attributes: ["*"], tags: ["*"])
+      expect(@doc.content).to include('<img src="http://example.com/image.jpeg" />')
+    end
     it "should not try to download local images" do
       @doc = Readability::Document.new(<<-HTML)
         <html>
@@ -498,6 +503,39 @@ describe Readability do
               <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the best_candidate.</p>
               <p>The likely_siblings now include the section tag so it should be included in the output.</p>
             </section>
+            <section>
+              <p>too short when stripped                                                                                  </p>
+            </section>
+            #{'<a href="/">This link lowers the body score.</a>' * 5}
+          </body>
+        </html>
+      HTML
+      expect(@doc.content).to include("Paragraph 1")
+      expect(@doc.content).to include("Paragraph 2")
+      expect(@doc.content).to include("should be included")
+      expect(@doc.content).not_to include("too short when stripped")
+    end
+    it "climbs the DOM tree to the closest ancestor that has siblings when checking for related siblings" do
+      @doc = Readability::Document.new(<<-HTML, min_text_length: 1, elements_to_score: ["h1", "p"], likely_siblings: ["section"], ignore_redundant_nesting: true)
+        <html>
+          <head>
+            <title>title!</title>
+          </head>
+          <body>
+            <div> <!-- This is the closest node of the best candidate that has siblings. -->
+              <div>
+                <section>
+                  <p>Paragraph 1</p>
+                  #{'<p>Paragraph 2</p>' * 10} <!-- Ensure this section remains the best_candidate. -->
+                </section>
+              </div>
+            </div>
+            <section>
+              <p>This paragraph is longer than 80 characters and inside a section that is a sibling of the ancestor node.</p>
+              <p>The likely_siblings now include the section tag so it should be included in the output.</p>
+            </section>
             #{'<a href="/">This link lowers the body score.</a>' * 5}
           </body>
         </html>
@@ -739,11 +777,33 @@ describe Readability do
   end
   describe "clean_conditionally_reason?" do
-    let (:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
+    let(:list_fixture) { "<div><p>test</p>#{'<li></li>' * 102}" }
     it "does not raise error" do
       @doc = Readability::Document.new(list_fixture)
       expect { @doc.content }.to_not raise_error
     end
   end
+  describe "clean_conditionally" do
+    let(:fixture) { "<html><head><title>title!</title></head><body><div><p>Some content</p></div><div class='sidebar'><p>sidebar<p></div></body>" }
+    it "can set a clean_conditionally function to allow overriding the default decision" do
+      clean_conditionally_fn = lambda { |context| !context[:remove] } # Flip the decision.
+      content = Readability::Document.new(fixture, clean_conditionally: clean_conditionally_fn, min_text_length: 0, retry_length: 1).content
+      expect(content).to include("sidebar")
+      expect(content).not_to include('Some content')
+    end
+  end
+  describe "debug" do
+    it "can set a debug function, e.g. to send output to Rails logger" do
+      output = []
+      debug_fn = lambda { |str| output << str }
+      Readability::Document.new(@simple_html_fixture, debug: debug_fn).content
+      expect(output).not_to be_empty
+    end
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: ruby-readability
 version: !ruby/object:Gem::Version
-  version: 0.7.1
+  version: 0.7.2
 platform: ruby
 authors:
 - Andrew Cantino
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-06-11 00:00:00.000000000 Z
+date: 2024-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -134,7 +134,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.10
+rubygems_version: 3.5.14
 signing_key:
 specification_version: 4
 summary: Port of arc90's readability project to ruby