RubyGems - readability-rb - Versions diffs - 0.1.0 → 0.3.0 - Mend

readability-rb 0.1.0 → 0.3.0

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: bcab6fa16fd851068954a9fa17a234e29f23f7406e094bc6f265954c29ce231a
-  data.tar.gz: c2866940fbd118a73065bd51de75491e9a02284ca0287953a53e8841d117029d
+  metadata.gz: b7c8a5adc3a628af9f665a4a90129612b5be891797f0b90681ed96be5566a9fb
+  data.tar.gz: 76eb683b6a38605b637cbcbda8c25b96ef3060f7f8d277acfd3f4885537a6708
 SHA512:
-  metadata.gz: 4ce2c23e7ddc3321dabd4bd45762c8c7950ceb4e3fb2bf1f48852c66937d0715f98027a0233b8e6f2a09628fc28f078eba8e68b2336c7f219890af3ddc9daae3
-  data.tar.gz: 86b0254a34654db1e910b3b8b0640fca6f77a1f518d29bd9816f49035e9308f492fb747be8e6b82257bdae7a2b184a7e13d469b91ed83e9fd1acdcacdbed3403
+  metadata.gz: 94f81c5b0502338b55ed611a6c289dfbd8250b1c058df5b45dff87bd126bcff2038518e85da4dfff0f6baea366f4cdb76305e258936862d063e1f964e621e1bf
+  data.tar.gz: ae056bc80680ad1416829e7454f0045c49ac7978450269ea2a5ee29955a7b84a11511100fe4ae2283abba835d4449be3fc22caf037e51fb9cfad4580cc61b5ef

data/lib/readability/cleaner.rb CHANGED Viewed

@@ -447,11 +447,15 @@ module Readability
           next true
         end
-        if get_char_count(node, ",") < 10
-          p_count = node.css("p").length
-          img_count = node.css("img").length
-          li_count = node.css("li").length - 100
-          input_count = node.css("input").length
+        inner_text = get_inner_text(node)
+        if inner_text.split(COMMAS).length - 1 < 10
+          tag_counts = Hash.new(0)
+          node.css("p, img, li, input").each { |n| tag_counts[n.name] += 1 }
+          p_count = tag_counts["p"]
+          img_count = tag_counts["img"]
+          li_count = tag_counts["li"] - 100
+          input_count = tag_counts["input"]
           heading_density = get_text_density(node, ["h1", "h2", "h3", "h4", "h5", "h6"])
           embed_count = 0
@@ -477,15 +481,13 @@ module Readability
           end
           next false if skip_removal
-          inner_text = get_inner_text(node)
           # Toss any node whose inner text contains nothing but suspicious words
           if AD_WORDS.match?(inner_text) || LOADING_WORDS.match?(inner_text)
             next true
           end
           content_length = inner_text.length
-          link_density = get_link_density(node)
+          link_density = get_link_density(node, text_length: content_length)
           textish_tags = %w[span li td] + DIV_TO_P_ELEMS.to_a
           text_density = get_text_density(node, textish_tags)
           is_figure_child = has_ancestor_tag?(node, "figure")

data/lib/readability/document.rb CHANGED Viewed

@@ -42,9 +42,14 @@ module Readability
     def parse
       # Avoid parsing too large documents
       if @max_elems_to_parse > 0
-        num_tags = @doc.css("*").length
-        if num_tags > @max_elems_to_parse
-          raise "Aborting parsing document; #{num_tags} elements found"
+        count = 0
+        @doc.traverse do |n|
+          if n.element?
+            count += 1
+            if count > @max_elems_to_parse
+              raise "Aborting parsing document; #{count} elements found"
+            end
+          end
         end
       end
@@ -59,6 +64,9 @@ module Readability
       prep_document
+      # Cache the prepped body HTML for retry re-parsing (avoids innerHTML= cost)
+      @prepped_body_html = @doc.at_css("body")&.inner_html
       metadata = get_article_metadata(json_ld)
       @metadata = metadata
       @article_title = metadata["title"]
@@ -109,7 +117,9 @@ module Readability
         return nil
       end
-      page_cache_html = page.inner_html
+      # Preserve the lang attribute from the HTML element before any retry re-parsing
+      preserved_article_lang = @doc.root && @doc.root["lang"]
+      preserved_article_dir = @doc.root && @doc.root["dir"]
       while true
         log("Starting grabArticle loop")
@@ -497,10 +507,10 @@ module Readability
         text_length = get_inner_text(article_content, true).length
         if text_length < @char_threshold
           parse_successful = false
-          page.inner_html = page_cache_html
+          # Store serialized HTML instead of node references to avoid pinning old documents
           @attempts << {
-            article_content: article_content,
+            html: article_content.inner_html,
             text_length: text_length
           }
@@ -517,9 +527,30 @@ module Readability
             # But first check if we actually have something
             return nil if @attempts[0][:text_length] == 0
-            article_content = @attempts[0][:article_content]
+            # Re-parse the best attempt from serialized HTML
+            best_doc = Nokogiri::HTML5("<html><body>#{@attempts[0][:html]}</body></html>")
+            best_doc.root["lang"] = preserved_article_lang if preserved_article_lang
+            best_doc.root["dir"] = preserved_article_dir if preserved_article_dir
+            article_content = best_doc.at_css("body")
+            @doc = best_doc
             parse_successful = true
           end
+          unless parse_successful
+            # Create a fresh document from the prepped body HTML, allowing the old one to be GC'd
+            @doc = Nokogiri::HTML5("<html><head></head><body>#{@prepped_body_html}</body></html>")
+            # Restore the lang attribute on the new HTML element so it's picked up during traversal
+            @doc.root["lang"] = preserved_article_lang if preserved_article_lang
+            @doc.root["dir"] = preserved_article_dir if preserved_article_dir
+            page = @doc.at_css("body")
+            # Clear node-referencing instance variables since they point to the old document
+            @candidates = {}
+            @data_tables = Set.new
+            @article_byline = nil
+            @article_dir = nil
+            @article_lang = preserved_article_lang
+          end
         end
         if parse_successful

data/lib/readability/scoring.rb CHANGED Viewed

@@ -50,8 +50,8 @@ module Readability
     # Port of _getLinkDensity (JS line 2143)
     # Returns the ratio of anchor text length to total text length.
     # Fragment-only links (#...) count at 0.3 coefficient.
-    def get_link_density(element)
-      text_length = get_inner_text(element).length
+    def get_link_density(element, text_length: nil)
+      text_length ||= get_inner_text(element).length
       return 0 if text_length == 0
       link_length = 0.0

data/lib/readability/utils.rb CHANGED Viewed

@@ -76,7 +76,7 @@ module Readability
         js_trim(node.text).empty? &&
         (node.element_children.empty? ||
           node.element_children.length ==
-            node.css("br").length + node.css("hr").length)
+            node.css("br, hr").length)
     end
     # Port of _hasChildBlockElement (JS line 2044)

data/lib/readability/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Readability
-  VERSION = "0.1.0"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: readability-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Andy Croll
-autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-04-13 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -52,7 +51,6 @@ metadata:
   source_code_uri: https://github.com/andycroll/readability-rb
   changelog_uri: https://github.com/andycroll/readability-rb/commits/main
   bug_tracker_uri: https://github.com/andycroll/readability-rb/issues
-post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -60,15 +58,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '3.1'
+      version: '3.2'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.3
-signing_key:
+rubygems_version: 3.6.9
 specification_version: 4
 summary: Extract readable article content from HTML pages
 test_files: []