RubyGems - readability_js - Versions diffs - 0.0.2 → 0.0.4 - Mend

readability_js 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/.gitattributes +2 -0
data/CHANGELOG.md +6 -0
data/README.md +4 -3
data/lib/readability_js/extended.rb +261 -0
data/lib/readability_js/node/readability-example.js +8 -2
data/lib/readability_js/nodo.rb +19 -5
data/lib/readability_js/version.rb +1 -1
data/lib/readability_js.rb +102 -153
metadata +3 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 03cb241180cad18709eb90a638563727c1209a5debdb4030685842abab87b86d
-  data.tar.gz: 7d1db45a012d4b45087201b621b2fad4cd8f8c85581eccea50dd1dc2cd11d671
+  metadata.gz: 8ce8fc0727f6b8ce1bfc45cf586b58c0e895deb95ba8f5f341a0d41ed6e9a9ba
+  data.tar.gz: 5baf642f9053d3c0adb9b81e04f9ff0d4b0252725274827143340fcb6f4067cf
 SHA512:
-  metadata.gz: 02211235463faf2f652d9d04667feb6a2a9bf1575966cec5b35fb8a2c11ce42972d0cf757ba637df8ffcab3bd10d647cf8181f9e2a5a3c0bdea7dae89ece14c0
-  data.tar.gz: cb77d8c08d3f0487238eff114e3adff58fe18514f9b1027703842e5bc85c8e8212437cc001ceb5de2abbcf6d53e5a7b8f83d8daf7b96c12c4a5c3dff0480a02e
+  metadata.gz: 3ee241f68497574ea9477afa66e1b605007446f36ea29fcd4760581891e7482b167249e2dd502254e4407bee1cb22501eea40b74be920a4097de656f808690ac
+  data.tar.gz: 8d8d6a3d85108e590762ed17f70f12142d8aca5787abaf8cb3fc5160b47aa5e7397605bf1bcbedb17fd3761335862223c45426e3be13d3bdfe1c288084396690

data/.gitattributes ADDED Viewed

@@ -0,0 +1,2 @@
+spec/examples/* linguist-vendored
+node_modules/* linguist-vendored

data/CHANGELOG.md CHANGED Viewed

@@ -1,2 +1,8 @@
+## 0.0.4
+* Do not add image_url to text_content result.
+## 0.0.3
+* Finally finished complete implementation of `isProbablyReaderable` including visibility function parameter.
 ## 0.0.1
 * Initial release with basic wrapper

data/README.md CHANGED Viewed

@@ -69,7 +69,7 @@ and includes a beautified markdown version of the content.
 ```
 ### Query parameters
-You can pass all parameters supported by readability, checkout the [rubydoc for more details](https://www.rubydoc.info/gems/readability_js/ReadabilityJs).
+You can pass all parameters supported by readability, checkout the [rubydoc for more details](https://rubydoc.info/github/magynhard/ruby-readability_js/ReadabilityJs).
 Here an example with all parameters, the camelCase parameters are converted to snake_case in ruby:
@@ -97,14 +97,15 @@ It contains the data returned by readability, with hash keys transformed in snak
   "dir" => "ltr",
   "site_name" => "example.com",
   "lang" => "en",
-  "published_time" => "2024-01-01T12:00:00Z"
+  "published_time" => "2024-01-01T12:00:00Z",
+  "image_url" => "https://example.com/image.jpg" # only for extended parse
 }
 ```
 <a name="documentation"></a>
 ## Documentation
 Check out the doc at RubyDoc:<br>
-https://www.rubydoc.info/gems/readability_js
+https://rubydoc.info/github/magynhard/ruby-readability_js
 As this library is only a wrapper, checkout the original readability documentation:<br>

data/lib/readability_js/extended.rb ADDED Viewed

@@ -0,0 +1,261 @@
+module ReadabilityJs
+  class Extended
+    SELECTOR_BLACKLIST = [
+      ".Article-Partner",
+      ".Article-Partner-Text",
+      ".Article-Comments-Button",
+      "#isl-5-AdCarousel",
+      "#isl-10-ArticleComments",
+      "*[data-element-tracking-name]",
+      "*[aria-label='Anzeige']",
+      "nav[aria-label='breadcrumb']",
+      # heise
+      "a-video",
+      "a-gift",
+      "a-collapse",
+      "a-opt-in",
+      # spiegel
+      "[data-area='related_articles']",
+      # welt
+      "nav[aria-label='Breadcrumb']",
+      ".c-inline-teaser-list",
+      "[width='1'][height='1']",
+      # golem
+      ".go-alink-list",
+      # faz
+      "[data-external-selector='related-articles-entries']",
+      ".BigBox",
+      # frankfurter rundschau
+      ".id-Breadcrumb-item",
+      ".id-Story-interactionBar",
+      "revenue-reel",
+      ".id-StoryElement-factBox",
+      # stern
+      ".breadcrumb",
+      ".teaser",
+      ".group-teaserblock__items",
+      ".title__kicker",
+      "ws-adtag",
+      # taz
+      "[data-for='webelement_bio']",
+      "[data-for='webelement_citation']",
+      "#articleTeaser",
+      ".article-produktteaser-container",
+      "[x-data='{}']",
+      "#komune",
+      ".community",
+    ]
+    def self.before_cleanup(html)
+      pre_parser html
+    end
+    def self.after_cleanup(result, html)
+      find_and_add_picture result, html
+      clean_up_and_enrich_result result
+    end
+    private
+    #
+    # Pre-parser to clean up HTML before passing it to Readability
+    #
+    # SELECTOR_BLACKLIST contains CSS selectors of elements to be removed from the HTML
+    # before parsing to improve content extraction.
+    #
+    # @param html [String] The HTML document as a string.
+    # @return [String] The cleaned HTML document as a string.
+    #
+    def self.pre_parser(html)
+      doc = Nokogiri::HTML(html)
+      # Remove blacklisted elements by selector
+      SELECTOR_BLACKLIST.each do |classname|
+        doc.css("#{classname}").remove
+      end
+      doc.to_html
+    end
+    #
+    # Post-parser to find and add lead image URL if missing.
+    #
+    # Will add a picture into the result hash under the key "image_url".
+    #
+    # Looks for Open Graph and Twitter Card meta tags to find a lead image URL.
+    # If not found, it will have a look into the markdown content for the first image.
+    #
+    # @param result [Hash] The result hash from Readability parsing.
+    # @param html [String] The original HTML document as a string.
+    # @return [Hash] The updated result hash.
+    #
+    def self.find_and_add_picture(result, html)
+      return result if result.key?("lead_image_url") && !result["lead_image_url"].to_s.strip.empty?
+      doc = Nokogiri::HTML(html)
+      # try to find og:image or twitter:image meta tags
+      meta_tags = doc.css('meta[property="og:image"], meta[name="og:image"], meta[name="twitter:image"]')
+      meta_tags.each do |meta_tag|
+        content = meta_tag['content']
+        if content && !content.strip.empty?
+          result["image_url"] = content.strip
+          break
+        end
+      end
+      # try to find first image in markdown content if no meta tag found before
+      if !result.key?("image_url") || result["image_url"].to_s.strip.empty?
+        if result.key?("markdown_content")
+          md_content = result["markdown_content"]
+          md_content.scan(/!\[.*?\]\((.*?)\)/).each do |match|
+            img_url = match[0]
+            if img_url && !img_url.strip.empty?
+              # check if img ends with common image file extensions
+              if img_url =~ /\.(jpg|jpeg|png|gif|webp|svg|tif|avif)(\?.*)?$/i
+                result["image_url"] = img_url.strip
+                break
+              end
+            end
+          end
+        end
+      end
+      result
+    end
+    #
+    # Post-parser to clean up extracted content after Readability processing
+    #
+    # Cleans up comment artifacts and beautifies HTML and adds beautified Markdown content.
+    #
+    # @param result [Hash] The result hash from Readability parsing.
+    # @return [Hash] The cleaned result hash.
+    #
+    def self.clean_up_and_enrich_result(result)
+      result["content"] = clean_up_comments(result["content"]) if result.key?("content")
+      result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
+      result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
+      result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
+      if result.key?("content")
+        result = beautify_html_and_text(result)
+        result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
+        result = beautify_markdown(result)
+      end
+      result
+    end
+    #
+    # Remove/replace comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
+    #
+    # @param html [String] The HTML content as a string.
+    # @return [String] The cleaned HTML content as a string.
+    #
+    def self.clean_up_comments(html)
+      copy = html.dup
+      # Turn \x3C before comment start into '<'
+      copy.gsub!(/\\x3C(?=!--)/, '<')
+      # Decode encoded comment end --&gt; to -->
+      copy.gsub!(/--&gt;/, '-->')
+      # Remove fully empty or artifact comments ([], only whitespace)
+      copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
+      # Collapse multiple dummy comment chains
+      copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
+      # Remove remaining comment artifacts like <!--[-->, <!--]-->
+      copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
+      # Remove any remaining regular comments
+      copy.gsub!(/<!--.*?-->/m, '')
+      # Reduce excessive whitespace / blank lines (real newlines)
+      copy.gsub!(/\n[ \t]+\n/, "\n")
+      copy.gsub!(/\n{3,}/, "\n\n")
+      # Remove any remaining script tags (including encoded variants)
+      copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
+      # Preserve blocks where whitespace/newlines matter
+      preserve_tags = %w[pre code textarea]
+      preserved = {}
+      preserve_tags.each_with_index do |tag, idx|
+        copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
+          key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
+          preserved[key] = block
+          copy.sub!(block, key)
+        end
+      end
+      # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
+      copy.gsub!(/\\n\s*/, ' ')
+      # Collapse whitespace between tags to a single space or nothing
+      # Remove whitespace-only text nodes represented by spaces/newlines between tags
+      copy.gsub!(/>\s+</, '><')
+      # Normalize multiple spaces to a single space
+      copy.gsub!(/ {2,}/, ' ')
+      # Trim spaces directly inside tags (e.g., <p> text </p>)
+      copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
+      # Restore preserved blocks
+      preserved.each { |k, v| copy.sub!(k, v) }
+      copy.strip
+    end
+    #
+    # Beautify Markdown content by adding title if not present and fixing link spacing
+    #
+    # @param result [Hash] The result hash from Readability parsing.
+    # @return [Hash] The beautified result hash.
+    #
+    def self.beautify_markdown(result)
+      mark_down = result["markdown_content"]
+      # add title to markdown if not present
+      if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
+        mark_down = "# #{result['title']}\n\n" + mark_down
+      end
+      # Check for image and if none is found, add after title if available
+      if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
+        has_image = mark_down.match(/!\[.*?\]\(.*?\)/)
+        if !has_image
+          img_md = "![Lead Image](#{result['image_url']})\n\n"
+          mark_down = mark_down.sub(/^# .+?\n/, "\\0" + img_md)
+        end
+      end
+      # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
+      mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
+      result["markdown_content"] = mark_down
+      result
+    end
+    #
+    # Beautify HTML content by adding title if not present and fixing link spacing
+    #
+    # @param result [Hash] The result hash from Readability parsing.
+    # @return [String] The beautified HTML content as a string.
+    #
+    def self.beautify_html_and_text(result)
+      html = result["content"]
+      text = result["text_content"]
+      # Add title to html and text if not present
+      if (html.index(/h[1-2]/) && html.index(/h[1-2]/).to_i > 128 && result.key?("title") && !result["title"].to_s.strip.empty? && !html.include?(result["title"])) || html.index(/h[1-2]/).nil?
+        title_tag = "<h1>#{result['title']}</h1>\n"
+        html = title_tag + html
+        text = result['title'] + "\n\n" + text
+      end
+      # Check for image and if none is found, add after title if available
+      if result.key?("image_url") && !result["image_url"].to_s.strip.empty?
+        doc = Nokogiri::HTML(html)
+        # check for img tags but also for picture tags
+        has_image = !doc.css('img, picture').empty?
+        if !has_image
+          img_tag = "<p><img src=\"#{result['image_url']}\" alt=\"Lead Image\"></p>\n"
+          h1 = doc.at_css('h1')
+          if h1
+            h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
+            html = doc.to_html
+          end
+        end
+      end
+      # Add a space after a links if immediately followed by an alphanumeric char (missing separation).
+      doc = Nokogiri::HTML(html)
+      doc.css('a').each do |link|
+        next if link.next_sibling.nil?
+        if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
+          link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
+        end
+      end
+      result["content"] = doc.to_html
+      result["text_content"] = text
+      result
+    end
+  end
+end

data/lib/readability_js/node/readability-example.js CHANGED Viewed

@@ -1,4 +1,4 @@
-const { Readability } = require('@mozilla/readability');
+const { Readability, isProbablyReaderable } = require('@mozilla/readability');
 const { JSDOM } = require('jsdom');
 const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
@@ -7,4 +7,10 @@ const doc = new JSDOM("<body>Look at this cat: <img src='./cat.jpg'></body>", {
 let reader = new Readability(doc.window.document);
 let article = reader.parse();
-console.log(article);
+console.log(article);
+if(isProbablyReaderable(doc.window.document)) {
+    console.log("This document is probably readerable.");
+} else {
+    console.log("This document is probably not readerable.");
+}

data/lib/readability_js/nodo.rb CHANGED Viewed

@@ -14,15 +14,22 @@ module ReadabilityJs
     #
     def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
       begin
-        self.new.parse html
+        # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
+        html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
+        self.new.parse html, url, debug, max_elems_to_parse, nb_top_candidates, char_threshold, classes_to_preserve, keep_classes, disable_json_ld, serializer, allow_video_regex, link_density_modifier
       rescue ::Nodo::JavaScriptError => e
         raise ReadabilityJs::Error.new "#{e.message}"
       end
     end
-    def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
+    #
+    # instance wrapper method, as nodo does not support class methods
+    #
+    def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
       begin
-        self.new.is_probably_readerable html
+        # remove style tags from html, so jsdom does not need to process css and its warnings are not shown
+        html = html.gsub(/<style[^>]*>.*?<\/style>/m, '')
+        self.new.is_probably_readerable html, min_content_length, min_score, visibility_checker
       rescue ::Nodo::JavaScriptError => e
         raise ReadabilityJs::Error.new "#{e.message}"
       end
@@ -57,9 +64,16 @@ module ReadabilityJs
     JS
     function :is_probably_readerable, <<~JS
-      async (html) => {
+      async (html, minContentLength, minScore, visibilityChecker) => {
         const doc = new jsdom.JSDOM(html);
-        return readability.Readability.isProbablyReaderable(doc);
+        let readability_options = {};
+        if(minContentLength !== undefined && minContentLength !== null) readability_options.minContentLength = minContentLength;
+        if(minScore !== undefined && minScore !== null) readability_options.minScore = minScore;
+        if(visibilityChecker !== undefined && visibilityChecker !== null) {
+          readability_options.visibilityChecker = eval(visibilityChecker);
+        }
+        return readability.isProbablyReaderable(doc.window.document, readability_options);
       }
     JS

data/lib/readability_js/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module ReadabilityJs
-  VERSION = '0.0.2'.freeze
+  VERSION = '0.0.4'.freeze
 end

data/lib/readability_js.rb CHANGED Viewed

@@ -6,6 +6,7 @@ require 'nokogiri'
 require_relative 'readability_js/version'
 require_relative 'readability_js/nodo'
+require_relative 'readability_js/extended'
 require_relative 'custom_errors/error'
@@ -15,56 +16,27 @@ require_relative 'custom_errors/error'
 module ReadabilityJs
-  SELECTOR_BLACKLIST = [
-    ".Article-Partner",
-    ".Article-Partner-Text",
-    ".Article-Comments-Button",
-    "#isl-5-AdCarousel",
-    "#isl-10-ArticleComments",
-    "*[data-element-tracking-name]",
-    "*[aria-label='Anzeige']",
-    "nav[aria-label='breadcrumb']",
-    # heise
-    "a-video",
-    "a-gift",
-    "a-collapse",
-    "a-opt-in",
-    # spiegel
-    "[data-area='related_articles']",
-    # welt
-    "nav[aria-label='Breadcrumb']",
-    ".c-inline-teaser-list",
-    # golem
-    ".go-alink-list",
-    # faz
-    "[data-external-selector='related-articles-entries']",
-    ".BigBox",
-    # frankfurter rundschau
-    ".id-Breadcrumb-item",
-    ".id-Story-interactionBar",
-    "revenue-reel",
-    ".id-StoryElement-factBox",
-    # stern
-    ".breadcrumb",
-    ".teaser",
-    ".group-teaserblock__items",
-    ".title__kicker",
-    # taz
-    "[data-for='webelement_bio']",
-    "[data-for='webelement_citation']",
-    "#articleTeaser",
-    ".article-produktteaser-container",
-    "[x-data='{}']",
-    "#komune",
-    ".community",
-  ]
   #
   # Parse a HTML document and extract its main content using Mozilla's Readability library.
-  # Raises ReadabilityJs::Error on failure.
   #
   # 'html' is a required parameters, all others are optional.
   #
+  # @param html [String] The HTML document as a string.
+  # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
+  # @param debug [Boolean] Enable debug mode (default: false).
+  # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
+  # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
+  # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
+  # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
+  # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
+  # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
+  # @param serializer [String, nil] Serializer to use for output (optional).
+  # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
+  # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
+  # @return [Hash] A hash containing the extracted content and metadata.
+  #
+  # @raise [ReadabilityJs::Error] if an error occurs during execution
+  #
   def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
     begin
       result = ReadabilityJs::Nodo.parse(html, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier)
@@ -74,13 +46,60 @@ module ReadabilityJs
     end
   end
+  #
+  # Like #parse but with additional pre- and post-processing to enhance content extraction.
+  #
+  # 'html' is a required parameters, all others are optional.
+  #
+  # @param html [String] The HTML document as a string.
+  # @param url [String, nil] The URL of the document (optional, used for resolving relative links).
+  # @param debug [Boolean] Enable debug mode (default: false).
+  # @param max_elems_to_parse [Integer] Maximum number of elements to parse (default: 0, meaning no limit).
+  # @param nb_top_candidates [Integer] Number of top candidates to consider (default: 5).
+  # @param char_threshold [Integer] Minimum number of characters for an element to be considered (default: 500).
+  # @param classes_to_preserve [Array<String>] List of CSS classes to preserve in the output (default: []).
+  # @param keep_classes [Boolean] Whether to keep the original classes in the output (default: false).
+  # @param disable_json_ld [Boolean] Disable JSON-LD parsing (default: false).
+  # @param serializer [String, nil] Serializer to use for output (optional).
+  # @param allow_video_regex [String, nil] Regular expression to allow video URLs (optional).
+  # @param link_density_modifier [Float] Modifier for link density calculation (default: 0).
+  # @return [Hash] A hash containing the extracted content and metadata.
+  #
+  # @raise [ReadabilityJs::Error] if an error occurs during execution
+  #
   def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
-    result = pre_parser html
+    result = Extended::before_cleanup html
     result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
-    clean_up_result result
+    Extended::after_cleanup result, html
   end
-  def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: 'isNodeVisible')
+  #
+  # Decides whether a document is probably readerable without parsing the whole document.
+  #
+  # Only 'html' is a required parameter, all others are optional.
+  #
+  # @param html [String] The HTML document as a string.
+  # @param min_content_length [Integer] Minimum content length to consider the document readerable
+  # @param min_score [Integer] Minimum score to consider the document readerable
+  # @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
+  # @return [Boolean] true if the document is probably readerable, false otherwise.
+  #
+  # @raise [ReadabilityJs::Error] if an error occurs during execution
+  #
+  # @example
+  #
+  # html = "<html>...</html>"
+  #
+  # visibility_checker = <<~JS
+  #   (node) => {
+  #    const style = node.ownerDocument.defaultView.getComputedStyle(node);
+  #    return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
+  #   }
+  # JS
+  #
+  # ReadabilityJs.is_probably_readerable(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
+  #
+  def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
     begin
       ReadabilityJs::Nodo.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
     rescue => e
@@ -88,12 +107,45 @@ module ReadabilityJs
     end
   end
-  def self.probably_readerable?(html)
-    self.is_probably_readerable(html)
+  #
+  # Decides whether a document is probably readerable without parsing the whole document.
+  #
+  # Only 'html' is a required parameter, all others are optional.
+  #
+  # @param html [String] The HTML document as a string.
+  # @param min_content_length [Integer] Minimum content length to consider the document readerable
+  # @param min_score [Integer] Minimum score to consider the document readerable
+  # @param visibility_checker [String] anonymous JavaScript function definition to check node visibility as string. Uses default visibility checker if not provided.
+  # @return [Boolean] true if the document is probably readerable, false otherwise.
+  #
+  # @raise [ReadabilityJs::Error] if an error occurs during execution
+  #
+  # @example
+  #
+  # html = "<html>...</html>"
+  #
+  # visibility_checker = <<~JS
+  #   (node) => {
+  #    const style = node.ownerDocument.defaultView.getComputedStyle(node);
+  #    return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
+  #   }
+  # JS
+  #
+  # ReadabilityJs.probably_readerable?(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
+  #
+  def self.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
+    self.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
   end
   private
+  #
+  # Normalize result keys to snake_case for ruby style
+  #
+  # @param result [Hash] The result hash from Readability
+  # @return [Hash] The normalized result hash
+  #
   def self.normalize_result(result)
     result["text_content"] = result.delete("textContent") if result.key?("textContent")
     result["site_name"] = result.delete("siteName") if result.key?("siteName")
@@ -101,108 +153,5 @@ module ReadabilityJs
     result
   end
-  def self.clean_up_result(result)
-    result["content"] = clean_up_comments(result["content"]) if result.key?("content")
-    result["text_content"] = clean_up_comments(result["text_content"]) if result.key?("text_content")
-    result["excerpt"] = clean_up_comments(result["excerpt"]) if result.key?("excerpt")
-    result["byline"] = clean_up_comments(result["byline"]) if result.key?("byline")
-    if result.key?("content")
-      result["content"] = beautify_html(result["content"])
-      result["markdown_content"] = ReverseMarkdown.convert(result["content"]) if result.key?("content")
-      result = beautify_markdown(result)
-    end
-    result
-  end
-  # Replaces comment / artifact noise like <!--[--&gt;, <!----&gt; etc.
-  def self.clean_up_comments(html)
-    copy = html.dup
-    # Turn \x3C before comment start into '<'
-    copy.gsub!(/\\x3C(?=!--)/, '<')
-    # Decode encoded comment end --&gt; to -->
-    copy.gsub!(/--&gt;/, '-->')
-    # Remove fully empty or artifact comments ([], only whitespace)
-    copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
-    # Collapse multiple dummy comment chains
-    copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
-    # Remove remaining comment artifacts like <!--[-->, <!--]-->
-    copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
-    # Remove any remaining regular comments
-    copy.gsub!(/<!--.*?-->/m, '')
-    # Reduce excessive whitespace / blank lines (real newlines)
-    copy.gsub!(/\n[ \t]+\n/, "\n")
-    copy.gsub!(/\n{3,}/, "\n\n")
-    # Remove any remaining script tags (including encoded variants)
-    copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
-    # Preserve blocks where whitespace/newlines matter
-    preserve_tags = %w[pre code textarea]
-    preserved = {}
-    preserve_tags.each_with_index do |tag, idx|
-      copy.scan(/<#{tag}[^>]*?>.*?<\/#{tag}>/mi).each do |block|
-        key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
-        preserved[key] = block
-        copy.sub!(block, key)
-      end
-    end
-    # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
-    copy.gsub!(/\\n\s*/, ' ')
-    # Collapse whitespace between tags to a single space or nothing
-    # Remove whitespace-only text nodes represented by spaces/newlines between tags
-    copy.gsub!(/>\s+</, '><')
-    # Normalize multiple spaces to a single space
-    copy.gsub!(/ {2,}/, ' ')
-    # Trim spaces directly inside tags (e.g., <p> text </p>)
-    copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
-    # Restore preserved blocks
-    preserved.each { |k, v| copy.sub!(k, v) }
-    copy.strip
-  end
-  def self.beautify_markdown(result)
-    mark_down = result["markdown_content"]
-    # add title to markdown if not present
-    if !mark_down.start_with?("# ") && result.key?("title") && !result["title"].to_s.strip.empty? && !mark_down.include?(result["title"])
-      mark_down = "# #{result['title']}\n\n" + mark_down
-    end
-    # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
-    mark_down.gsub!(/(\[[^\]]+\]\((?:[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
-    result["markdown_content"] = mark_down
-    result
-  end
-  def self.beautify_html(html)
-    doc = Nokogiri::HTML(html)
-    # Add a space after a links if immediately followed by an alphanumeric char (missing separation).
-    doc.css('a').each do |link|
-      next if link.next_sibling.nil?
-      if link.next_sibling.text? && link.next_sibling.content =~ /\A[A-Za-z0-9ÄÖÜäöüß]/
-        link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
-      end
-    end
-    doc.to_html
-  end
-  def self.pre_parser(html)
-    doc = Nokogiri::HTML(html)
-    # Remove blacklisted classes
-    SELECTOR_BLACKLIST.each do |classname|
-      doc.css("#{classname}").remove
-    end
-    doc.to_html
-  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: readability_js
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.4
 platform: ruby
 authors:
 - Matthäus Beyrle
@@ -145,6 +145,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".gitattributes"
 - ".gitignore"
 - ".rspec"
 - CHANGELOG.md
@@ -158,6 +159,7 @@ files:
 - cli/pry.rb
 - lib/custom_errors/error.rb
 - lib/readability_js.rb
+- lib/readability_js/extended.rb
 - lib/readability_js/node/node_modules/.bin/tldts
 - lib/readability_js/node/node_modules/.yarn-integrity
 - lib/readability_js/node/node_modules/@asamuzakjp/css-color/LICENSE