readability-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Metadata extraction: JSON-LD <script> blocks, <meta> tags and the
  # document <title>. Ported from Mozilla Readability.js; the JS line
  # references from the original port are preserved below.
  module Metadata
    private

    # Port of _unescapeHtmlEntities (JS line 1631-1651)
    # Replaces named HTML entities and numeric character references.
    # Returns the argument unchanged when it is nil (callers pass through
    # missing metadata values).
    def unescape_html_entities(str)
      return str unless str

      str
        .gsub(/&(quot|amp|apos|lt|gt);/) { HTML_ESCAPE_MAP[$1] }
        .gsub(/&#(?:x([0-9a-f]+)|([0-9]+));/i) do
          hex, num_str = $1, $2
          num = (hex || num_str).to_i(hex ? 16 : 10)

          # Replace invalid code points (NUL, beyond U+10FFFF, surrogates)
          # with U+FFFD REPLACEMENT CHARACTER.
          if num == 0 || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF)
            num = 0xFFFD
          end

          [num].pack("U")
        end
    end

    # Port of _getJSONLD (JS line 1658-1773)
    # Extracts metadata from JSON-LD script tags with a schema.org context.
    # Only the first script that yields metadata is used. Returns a Hash
    # (possibly empty) with string keys: "title", "byline", "excerpt",
    # "site_name", "date_published".
    def get_json_ld(doc)
      scripts = get_all_nodes_with_tag(doc, ["script"])
      metadata = nil

      scripts.each do |json_ld_element|
        next if metadata
        next unless json_ld_element["type"] == "application/ld+json"

        begin
          # Strip CDATA markers if present
          content = json_ld_element.text.gsub(/\A\s*<!\[CDATA\[|\]\]>\s*\z/, "")
          parsed = JSON.parse(content)

          # A top-level array: pick the first schema.org article-like entry.
          if parsed.is_a?(Array)
            parsed = parsed.find do |it|
              it["@type"] && it["@type"].match?(JSON_LD_ARTICLE_TYPES)
            end
            next unless parsed
          end

          schema_dot_org_regex = /\Ahttps?:\/\/schema\.org\/?\z/

          # @context may be the bare schema.org URL or a hash with @vocab.
          matches = (parsed["@context"].is_a?(String) &&
                     parsed["@context"].match?(schema_dot_org_regex)) ||
                    (parsed["@context"].is_a?(Hash) &&
                     parsed["@context"]["@vocab"].is_a?(String) &&
                     parsed["@context"]["@vocab"].match?(schema_dot_org_regex))

          next unless matches

          # No top-level @type: look inside @graph for an article entry.
          if !parsed["@type"] && parsed["@graph"].is_a?(Array)
            parsed = parsed["@graph"].find do |it|
              (it["@type"] || "").match?(JSON_LD_ARTICLE_TYPES)
            end
          end

          next if !parsed || !parsed["@type"] || !parsed["@type"].match?(JSON_LD_ARTICLE_TYPES)

          metadata = {}

          if parsed["name"].is_a?(String) && parsed["headline"].is_a?(String) &&
             parsed["name"] != parsed["headline"]
            # Both name and headline exist and differ — compare similarity to HTML title
            title = get_article_title
            name_matches = text_similarity(parsed["name"], title) > 0.75
            headline_matches = text_similarity(parsed["headline"], title) > 0.75

            if headline_matches && !name_matches
              metadata["title"] = parsed["headline"]
            else
              metadata["title"] = parsed["name"]
            end
          elsif parsed["name"].is_a?(String)
            metadata["title"] = parsed["name"].strip
          elsif parsed["headline"].is_a?(String)
            metadata["title"] = parsed["headline"].strip
          end

          if parsed["author"]
            if parsed["author"].is_a?(Hash) && parsed["author"]["name"].is_a?(String)
              metadata["byline"] = parsed["author"]["name"].strip
            elsif parsed["author"].is_a?(Array) &&
                  parsed["author"][0] &&
                  parsed["author"][0]["name"].is_a?(String)
              metadata["byline"] = parsed["author"]
                .select { |author| author && author["name"].is_a?(String) }
                .map { |author| author["name"].strip }
                .join(", ")
            end
          end

          if parsed["description"].is_a?(String)
            metadata["excerpt"] = parsed["description"].strip
          end

          if parsed["publisher"].is_a?(Hash) && parsed["publisher"]["name"].is_a?(String)
            metadata["site_name"] = parsed["publisher"]["name"].strip
          end

          if parsed["datePublished"].is_a?(String)
            metadata["date_published"] = parsed["datePublished"].strip
          end
        rescue StandardError => e
          # The JS original wraps this whole body in a catch-all try/catch:
          # malformed JSON *and* unexpected shapes (e.g. a non-object
          # top-level value, which raises TypeError on parsed["@context"] or
          # it["@type"]) must skip this script rather than abort extraction.
          # Rescuing only JSON::ParserError missed the latter.
          log(e.message) if respond_to?(:log, true)
        end
      end

      metadata || {}
    end

    # Port of _getArticleTitle (JS line 573-661)
    # Extracts and cleans the article title from the document (@doc).
    # Splits on separator characters ("|", "-", dashes, "/", "\", ">", "»")
    # or a ": " and falls back to the original title (or a lone <h1>) when
    # the cleaned result looks too short.
    def get_article_title
      cur_title = ""
      orig_title = ""

      begin
        cur_title = orig_title = (@doc.at_css("title")&.text&.strip || "")

        # If title came back as something other than a string (shouldn't happen
        # with Nokogiri, but match JS logic)
        if !cur_title.is_a?(String)
          cur_title = orig_title = get_inner_text(@doc.css("title").first)
        end
      rescue
        # ignore exceptions setting the title (mirrors the JS try/catch)
      end

      title_had_hierarchical_separators = false
      word_count = ->(str) { str.split(/\s+/).length }

      # Title separator characters — exact JS source string from line 597.
      # Kept as a single-quoted string: the \u escapes are interpreted by the
      # regex engine when interpolated into the character classes below.
      title_separators = '\|\-\u2013\u2014\\\\\/>»'

      if cur_title.match?(/\s[#{title_separators}]\s/)
        title_had_hierarchical_separators = cur_title.match?(/\s[\\\/>\u00BB]\s/)

        # Find all separator positions and remove everything after the last one
        all_separators = orig_title.to_enum(:scan, /\s[#{title_separators}]\s/i).map { Regexp.last_match }
        cur_title = orig_title[0, all_separators.last.begin(0)]

        # If the resulting title is too short, remove the first part instead
        if word_count.call(cur_title) < 3
          cur_title = orig_title.sub(/\A[^#{title_separators}]*[#{title_separators}]/i, "")
        end
      elsif cur_title.include?(": ")
        # Check if we have a heading containing this exact string
        headings = get_all_nodes_with_tag(@doc, ["h1", "h2"])
        trimmed_title = cur_title.strip
        match = headings.any? { |heading| heading.text.strip == trimmed_title }

        # If we don't, extract the title out of the original title string
        unless match
          cur_title = orig_title[(orig_title.rindex(":") + 1)..]

          # If the title is now too short, try the first colon instead
          if word_count.call(cur_title) < 3
            cur_title = orig_title[(orig_title.index(":") + 1)..]
          # But if we have too many words before the colon there's something weird
          elsif word_count.call(orig_title[0, orig_title.index(":")]) > 5
            cur_title = orig_title
          end
        end
      elsif cur_title.length > 150 || cur_title.length < 15
        h_ones = @doc.css("h1")

        if h_ones.length == 1
          cur_title = get_inner_text(h_ones[0])
        end
      end

      cur_title = cur_title.strip.gsub(NORMALIZE, " ")

      # If we now have 4 words or fewer as our title, and either no
      # 'hierarchical' separators (\, /, > or ») were found in the original
      # title or we decreased the number of words by more than 1 word, use
      # the original title.
      cur_title_word_count = word_count.call(cur_title)
      if cur_title_word_count <= 4 &&
         (!title_had_hierarchical_separators ||
          cur_title_word_count !=
            word_count.call(orig_title.gsub(/\s[#{title_separators}]\s/, "")) - 1)
        cur_title = orig_title
      end

      cur_title
    end

    # Port of _getArticleMetadata (JS line 1783-1889)
    # Extracts metadata from <meta> tags and merges with JSON-LD data.
    # JSON-LD values (the +json_ld+ Hash from #get_json_ld) take precedence
    # over <meta>-derived values. All values are entity-unescaped at the end.
    def get_article_metadata(json_ld)
      metadata = {}
      values = {}

      meta_elements = @doc.css("meta")

      # property is a space-separated list of values
      property_pattern = /\s*(article|dc|dcterm|og|twitter)\s*:\s*(author|creator|description|published_time|title|site_name)\s*/i

      # name is a single value
      name_pattern = /\A\s*(?:(dc|dcterm|og|twitter|parsely|weibo:(article|webpage))\s*[-.:]\s*)?(author|creator|pub-date|description|title|site_name)\s*\z/i

      meta_elements.each do |element|
        element_name = element["name"]
        element_property = element["property"]
        content = element["content"]
        next unless content

        matches = nil
        name = nil

        if element_property
          matches = element_property.match(property_pattern)
          if matches
            # Convert to lowercase, and remove any whitespace
            name = matches[0].downcase.gsub(/\s/, "")
            values[name] = content.strip
          end
        end

        # Only consult name= when property= did not already match (JS parity).
        if !matches && element_name && name_pattern.match?(element_name)
          name = element_name
          if content
            # Convert to lowercase, remove whitespace, convert dots to colons
            name = name.downcase.gsub(/\s/, "").gsub(".", ":")
            values[name] = content.strip
          end
        end
      end

      # get title
      metadata["title"] =
        json_ld["title"] ||
        values["dc:title"] ||
        values["dcterm:title"] ||
        values["og:title"] ||
        values["weibo:article:title"] ||
        values["weibo:webpage:title"] ||
        values["title"] ||
        values["twitter:title"] ||
        values["parsely-title"]

      metadata["title"] ||= get_article_title

      # article:author is only trusted when it is not a URL (JS parity).
      article_author =
        if values["article:author"].is_a?(String) && !is_url?(values["article:author"])
          values["article:author"]
        end

      # get author
      metadata["byline"] =
        json_ld["byline"] ||
        values["dc:creator"] ||
        values["dcterm:creator"] ||
        values["author"] ||
        values["parsely-author"] ||
        article_author

      # get description
      metadata["excerpt"] =
        json_ld["excerpt"] ||
        values["dc:description"] ||
        values["dcterm:description"] ||
        values["og:description"] ||
        values["weibo:article:description"] ||
        values["weibo:webpage:description"] ||
        values["description"] ||
        values["twitter:description"]

      # get site name
      metadata["siteName"] = json_ld["site_name"] || values["og:site_name"]

      # get article published time
      metadata["publishedTime"] =
        json_ld["date_published"] ||
        values["article:published_time"] ||
        values["parsely-pub-date"] ||
        nil

      # Unescape HTML entities in all metadata values
      metadata["title"] = unescape_html_entities(metadata["title"])
      metadata["byline"] = unescape_html_entities(metadata["byline"])
      metadata["excerpt"] = unescape_html_entities(metadata["excerpt"])
      metadata["siteName"] = unescape_html_entities(metadata["siteName"])
      metadata["publishedTime"] = unescape_html_entities(metadata["publishedTime"])

      metadata
    end
  end
end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Readability
  # Heuristic pre-check: is this document worth running the full parser on?
  module Readerable
    module_function

    # Scores <p>, <pre>, <article> nodes (plus <div>s that directly contain
    # a <br>) by their text length and answers whether the cumulative score
    # clears +min_score+.
    #
    # For backward compatibility with the JS API, the second positional
    # argument may be a Proc used as the visibility checker instead of an
    # options hash.
    def probably_readerable?(doc, options_or_checker = {}, **kwargs)
      # Legacy JS-style call: a bare proc in second position is the checker.
      if options_or_checker.is_a?(Proc)
        kwargs[:visibility_checker] = options_or_checker
        options_or_checker = {}
      end
      opts = options_or_checker.is_a?(Hash) ? options_or_checker.merge(kwargs) : kwargs

      min_score          = opts.fetch(:min_score, 20)
      min_content_length = opts.fetch(:min_content_length, 140)
      checker            = opts.fetch(:visibility_checker, nil) || method(:node_visible?)

      candidates = doc.css("p, pre, article")

      # Some articles use a div > br structure instead of paragraphs, so
      # fold those parent divs into the candidate pool (deduplicated).
      brs = doc.css("div > br")
      unless brs.empty?
        pool = Set.new(candidates.to_a)
        brs.each { |br| pool.add(br.parent) }
        candidates = pool.to_a
      end

      total = 0.0
      candidates.any? do |node|
        next false unless checker.call(node)

        signature = "#{node['class']} #{node['id']}"
        next false if UNLIKELY_CANDIDATES.match?(signature) && !OK_MAYBE_CANDIDATE.match?(signature)
        next false if node.matches?("li p")

        length = node.text.strip.length
        next false if length < min_content_length

        total += Math.sqrt(length - min_content_length)
        total > min_score
      end
    end

    # Default visibility check: inline display:none, the hidden attribute,
    # and aria-hidden="true" (unless the node carries the "fallback-image"
    # class). NOTE: visibility:hidden is deliberately not inspected here,
    # per the comment on the original port.
    def node_visible?(node)
      return false if node['style'] =~ /display:\s*none/i
      return false unless node['hidden'].nil?

      if node['aria-hidden'] == "true" && !(node['class'] || "").include?("fallback-image")
        return false
      end

      true
    end
  end
end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+
5
module Readability
  # All regex patterns from Readability.js REGEXPS object.
  # NOTE: All tag name constants are LOWERCASE (Nokogiri convention).
  #
  # Porting note: JS regexes without the /m flag use ^/$ as whole-string
  # anchors; Ruby's ^/$ are *line* anchors and also match around embedded
  # newlines, so the correct Ruby equivalents are \A/\z (as already used by
  # WHITESPACE, HAS_CONTENT, HASH_URL, AD_WORDS, etc. below).

  # Flags toggled while grading candidates
  FLAG_STRIP_UNLIKELYS = 0x1
  FLAG_WEIGHT_CLASSES = 0x2
  FLAG_CLEAN_CONDITIONALLY = 0x4

  # Defaults (0 = no element limit)
  DEFAULT_MAX_ELEMS_TO_PARSE = 0
  DEFAULT_N_TOP_CANDIDATES = 5
  DEFAULT_CHAR_THRESHOLD = 500

  DEFAULT_TAGS_TO_SCORE = %w[section h2 h3 h4 h5 h6 p td pre].freeze

  # Regexps — ported from the JS REGEXPS object
  UNLIKELY_CANDIDATES = /-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i

  OK_MAYBE_CANDIDATE = /and|article|body|column|content|main|mathjax|shadow/i

  POSITIVE = /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i

  # " hid$" / "^hid " from the JS source are string anchors, hence \z / \A
  # here (line anchors would also match before/after embedded newlines).
  NEGATIVE = /-ad-|hidden|\Ahid\z| hid\z| hid |\Ahid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i

  EXTRANEOUS = /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i

  BYLINE = /byline|author|dateline|writtenby|p-author/i

  REPLACE_FONTS = /<(\/?)font[^>]*>/i

  NORMALIZE = /\s{2,}/

  # "live\.bilibili" dot escaped — an unescaped "." would match any character.
  VIDEOS = /\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq|bilibili|live\.bilibili)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i

  SHARE_ELEMENTS = /(\b|_)(share|sharedaddy)(\b|_)/i

  # \z replaces the JS string-end "$".
  NEXT_LINK = /(next|weiter|continue|>([^\|]|\z)|»([^\|]|\z))/i

  PREV_LINK = /(prev|earl|old|new|<|«)/i

  TOKENIZE = /\W+/

  WHITESPACE = /\A\s*\z/

  HAS_CONTENT = /\S\z/

  HASH_URL = /\A#.+/

  # One "url [density-descriptor]," entry of a srcset attribute;
  # \z replaces the JS string-end "$".
  SRCSET_URL = /(\S+)(\s+[\d.]+[xw])?(\s*(?:,|\z))/

  B64_DATA_URL = /\Adata:\s*([^\s;,]+)\s*;\s*base64\s*,/i

  # Commas as used in Latin, Sindhi, Chinese and various other scripts.
  # see: https://en.wikipedia.org/wiki/Comma#Comma_variants
  COMMAS = /\u{002C}|\u{060C}|\u{FE50}|\u{FE10}|\u{FE11}|\u{2E41}|\u{2E34}|\u{2E32}|\u{FF0C}/

  # See: https://schema.org/Article
  JSON_LD_ARTICLE_TYPES = /\A(Article|AdvertiserContentArticle|NewsArticle|AnalysisNewsArticle|AskPublicNewsArticle|BackgroundNewsArticle|OpinionNewsArticle|ReportageNewsArticle|ReviewNewsArticle|Report|SatiricalArticle|ScholarlyArticle|MedicalScholarlyArticle|SocialMediaPosting|BlogPosting|LiveBlogPosting|DiscussionForumPosting|TechArticle|APIReference)\z/

  AD_WORDS = /\A(ad(vertising|vertisement)?|pub(licité)?|werb(ung)?|广告|Реклама|Anuncio)\z/iu

  LOADING_WORDS = /\A(loading|正在加载|Загрузка|chargement|cargando)(…|\.\.\.)?\z/iu

  # Element/role lists — ALL LOWERCASE
  UNLIKELY_ROLES = %w[menu menubar complementary navigation alert alertdialog dialog].freeze

  DIV_TO_P_ELEMS = Set.new(%w[blockquote dl div img ol p pre table ul]).freeze

  ALTER_TO_DIV_EXCEPTIONS = %w[div article section p ol ul].freeze

  PRESENTATIONAL_ATTRIBUTES = %w[align background bgcolor border cellpadding cellspacing frame hspace rules style valign vspace].freeze

  DEPRECATED_SIZE_ATTRIBUTE_ELEMS = %w[table th td hr pre].freeze

  PHRASING_ELEMS = %w[abbr audio b bdo br button cite code data datalist dfn em embed i img input kbd label mark math meter noscript object output progress q ruby samp script select small span strong sub sup textarea time var wbr].freeze

  CLASSES_TO_PRESERVE = %w[page].freeze

  # Named entities handled by Metadata#unescape_html_entities.
  HTML_ESCAPE_MAP = {
    "lt" => "<",
    "gt" => ">",
    "amp" => "&",
    "quot" => '"',
    "apos" => "'",
  }.freeze
end
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Plain value record for a parsed article. Field names mirror the object
  # returned by Readability.js `parse()`, converted to snake_case.
  # `keyword_init: true` means construction uses keyword arguments:
  # Result.new(title: "...", content: "...").
  Result = Struct.new(
    :title,          # article title
    :byline,         # author / byline string
    :dir,            # text direction (presumably "ltr"/"rtl" — confirm against the parser)
    :lang,           # document language
    :content,        # extracted article content (HTML)
    :text_content,   # plain-text version of the content
    :length,         # length of the text content
    :excerpt,        # short description / excerpt
    :site_name,      # name of the publishing site
    :published_time, # publication time string
    keyword_init: true
  )
end
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
module Readability
  # Candidate scoring helpers ported from Readability.js. Scores are stored
  # in the @candidates hash (node => { content_score: Numeric }) rather than
  # on the DOM nodes as the JS original does.
  module Scoring
    private

    # Port of _initializeNode (JS line 903)
    # Sets up a node in @candidates with a base score derived from its tag name,
    # then adds the class/id weight.
    def initialize_node(node)
      base_score = case node.name
                   when "div"
                     5
                   when "pre", "td", "blockquote"
                     3
                   when "address", "ol", "ul", "dl", "dd", "dt", "li", "form"
                     -3
                   when "h1", "h2", "h3", "h4", "h5", "h6", "th"
                     -5
                   else
                     0
                   end

      @candidates[node] = { content_score: base_score + get_class_weight(node) }
    end

    # Port of _getClassWeight (JS line 2168)
    # Returns a weight (+/-25 per attribute) based on the node's class and id
    # attributes matching POSITIVE or NEGATIVE regexps. Returns 0 when the
    # FLAG_WEIGHT_CLASSES flag is disabled.
    def get_class_weight(node)
      return 0 unless flag_is_active?(FLAG_WEIGHT_CLASSES)

      weight = 0

      klass = node["class"]
      if klass && !klass.empty?
        weight -= 25 if NEGATIVE.match?(klass)
        weight += 25 if POSITIVE.match?(klass)
      end

      id = node["id"]
      if id && !id.empty?
        weight -= 25 if NEGATIVE.match?(id)
        weight += 25 if POSITIVE.match?(id)
      end

      weight
    end

    # Port of _getLinkDensity (JS line 2143)
    # Returns the ratio of anchor text length to total text length.
    # Fragment-only links (#...) count at 0.3 coefficient.
    def get_link_density(element)
      text_length = get_inner_text(element).length
      return 0 if text_length == 0

      link_length = 0.0

      element.css("a").each do |link_node|
        href = link_node["href"]
        coefficient = href && HASH_URL.match?(href) ? 0.3 : 1.0
        link_length += get_inner_text(link_node).length * coefficient
      end

      link_length / text_length
    end

    # Port of _getTextDensity (JS line 2440)
    # Returns the ratio of text inside elements matching +tags+ to total text
    # in element (whitespace-normalized via get_inner_text(..., true)).
    def get_text_density(element, tags)
      text_length = get_inner_text(element, true).length
      return 0 if text_length == 0

      children_length = 0
      get_all_nodes_with_tag(element, tags).each do |child|
        children_length += get_inner_text(child, true).length
      end

      children_length.to_f / text_length
    end

    # Port of _getCharCount (JS line 2102)
    # Counts occurrences of +separator+ in the element's inner text.
    # Uses String#scan instead of the JS-style `split(s).length - 1`:
    # Ruby's String#split drops trailing empty fields ("a,,".split(",") =>
    # ["a"]) and returns [] for "", which undercounted trailing separators
    # and yielded -1 for empty text; scan counts every occurrence and
    # returns 0 for empty text, matching the JS semantics.
    def get_char_count(element, separator = ",")
      get_inner_text(element).scan(separator).length
    end

    # Returns the content score for a candidate node, defaulting to 0.
    def content_score(node)
      @candidates.dig(node, :content_score) || 0
    end

    # Sets the content score for a candidate node.
    def set_content_score(node, score)
      @candidates[node] ||= {}
      @candidates[node][:content_score] = score
    end
  end
end