html2rss 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +74 -0
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
  11. data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
  12. data/lib/html2rss/attribute_post_processors/template.rb +19 -11
  13. data/lib/html2rss/attribute_post_processors.rb +8 -0
  14. data/lib/html2rss/auto_source/article.rb +95 -0
  15. data/lib/html2rss/auto_source/channel.rb +79 -0
  16. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  17. data/lib/html2rss/auto_source/reducer.rb +48 -0
  18. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  19. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  20. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  23. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  24. data/lib/html2rss/auto_source/scraper.rb +33 -0
  25. data/lib/html2rss/auto_source.rb +77 -0
  26. data/lib/html2rss/cli.rb +10 -0
  27. data/lib/html2rss/config/channel.rb +4 -2
  28. data/lib/html2rss/config/selectors.rb +13 -2
  29. data/lib/html2rss/item.rb +8 -2
  30. data/lib/html2rss/utils.rb +5 -10
  31. data/lib/html2rss/version.rb +1 -1
  32. data/lib/html2rss.rb +21 -0
  33. metadata +30 -3
data/lib/html2rss/auto_source/channel.rb
@@ -0,0 +1,79 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Extracts channel information from
+     # 1. the HTML document's <head>.
+     # 2. the HTTP response.
+     class Channel
+       ##
+       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
+       # @param url [Addressable::URI] The URL of the HTML document.
+       # @param response [Faraday::Response] The HTTP response.
+       def initialize(parsed_body, url:, response:, articles: [])
+         @parsed_body = parsed_body
+         @url = url
+         @response = response
+         @articles = articles
+       end
+
+       def url = extract_url
+       def title = extract_title
+       def language = extract_language
+       def description = extract_description
+       def image = extract_image
+       def ttl = extract_ttl
+       def last_build_date = response.headers['last-modified']
+
+       def generator
+         "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
+       end
+
+       private
+
+       attr_reader :parsed_body, :response
+
+       def extract_url
+         @url.normalize.to_s
+       end
+
+       def extract_title
+         parsed_body.at_css('head > title')&.text
+       end
+
+       def extract_language
+         return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
+
+         parsed_body.at_css('[lang]')&.[]('lang')
+       end
+
+       def extract_description
+         parsed_body.at_css('meta[name="description"]')&.[]('content') || ''
+       end
+
+       def extract_image
+         url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
+         Html2rss::Utils.sanitize_url(url) if url
+       end
+
+       def extract_ttl
+         ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
+         return unless ttl
+
+         ttl.to_i.fdiv(60).ceil
+       end
+
+       def scraper_counts
+         scraper_counts = +''
+
+         @articles.each_with_object(Hash.new(0)) { |article, counts| counts[article.scraper] += 1 }
+                  .each do |klass, count|
+           scraper_counts.concat("[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]")
+         end
+
+         scraper_counts
+       end
+     end
+   end
+ end
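
Usage sketch (not part of the diff): driving the new Channel class directly. The Addressable::URI and Faraday objects are assumptions based on the @param tags and the `@url.normalize` call above; the AutoSource class presumably wires this up itself.

require 'html2rss'
require 'faraday'
require 'nokogiri'
require 'addressable/uri'

url      = Addressable::URI.parse('https://example.com/news')
response = Faraday.get(url.to_s) # supplies the 'last-modified' and 'cache-control' headers read above

channel = Html2rss::AutoSource::Channel.new(Nokogiri::HTML(response.body), url: url, response: response)
channel.title     # => content of <head><title>
channel.ttl       # => Cache-Control max-age converted to minutes (rounded up), or nil
channel.generator # => "html2rss V. ..." including per-scraper counts of the passed articles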
data/lib/html2rss/auto_source/cleanup.rb
@@ -0,0 +1,76 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Cleanup is responsible for cleaning up the extracted articles.
+     # It applies various strategies to filter and refine the article list.
+     # :reek:MissingSafeMethod { enabled: false }
+     class Cleanup
+       class << self
+         def call(articles, url:, keep_different_domain: false)
+           Log.debug "Cleanup: start with #{articles.size} articles"
+
+           articles.select!(&:valid?)
+
+           remove_short!(articles, :title)
+
+           deduplicate_by!(articles, :url)
+           deduplicate_by!(articles, :title)
+
+           keep_only_http_urls!(articles)
+           reject_different_domain!(articles, url) unless keep_different_domain
+
+           Log.debug "Cleanup: end with #{articles.size} articles"
+           articles
+         end
+
+         private
+
+         ##
+         # Removes articles with short values for a given key.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param key [Symbol] The key to check for short values.
+         # @param min_words [Integer] The minimum number of words required.
+         def remove_short!(articles, key = :title, min_words: 2)
+           articles.reject! do |article|
+             value = article.public_send(key)
+             value.nil? || value.to_s.split.size < min_words
+           end
+         end
+
+         ##
+         # Deduplicates articles by a given key.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param key [Symbol] The key to deduplicate by.
+         def deduplicate_by!(articles, key)
+           seen = {}
+           articles.reject! do |article|
+             value = article.public_send(key)
+             value.nil? || seen.key?(value).tap { seen[value] = true }
+           end
+         end
+
+         ##
+         # Keeps only articles with HTTP or HTTPS URLs.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         def keep_only_http_urls!(articles)
+           articles.select! { |article| %w[http https].include?(article.url&.scheme) }
+         end
+
+         ##
+         # Rejects articles that have a URL not on the same domain as the source.
+         #
+         # @param articles [Array<Article>] The list of articles to process.
+         # @param base_url [Addressable::URI] The source URL to compare against.
+         def reject_different_domain!(articles, base_url)
+           base_host = base_url.host
+           articles.select! { |article| article.url&.host == base_host }
+         end
+       end
+     end
+   end
+ end
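
Illustration (not part of the diff): the filtering rules above, exercised with a hypothetical stand-in for Html2rss::AutoSource::Article; the real class also responds to #valid?, #title and #url.

require 'html2rss'
require 'addressable/uri'

# FakeArticle is a made-up stand-in, only for this sketch.
FakeArticle = Struct.new(:title, :url, keyword_init: true) do
  def valid? = !title.nil?
end

articles = [
  FakeArticle.new(title: 'Two words',  url: Addressable::URI.parse('https://example.com/a')),
  FakeArticle.new(title: 'Short',      url: Addressable::URI.parse('https://example.com/b')),  # dropped: fewer than 2 words
  FakeArticle.new(title: 'Other host', url: Addressable::URI.parse('https://elsewhere.org/c')) # dropped: different domain
]

Html2rss::AutoSource::Cleanup.call(articles, url: Addressable::URI.parse('https://example.com'))
# => only the first article remains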
data/lib/html2rss/auto_source/reducer.rb
@@ -0,0 +1,48 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Reducer is responsible for reducing the list of articles.
+     # It keeps only the longest attributes of articles with the same URL.
+     # It also filters out invalid articles.
+     class Reducer
+       class << self
+         def call(articles, **_options)
+           Log.debug "Reducer: inited with #{articles.size} articles"
+
+           reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
+         end
+
+         private
+
+         # @param articles [Array<Article>]
+         # @return [Array<Article>] reduced articles
+         def reduce_by_keeping_longest_values(articles, keep:, &)
+           grouped_by_block = articles.group_by(&)
+           grouped_by_block.each_with_object([]) do |(_key, grouped_articles), result|
+             memo_object = {}
+             grouped_articles.each do |article_hash|
+               keep_longest_values(memo_object, article_hash, keep:)
+             end
+
+             result << Article.new(**memo_object)
+           end
+         end
+
+         def keep_longest_values(memo_object, article_hash, keep:)
+           article_hash.each do |key, value|
+             next if value.eql?(memo_object[key])
+
+             if keep.include?(key)
+               memo_object[key] ||= []
+               memo_object[key] << value
+             elsif value && value.to_s.size > memo_object[key].to_s.size
+               memo_object[key] = value
+             end
+           end
+         end
+       end
+     end
+   end
+ end
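
Illustration (not part of the diff): a standalone sketch of the merge policy described above, using plain hashes. `merge_keeping_longest` is a hypothetical helper, not html2rss API: per key the longest value wins, while keys listed in keep: accumulate into an array.

def merge_keeping_longest(hashes, keep: [:scraper])
  hashes.each_with_object({}) do |hash, memo|
    hash.each do |key, value|
      if keep.include?(key)
        (memo[key] ||= []) << value
      elsif value.to_s.size > memo[key].to_s.size
        memo[key] = value
      end
    end
  end
end

merge_keeping_longest([
  { title: 'A post',                 description: 'short', scraper: :schema },
  { title: 'A post about something', description: '',      scraper: :semantic_html }
])
# => { title: 'A post about something', description: 'short', scraper: [:schema, :semantic_html] }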
data/lib/html2rss/auto_source/rss_builder.rb
@@ -0,0 +1,68 @@
+ # frozen_string_literal: true
+
+ require 'rss'
+
+ module Html2rss
+   class AutoSource
+     ##
+     # Converts the autosourced channel and articles to an RSS feed.
+     class RssBuilder
+       def self.add_guid(article, maker)
+         maker.guid.tap do |guid|
+           guid.content = article.guid
+           guid.isPermaLink = false
+         end
+       end
+
+       def self.add_image(article, maker)
+         url = article.image || return
+
+         maker.enclosure.tap do |enclosure|
+           enclosure.url = url
+           enclosure.type = Html2rss::Utils.guess_content_type_from_url(url)
+           enclosure.length = 0
+         end
+       end
+
+       def initialize(channel:, articles:)
+         @channel = channel
+         @articles = articles
+       end
+
+       def call
+         RSS::Maker.make('2.0') do |maker|
+           make_channel(maker.channel)
+           make_items(maker)
+         end
+       end
+
+       private
+
+       attr_reader :channel, :articles
+
+       def make_channel(maker)
+         %i[language title description ttl].each do |key|
+           maker.public_send(:"#{key}=", channel.public_send(key))
+         end
+
+         maker.link = channel.url
+         maker.generator = channel.generator
+         maker.updated = channel.last_build_date
+       end
+
+       def make_items(maker)
+         articles.each do |article|
+           maker.items.new_item do |item_maker|
+             RssBuilder.add_guid(article, item_maker)
+             RssBuilder.add_image(article, item_maker)
+
+             item_maker.title = article.title
+             item_maker.description = article.description
+             item_maker.pubDate = article.published_at
+             item_maker.link = article.url
+           end
+         end
+       end
+     end
+   end
+ end
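
Usage sketch (not part of the diff), continuing the Channel example above; AutoSource#call presumably wires the channel and the scraped articles together itself.

rss = Html2rss::AutoSource::RssBuilder.new(channel: channel, articles: []).call
puts rss.to_s # RSS 2.0 XML produced via RSS::Maker.make('2.0')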
data/lib/html2rss/auto_source/scraper/schema/base.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ require 'date'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class Schema
+         ##
+         # Base class for Schema.org schema_objects.
+         #
+         # @see https://schema.org/Article
+         class Base
+           DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+           def initialize(schema_object, url:)
+             @schema_object = schema_object
+             @url = url
+           end
+
+           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+           def call
+             DEFAULT_ATTRIBUTES.to_h do |attribute|
+               [attribute, public_send(attribute)]
+             end
+           end
+
+           def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+           def title = schema_object[:title]
+
+           def description
+             [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+               .max_by { |desc| desc.to_s.size }
+           end
+
+           # @return [Addressable::URI, nil] the URL of the schema object
+           def url
+             url = schema_object[:url]
+             if url.to_s.empty?
+               Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+               return
+             end
+
+             Utils.build_absolute_url_from_relative(url, @url)
+           end
+
+           def image = images.first || nil
+           def published_at = schema_object[:datePublished]
+
+           private
+
+           attr_reader :schema_object
+
+           def images
+             Array(schema_object[:image]).compact
+           end
+         end
+       end
+     end
+   end
+ end
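
Illustration (not part of the diff): feeding a parsed JSON-LD object straight into Base. Keys are symbols, matching parse_script_tag's symbolize_names: true in schema.rb below; resolving the relative URL relies on Utils.build_absolute_url_from_relative as used above.

require 'html2rss'
require 'addressable/uri'

schema_object = { :@type => 'NewsArticle', title: 'Hello', url: '/hello',
                  description: 'Greeting', datePublished: '2024-06-01' }

Html2rss::AutoSource::Scraper::Schema::Base.new(
  schema_object, url: Addressable::URI.parse('https://example.com')
).call
# => { id: '/hello', title: 'Hello', description: 'Greeting',
#      url: <absolute URL for /hello>, image: nil, published_at: '2024-06-01' }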
data/lib/html2rss/auto_source/scraper/schema.rb
@@ -0,0 +1,122 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles from Schema.org objects, by looking for the objects in:
+       #
+       # 1. <script type="application/ld+json"> "schema" tag.
+       # 2. tbd
+       #
+       # See:
+       # 1. https://schema.org/NewsArticle
+       # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+       class Schema
+         include Enumerable
+
+         TAG_SELECTOR = 'script[type="application/ld+json"]'
+         SCHEMA_OBJECT_TYPES = %w[
+           AdvertiserContentArticle
+           AnalysisNewsArticle
+           APIReference
+           Article
+           AskPublicNewsArticle
+           BackgroundNewsArticle
+           BlogPosting
+           DiscussionForumPosting
+           LiveBlogPosting
+           NewsArticle
+           OpinionNewsArticle
+           Report
+           ReportageNewsArticle
+           ReviewNewsArticle
+           SatiricalArticle
+           ScholarlyArticle
+           SocialMediaPosting
+           TechArticle
+         ].to_set.freeze
+
+         class << self
+           def articles?(parsed_body)
+             parsed_body.css(TAG_SELECTOR).any? do |script|
+               SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+             end
+           end
+
+           ##
+           # Returns a flat array
+           # of all supported schema objects
+           # by recursively traversing the `from` object.
+           #
+           # @param object [Hash, Array]
+           # @return [Array<Hash>] the schema_objects, or an empty array
+           # :reek:DuplicateMethodCall
+           def from(object)
+             case object
+             when Nokogiri::XML::Element
+               from(parse_script_tag(object))
+             when Hash
+               supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+             when Array
+               object.flat_map { |item| from(item) }
+             else
+               []
+             end
+           end
+
+           def supported_schema_object?(object)
+             scraper_for_schema_object(object) ? true : false
+           end
+
+           ##
+           # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+           def scraper_for_schema_object(schema_object)
+             if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+               Base
+             else
+               Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+               nil
+             end
+           end
+
+           private
+
+           def parse_script_tag(script_tag)
+             JSON.parse(script_tag.text, symbolize_names: true)
+           rescue JSON::ParserError => error
+             Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+             []
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         ##
+         # @yield [Hash] Each scraped article_hash
+         # @return [Array<Hash>] the scraped article_hashes
+         def each(&)
+           schema_objects.filter_map do |schema_object|
+             next unless (klass = self.class.scraper_for_schema_object(schema_object))
+             next unless (article_hash = klass.new(schema_object, url:).call)
+
+             yield article_hash
+           end
+         end
+
+         private
+
+         def schema_objects
+           @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+             Schema.from(tag)
+           end
+         end
+
+         attr_reader :parsed_body, :url
+       end
+     end
+   end
+ end
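
Usage sketch (not part of the diff): detecting and iterating schema.org articles in a parsed page.

require 'html2rss'
require 'nokogiri'
require 'addressable/uri'

parsed_body = Nokogiri::HTML(<<~HTML)
  <script type="application/ld+json">
    {"@type": "NewsArticle", "title": "Hello", "url": "https://example.com/hello"}
  </script>
HTML

Html2rss::AutoSource::Scraper::Schema.articles?(parsed_body) # => true

scraper = Html2rss::AutoSource::Scraper::Schema.new(parsed_body,
                                                    url: Addressable::URI.parse('https://example.com'))
scraper.each { |article_hash| puts article_hash[:title] } # prints "Hello"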
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # ArticleExtractor is responsible for extracting the details of an article.
+         # It focuses on finding a headline first, and from it traverses the DOM
+         # upwards as far as possible to find the other details.
+         class Extractor
+           INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+           HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+           NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                    INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+           def self.visible_text_from_tag(tag, separator: ' ')
+             text = if (children = tag.children).empty?
+                      tag.text.strip
+                    else
+                      children.filter_map do |child|
+                        next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                        visible_text_from_tag(child)
+                      end.join(separator)
+                    end
+
+             return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+             sanitized_text
+           end
+
+           def initialize(article_tag, url:)
+             @article_tag = article_tag
+             @url = url
+             @heading = find_heading
+             @extract_url = find_url
+           end
+
+           # @return [Hash, nil] The scraped article or nil.
+           def call
+             return unless heading
+
+             {
+               title: extract_title,
+               url: extract_url,
+               image: extract_image,
+               description: extract_description,
+               id: generate_id,
+               published_at: extract_published_at
+             }
+           end
+
+           private
+
+           attr_reader :article_tag, :url, :heading, :extract_url
+
+           def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+           # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+           def extract_published_at
+             times = article_tag.css('time[datetime]')
+                                .filter_map do |tag|
+               DateTime.parse(tag['datetime'])
+             rescue ArgumentError, TypeError
+               nil
+             end
+
+             times.min
+           end
+
+           def find_heading
+             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+             smallest_heading = heading_tags.keys.min
+             heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
+           end
+
+           def extract_title
+             @extract_title ||= if heading.children.empty? && heading.text
+                                  visible_text_from_tag(heading)
+                                else
+                                  visible_text_from_tag(
+                                    article_tag.css(HEADING_TAGS.join(','))
+                                               .max_by { |tag| tag.text.size }
+                                  )
+                                end
+           end
+
+           def extract_description
+             text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+             return text if text
+
+             description = visible_text_from_tag(article_tag)
+             return nil unless description
+
+             title_text = extract_title
+             description.gsub!(title_text, '') if title_text
+             description.strip!
+             description.empty? ? nil : description
+           end
+
+           def find_url
+             closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                 selector: 'a[href]:not([href=""])')
+             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+           end
+
+           def extract_image
+             Image.call(article_tag, url:)
+           end
+
+           def generate_id
+             [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path].compact.reject(&:empty?).first
+           end
+         end
+       end
+     end
+   end
+ end
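
Illustration (not part of the diff): the visible-text helper above skips invisible tags (svg, script, noscript, style, template) and collapses whitespace.

require 'html2rss'
require 'nokogiri'

tag = Nokogiri::HTML.fragment('<p>Hello <script>ignored()</script> <b>world</b></p>').at_css('p')
Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.visible_text_from_tag(tag)
# => "Hello world"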
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # Image is responsible for extracting image URLs from the article_tag.
+         class Image
+           def self.call(article_tag, url:)
+             img_src = from_source(article_tag) ||
+                       from_img(article_tag) ||
+                       from_style(article_tag)
+
+             Utils.build_absolute_url_from_relative(img_src, url) if img_src
+           end
+
+           def self.from_img(article_tag)
+             article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+           end
+
+           ##
+           # Extracts the largest image source from the srcset attribute
+           # of an img tag or a source tag inside a picture tag.
+           #
+           # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+           def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+             hash = article_tag.css('img[srcset], picture > source[srcset]')
+                               .flat_map { |source| source['srcset'].to_s.split(',') }
+                               .filter_map do |line|
+               width, url = line.split.reverse
+               next if url.nil? || url.start_with?('data:')
+
+               width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+               [width_value, url.strip]
+             end.to_h
+
+             hash[hash.keys.max]
+           end
+
+           def self.from_style(article_tag)
+             article_tag.css('[style*="url"]')
+                        .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                        .reject { |src| !src || src.start_with?('data:') }
+                        .max_by(&:size)
+           end
+         end
+       end
+     end
+   end
+ end
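
Illustration (not part of the diff): from_source picks the srcset candidate with the largest width, and Image.call resolves it against the page URL.

require 'html2rss'
require 'nokogiri'
require 'addressable/uri'

article_tag = Nokogiri::HTML.fragment(<<~HTML)
  <article>
    <img src="/img/s.jpg" srcset="/img/s.jpg 480w, /img/l.jpg 1024w">
  </article>
HTML

Html2rss::AutoSource::Scraper::SemanticHtml::Image.call(
  article_tag, url: Addressable::URI.parse('https://example.com')
)
# => absolute URL for /img/l.jpg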