RubyGems - html2rss - Versions diffs - 0.17.0 → 0.18.0 - Mend

html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122)

checksums.yaml +4 -4
data/README.md +48 -656
data/exe/html2rss +1 -1
data/html2rss.gemspec +5 -2
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +115 -38
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/auto_source/scraper/json_state.rb (added)

@@ -0,0 +1,377 @@
+# frozen_string_literal: true
+require 'json'
+module Html2rss
+  class AutoSource
+    module Scraper
+      #
+      # Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
+      # or custom window globals. The scraper searches `<script type="application/json">`
+      # tags and well-known JavaScript globals for arrays of article-like hashes
+      # and normalises them to a structure compatible with HtmlExtractor.
+      class JsonState
+        include Enumerable
+        JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'
+        GLOBAL_ASSIGNMENT_PATTERNS = [
+          /(?:window|self|globalThis)\.__NEXT_DATA__\s*=\s*/m,
+          /(?:window|self|globalThis)\.__NUXT__\s*=\s*/m,
+          /(?:window|self|globalThis)\.STATE\s*=\s*/m,
+          /(?:window|self|globalThis)\.__REDUX_STATE__\s*=\s*/m,
+          /(?:window|self|globalThis)\.__PRELOADED_STATE__\s*=\s*/m,
+          /(?:window|self|globalThis)\.__APOLLO_STATE__\s*=\s*/m,
+          /(?:window|self|globalThis)\.__remixContext\s*=\s*/m,
+          /(?:window|self|globalThis)\.__sveltekit_data\s*=\s*/m,
+          /(?:window|self|globalThis)\.GATSBY_STATE\s*=\s*/m,
+          /(?:window|self|globalThis)\.__ember_meta\s*=\s*/m,
+          /(?:window|self|globalThis)\.angular\s*=\s*/m
+        ].freeze
+        TITLE_KEYS = %w[title headline name text].freeze
+        URL_KEYS = %w[url link href permalink slug path canonicalUrl shortUrl].freeze
+        DESCRIPTION_KEYS = %w[description summary excerpt dek subheading].freeze
+        IMAGE_KEYS = %w[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
+        PUBLISHED_AT_KEYS = %w[published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at
+                               createdAt created_at].freeze
+        CATEGORY_KEYS = %w[categories tags section sections topic topics channel].freeze
+        ID_KEYS = %w[id guid uuid slug key].freeze
+        # Scans DOM nodes for JSON payloads containing article data.
+        module DocumentScanner
+          module_function
+          def json_documents(parsed_body)
+            script_documents(parsed_body) + assignment_documents(parsed_body)
+          end
+          def script_documents(parsed_body)
+            parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { parse_json(_1.text) }
+          end
+          def assignment_documents(parsed_body)
+            parsed_body.css('script').filter_map { parse_assignment(_1.text) }
+          end
+          def parse_assignment(text)
+            payload = assignment_payload(text)
+            parse_json(payload) if payload
+          end
+          def assignment_payload(text)
+            trimmed = text.to_s.strip
+            return if trimmed.empty?
+            GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
+              next unless trimmed.match?(pattern)
+              payload = trimmed.sub(pattern, '')
+              return extract_assignment_payload(payload)
+            end
+            nil
+          end
+          def extract_assignment_payload(text)
+            extract_json_block(text) || text
+          end
+          def extract_json_block(text)
+            start_index = text.index(/[\[{]/)
+            return unless start_index
+            stop_index = scan_for_json_end(text, start_index)
+            text[start_index..stop_index] if stop_index
+          end
+          # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+          def scan_for_json_end(text, start_index)
+            stack = []
+            in_string = false
+            escape = false
+            text.each_char.with_index do |char, index|
+              next if index < start_index
+              if in_string
+                if escape
+                  escape = false
+                elsif char == '\\'
+                  escape = true
+                elsif char == '"'
+                  in_string = false
+                end
+                next
+              end
+              case char
+              when '"'
+                in_string = true
+              when '{'
+                stack << '}'
+              when '['
+                stack << ']'
+              when '}', ']'
+                expected = stack.pop
+                return index if expected == char && stack.empty?
+              end
+            end
+            nil
+          end
+          # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+          def parse_json(payload)
+            return unless payload
+            JSON.parse(payload, symbolize_names: true)
+          rescue JSON::ParserError => error
+            parse_js_object(payload, error)
+          end
+          def parse_js_object(payload, _original_error)
+            coerced = coerce_javascript_object(payload)
+            return unless coerced
+            # Some sites emit JavaScript object literals (unquoted keys, trailing commas).
+            # Coerce those payloads into valid JSON so we keep the same parsing pipeline.
+            JSON.parse(coerced, symbolize_names: true)
+          rescue JSON::ParserError => error
+            Html2rss::Log.debug("#{name}: failed to parse coerced JavaScript object (#{error.message})")
+            nil
+          end
+          def coerce_javascript_object(payload)
+            string = payload.dup
+            # KISS approach: mutate common JS literal quirks instead of a full parser.
+            strip_trailing_commas(quote_unquoted_keys(string))
+          end
+          def quote_unquoted_keys(jsonish)
+            jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
+              "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
+            end
+          end
+          def strip_trailing_commas(jsonish)
+            jsonish.gsub(/,(\s*[\]}])/, '\1')
+          end
+        end
+        private_constant :DocumentScanner
+        # Retrieves values from heterogeneous objects by probing multiple keys.
+        module ValueFinder
+          module_function
+          def fetch(object, keys)
+            case object
+            when Hash then fetch_from_hash(object, keys)
+            when Array then fetch_from_array(object, keys)
+            end
+          end
+          def fetch_from_hash(hash, keys)
+            keys.each do |key|
+              string_key = key.to_s
+              return hash[string_key] if hash.key?(string_key)
+              symbol_key = string_key.to_sym
+              return hash[symbol_key] if hash.key?(symbol_key)
+            end
+            fetch_nested(hash[:attributes] || hash['attributes'], keys) ||
+              fetch_nested(hash[:data] || hash['data'], keys)
+          end
+          def fetch_from_array(array, keys)
+            array.each do |entry|
+              result = fetch(entry, keys)
+              return result if result
+            end
+            nil
+          end
+          def fetch_nested(value, keys)
+            fetch(value, keys) if value
+          end
+        end
+        private_constant :ValueFinder
+        # Identifies arrays that look like collections of article hashes.
+        module CandidateDetector
+          module_function
+          def candidate_array?(document)
+            case document
+            when Array
+              return true if array_of_articles?(document)
+              document.any? { traversable_candidate?(_1) }
+            when Hash then document.each_value.any? { candidate_array?(_1) }
+            else false
+            end
+          end
+          def traversable_candidate?(value)
+            case value
+            when Array, Hash then candidate_array?(value)
+            else false
+            end
+          end
+          def array_of_articles?(array)
+            array.any? do |element|
+              next unless element.is_a?(Hash)
+              title_from(element) && url_from(element)
+            end
+          end
+          def title_from(object)
+            ValueFinder.fetch(object, TITLE_KEYS)
+          end
+          def url_from(object)
+            ValueFinder.fetch(object, URL_KEYS)
+          end
+        end
+        private_constant :CandidateDetector
+        # Shapes raw entries into the structure required downstream.
+        module ArticleNormalizer
+          module_function
+          # rubocop:disable Metrics/MethodLength
+          def normalise(entry, base_url:)
+            return unless entry.is_a?(Hash)
+            title = string(ValueFinder.fetch(entry, TITLE_KEYS))
+            description = string(ValueFinder.fetch(entry, DESCRIPTION_KEYS))
+            article_url = resolve_link(entry, keys: URL_KEYS, base_url:,
+                                              log_key: 'JsonState: invalid URL encountered')
+            return unless article_url
+            return if title.nil? && description.nil?
+            {
+              title:,
+              description:,
+              url: article_url,
+              image: resolve_link(entry, keys: IMAGE_KEYS, base_url:,
+                                         log_key: 'JsonState: invalid image URL encountered'),
+              published_at: string(ValueFinder.fetch(entry, PUBLISHED_AT_KEYS)),
+              categories: categories(entry),
+              id: identifier(entry, article_url)
+            }.compact
+          end
+          # rubocop:enable Metrics/MethodLength
+          def string(value)
+            trimmed = value.to_s.strip
+            trimmed unless trimmed.empty?
+          end
+          def resolve_link(entry, keys:, base_url:, log_key:)
+            value = ValueFinder.fetch(entry, keys)
+            value = ValueFinder.fetch(value, keys) if value.is_a?(Hash)
+            string = string(value)
+            return unless string
+            Url.from_relative(string, base_url)
+          rescue ArgumentError
+            Log.debug(log_key, url: string)
+            nil
+          end
+          # rubocop:disable Metrics/MethodLength
+          def categories(entry)
+            raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
+            names = case raw
+                    when Array then raw
+                    when Hash then raw.values
+                    when String then [raw]
+                    else []
+                    end
+            result = names.flat_map do |value|
+              case value
+              when Hash
+                string(ValueFinder.fetch(value, %w[name title label]))
+              else
+                string(value)
+              end
+            end.compact
+            result.uniq!
+            result unless result.empty?
+          end
+          # rubocop:enable Metrics/MethodLength
+          def identifier(entry, article_url)
+            value = ValueFinder.fetch(entry, ID_KEYS)
+            value = ValueFinder.fetch(value, ID_KEYS) if value.is_a?(Hash)
+            string(value) || article_url.to_s
+          end
+        end
+        private_constant :ArticleNormalizer
+        def self.options_key = :json_state
+        class << self
+          def articles?(parsed_body)
+            return false unless parsed_body
+            DocumentScanner.json_documents(parsed_body).any? { CandidateDetector.candidate_array?(_1) }
+          end
+          def json_documents(parsed_body)
+            DocumentScanner.json_documents(parsed_body)
+          end
+        end
+        def initialize(parsed_body, url:, **_opts)
+          @parsed_body = parsed_body
+          @url = url
+        end
+        attr_reader :parsed_body
+        def each
+          return enum_for(:each) unless block_given?
+          DocumentScanner.json_documents(parsed_body).each do |document|
+            discover_articles(document) do |article|
+              yield article if article
+            end
+          end
+        end
+        private
+        attr_reader :url
+        def discover_articles(document, &block)
+          case document
+          when Array then handle_array(document, &block)
+          when Hash then document.each_value { discover_articles(_1, &block) if traversable?(_1) }
+          end
+        end
+        def handle_array(array, &block)
+          if CandidateDetector.array_of_articles?(array)
+            array.each do |entry|
+              yield(ArticleNormalizer.normalise(entry, base_url: url))
+            end
+          else
+            array.each { discover_articles(_1, &block) if traversable?(_1) }
+          end
+        end
+        def traversable?(value)
+          value.is_a?(Array) || value.is_a?(Hash)
+        end
+      end
+    end
+  end
+end