RubyGems - html2rss - Versions diffs - 0.17.0 → 0.18.0 - Mend

html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

checksums.yaml +4 -4
data/README.md +48 -656
data/exe/html2rss +1 -1
data/html2rss.gemspec +5 -2
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +115 -38
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/auto_source/scraper/microdata.rb ADDED Viewed

@@ -0,0 +1,399 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      # Scrapes Schema.org Microdata items embedded directly in HTML markup.
+      class Microdata
+        include Enumerable
+        ITEM_SELECTOR = '[itemscope][itemtype]'
+        SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
+        VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
+        # Key under which options for this scraper are looked up.
+        #
+        # @return [Symbol] always `:microdata`.
+        def self.options_key
+          :microdata
+        end
+        class << self
+          # Tells whether the document holds at least one supported top-level Microdata item.
+          #
+          # @param parsed_body [#css, nil] parsed document, or nil when nothing was parsed.
+          # @return [Boolean]
+          def articles?(parsed_body)
+            !supported_roots(parsed_body).empty?
+          end
+          # Lists every element matching ITEM_SELECTOR that qualifies as a supported root.
+          #
+          # @param parsed_body [#css, nil] parsed document, or nil when nothing was parsed.
+          # @return [Array] matching nodes; empty when `parsed_body` is nil.
+          def supported_roots(parsed_body)
+            parsed_body ? parsed_body.css(ITEM_SELECTOR).select { |node| supported_root?(node) } : []
+          end
+          # A root must declare a supported type and must not be a property of another item.
+          #
+          # @param node [#[], #attribute, #ancestors] candidate element.
+          # @return [Boolean, nil] nil when the element declares no supported type.
+          def supported_root?(node)
+            type_name = supported_type_name(node)
+            type_name && top_level_item?(node)
+          end
+          # Picks the first declared type that appears in SUPPORTED_TYPES.
+          #
+          # @param node [#[]] element whose `itemtype` attribute is inspected.
+          # @return [String, nil] bare type name, or nil when none is supported.
+          def supported_type_name(node)
+            normalized_types(node['itemtype']).detect { |type_name| SUPPORTED_TYPES.include?(type_name) }
+          end
+          # Reduces each whitespace-separated itemtype entry to the segment after its last `/` and `#`.
+          #
+          # @param itemtype [String, nil] raw `itemtype` attribute value.
+          # @return [Array<String>] non-empty type names in document order.
+          def normalized_types(itemtype)
+            itemtype.to_s.split
+                    .map { |iri| iri.split('/').last.to_s.split('#').last.to_s }
+                    .reject(&:empty?)
+          end
+          # An item is top-level when it carries no itemprop itself and no ancestor is an
+          # item (itemscope) that is also a property (itemprop) of something else.
+          #
+          # @param node [#attribute, #ancestors] candidate element.
+          # @return [Boolean]
+          def top_level_item?(node)
+            !node.attribute('itemprop') &&
+              node.ancestors.none? { |parent| parent.attribute('itemscope') && parent.attribute('itemprop') }
+          end
+        end
+        ##
+        # Creates a Microdata scraper around an already parsed response body.
+        #
+        # @param parsed_body [Nokogiri::HTML5::Document, Nokogiri::HTML4::Document, Nokogiri::XML::Node, nil]
+        #   the parsed response body that is searched for top-level Microdata items.
+        # @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
+        # @param _opts [Hash] scraper-specific options; accepted but not used.
+        # @return [void]
+        def initialize(parsed_body, url:, **_opts)
+          @url = url
+          @parsed_body = parsed_body
+        end
+        ##
+        # Walks the supported Microdata roots and yields each one that converts into an article.
+        #
+        # @yieldparam article [Hash<Symbol, Object>] the normalized article attributes.
+        # @return [Enumerator, void] an enumerator when no block is given.
+        def each
+          return to_enum(:each) unless block_given?
+          self.class.supported_roots(parsed_body).each do |root|
+            candidate = article_from(root)
+            next unless candidate
+            yield candidate
+          end
+        end
+        private
+        attr_reader :parsed_body, :url
+        # Converts one Microdata root into an article hash.
+        #
+        # @param root [Nokogiri::XML::Node] supported top-level item element.
+        # @return [Hash<Symbol, Object>, nil] compacted article attributes, or nil when no schema
+        #   object could be built or the result fails `valid_article?`.
+        def article_from(root)
+          return unless (schema_object = SchemaObjectBuilder.call(root))
+          candidate = Schema::Thing.new(schema_object, url: url).call.compact
+          candidate if valid_article?(candidate)
+        end
+        # An article is usable when it has a URL plus a title or a description.
+        #
+        # @param article [Hash<Symbol, Object>] compacted article attributes.
+        # @return [Object, false, nil] false without a URL; otherwise the title, the description,
+        #   or nil when both are missing.
+        def valid_article?(article)
+          article[:url] ? (article[:title] || article[:description]) : false
+        end
+        # Extracts direct Microdata itemprop values for a single item root.
+        #
+        # Elements that open their own item scope are parsed recursively into hashes; every other
+        # property resolves to the first non-blank value attribute or, failing that, the element text.
+        module ItemParser
+          module_function
+          # Builds a property hash from the itemprop elements that belong directly to `root`.
+          #
+          # @param root [Nokogiri::XML::Node] element carrying the item scope.
+          # @return [Hash<Symbol, Object>] values keyed by property name; repeated names become Arrays.
+          def call(root)
+            {}.tap do |properties|
+              direct_properties(root).each { append_properties!(properties, _1) }
+            end
+          end
+          # Stores the value of `node` under every name listed in its itemprop attribute.
+          #
+          # @param properties [Hash<Symbol, Object>] accumulator, mutated in place.
+          # @param node [Nokogiri::XML::Node] element carrying an itemprop attribute.
+          # @return [void]
+          def append_properties!(properties, node)
+            value = property_value(node)
+            return if blank_value?(value)
+            property_names(node).each do |name|
+              append(properties, name.to_sym, value)
+            end
+          end
+          # Selects the itemprop descendants of `root` that are not owned by a nested item.
+          #
+          # @param root [Nokogiri::XML::Node] element carrying the item scope.
+          # @return [Array<Nokogiri::XML::Node>]
+          def direct_properties(root)
+            root.css('[itemprop]').select { direct_property?(root, _1) }
+          end
+          # A property is direct when no element between it and `root` opens another item scope.
+          #
+          # @param root [Nokogiri::XML::Node] element carrying the item scope.
+          # @param node [Nokogiri::XML::Node] candidate property element.
+          # @return [Boolean]
+          def direct_property?(root, node)
+            return false if node == root
+            node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
+          end
+          # Splits the itemprop attribute into its whitespace-separated names.
+          #
+          # @param node [Nokogiri::XML::Node] element carrying an itemprop attribute.
+          # @return [Array<String>] non-empty property names.
+          def property_names(node)
+            node['itemprop'].to_s.split.filter_map do |name|
+              stripped = name.strip
+              stripped unless stripped.empty?
+            end
+          end
+          # Resolves the value of a property element.
+          #
+          # @param node [Nokogiri::XML::Node] element carrying an itemprop attribute.
+          # @return [Hash, String, nil] nested item hash, attribute or text value, or nil when blank.
+          def property_value(node)
+            value = if node.attribute('itemscope')
+                      nested_item(node)
+                    else
+                      attribute_value(node) || text_value(node)
+                    end
+            value unless blank_value?(value)
+          end
+          # Parses a nested item and records its type and id under `:@type` / `:@id`.
+          #
+          # @param node [Nokogiri::XML::Node] property element that opens its own item scope.
+          # @return [Hash<Symbol, Object>]
+          def nested_item(node)
+            item = call(node)
+            itemtype = node['itemtype']
+            itemid = node['itemid']
+            item[:@type] = Microdata.normalized_types(itemtype).first if itemtype
+            item[:@id] = itemid if present?(itemid)
+            item
+          end
+          # Returns the first non-blank attribute listed in VALUE_ATTRIBUTES.
+          #
+          # @param node [Nokogiri::XML::Node] property element.
+          # @return [String, nil]
+          def attribute_value(node)
+            VALUE_ATTRIBUTES.each do |attribute|
+              value = node[attribute]
+              return value if present?(value)
+            end
+            nil
+          end
+          # Returns the stripped text content of the element.
+          #
+          # @param node [Nokogiri::XML::Node] property element.
+          # @return [String, nil] nil when the text is empty after stripping.
+          def text_value(node)
+            value = node.text.to_s.strip
+            value unless value.empty?
+          end
+          # Adds `value` under `key`, gathering repeated keys into an Array.
+          #
+          # The existing value is wrapped with an Array literal instead of `Kernel#Array`:
+          # `Array(hash)` converts a Hash into `[key, value]` pairs, which would destroy a
+          # nested item as soon as the same property name occurs a second time.
+          #
+          # @param properties [Hash<Symbol, Object>] accumulator, mutated in place.
+          # @param key [Symbol] property name.
+          # @param value [Object] property value; blank values are ignored.
+          # @return [void]
+          def append(properties, key, value)
+            return if blank_value?(value)
+            unless properties.key?(key)
+              properties[key] = value
+              return
+            end
+            existing = properties[key]
+            properties[key] = existing.is_a?(Array) ? (existing << value) : [existing, value]
+          end
+          # Treats nil, whitespace-only strings and empty collections as blank.
+          #
+          # @param value [Object]
+          # @return [Boolean]
+          def blank_value?(value)
+            case value
+            when nil then true
+            when String then value.strip.empty?
+            when Array, Hash then value.empty?
+            else false
+            end
+          end
+          # Inverse of `blank_value?`.
+          #
+          # @param value [Object]
+          # @return [Boolean]
+          def present?(value)
+            !blank_value?(value)
+          end
+        end
+        private_constant :ItemParser
+        # Shared value normalization helpers for Microdata property conversion.
+        #
+        # Single-valued helpers (`url_value`, `image_value`, `first_string`) look only at the first
+        # entry of a repeated property. Multi-valued helpers (`normalize_about`, `string_or_array`,
+        # `array_value`) keep every entry of a repeated property.
+        module ValueNormalizer
+          module_function
+          # Returns the first candidate that yields a URL-like string.
+          #
+          # @param values [Array<Object>] candidates in priority order; hashes are read via `:url` then `:@id`.
+          # @return [String, nil]
+          def url_value(*values)
+            values.each do |value|
+              candidate = extract_nested_value(value, :url, :@id)
+              return candidate.to_s if present?(candidate)
+            end
+            nil
+          end
+          # Returns the first candidate that normalizes to a usable image value.
+          #
+          # @param values [Array<Object>] candidates in priority order.
+          # @return [String, Hash, nil]
+          def image_value(*values)
+            values.each do |value|
+              candidate = normalize_image(value)
+              return candidate if present?(candidate)
+            end
+            nil
+          end
+          # Keeps strings and hashes as they are and stringifies anything else.
+          #
+          # @param value [Object] raw image property; only the first entry of an Array is used.
+          # @return [String, Hash, nil]
+          def normalize_image(value)
+            candidate = unwrap(value)
+            return unless present?(candidate)
+            return candidate if candidate.is_a?(String) || candidate.is_a?(Hash)
+            candidate.to_s
+          end
+          # Normalizes the `about` property into a list of `{ name: }` hashes and strings.
+          #
+          # Every entry of a repeated property is kept; reducing the value to its first entry
+          # beforehand would drop all but one subject.
+          #
+          # @param value [Array, Hash, String, nil] raw `about` property.
+          # @return [Array<Hash, String>, nil] nil when nothing usable remains.
+          def normalize_about(value)
+            items = value.is_a?(Array) ? value : [value]
+            values = items.filter_map { normalize_about_item(_1) }
+            values unless values.empty?
+          end
+          # Converts one `about` entry; hashes without a name and other types are dropped.
+          #
+          # @param item [Object]
+          # @return [Hash, String, nil]
+          def normalize_about_item(item)
+            case item
+            when Hash
+              name = item[:name]
+              { name: name.to_s } if name
+            when String then item
+            end
+          end
+          # Returns a String for a single value and an Array of Strings for a repeated property.
+          #
+          # A repeated property with exactly one usable string collapses to that string, so
+          # single-valued input yields the same result whether or not it arrives wrapped.
+          #
+          # @param value [Object] raw property value.
+          # @return [String, Array<String>, nil]
+          def string_or_array(value)
+            return stringify(value) unless value.is_a?(Array)
+            result = string_values(value)
+            result.size > 1 ? result : result.first
+          end
+          # Flattens all given properties into one de-duplicated list of strings.
+          #
+          # @param values [Array<Object>] raw property values, each a scalar or an Array.
+          # @return [Array<String>, nil] nil when no usable string remains.
+          def array_value(*values)
+            result = values.flat_map { |value| string_values(value.is_a?(Array) ? value : [value]) }.uniq
+            result unless result.empty?
+          end
+          # Stringifies each entry and drops the ones that cannot be represented as a string.
+          #
+          # @param values [Array<Object>]
+          # @return [Array<String>]
+          def string_values(values)
+            values.filter_map { stringify(_1) }
+          end
+          # Returns the first candidate that stringifies to a non-blank value.
+          #
+          # @param values [Array<Object>] candidates in priority order; only the first entry of an Array is used.
+          # @return [String, nil]
+          def first_string(*values)
+            values.each do |value|
+              candidate = stringify(unwrap(value))
+              return candidate if present?(candidate)
+            end
+            nil
+          end
+          # Reads the first present key from a hash value; non-hash values are returned unchanged.
+          #
+          # @param value [Object] raw property value; only the first entry of an Array is used.
+          # @param keys [Array<Symbol>] keys to try in order.
+          # @return [Object, nil]
+          def extract_nested_value(value, *keys)
+            candidate = unwrap(value)
+            return candidate unless candidate.is_a?(Hash)
+            keys.each do |key|
+              nested_value = candidate[key]
+              return nested_value if present?(nested_value)
+            end
+            nil
+          end
+          # Reduces a repeated property to its first entry.
+          #
+          # @param value [Object]
+          # @return [Object]
+          def unwrap(value)
+            value.is_a?(Array) ? value.first : value
+          end
+          # Converts scalars to strings; blank values, hashes and arrays yield nil.
+          #
+          # @param value [Object]
+          # @return [String, nil]
+          def stringify(value)
+            return unless present?(value)
+            return value if value.is_a?(String)
+            return if value.is_a?(Hash) || value.is_a?(Array)
+            value.to_s
+          end
+          # False for nil, whitespace-only strings and empty collections.
+          #
+          # @param value [Object]
+          # @return [Boolean]
+          def present?(value)
+            case value
+            when nil then false
+            when String then !value.strip.empty?
+            when Array, Hash then !value.empty?
+            else true
+            end
+          end
+        end
+        private_constant :ValueNormalizer
+        # Turns the raw property hash of a Microdata root into the schema-like hash consumed downstream.
+        #
+        # Helper methods remove the keys they consume from `properties` via `Hash#delete`.
+        module SchemaObjectBuilder
+          module_function
+          extend ValueNormalizer
+          # Builds the schema-like object for a root element.
+          #
+          # @param root [Nokogiri::XML::Node] Microdata item root.
+          # @return [Hash<Symbol, Object>, nil] nil when the root declares no supported type.
+          def call(root)
+            type = Microdata.supported_type_name(root)
+            compact_object(type, root, ItemParser.call(root)) if type
+          end
+          # Assembles base and category attributes and removes nil entries.
+          #
+          # @param type [String] supported type name.
+          # @param root [Nokogiri::XML::Node] Microdata item root.
+          # @param properties [Hash<Symbol, Object>] parsed item properties, consumed in place.
+          # @return [Hash<Symbol, Object>]
+          def compact_object(type, root, properties)
+            base_attributes(type, root, properties)
+              .tap { |attributes| merge_categories!(attributes, properties) }
+              .compact
+          end
+          # Collects type, id, text, link and media attributes in that key order.
+          #
+          # @return [Hash<Symbol, Object>] attributes, possibly containing nil values.
+          def base_attributes(type, root, properties)
+            identifier = first_string(root['itemid'], properties.delete(:identifier))
+            {
+              '@type': type,
+              '@id': identifier,
+              **text_attributes(properties),
+              **link_attributes(properties, identifier),
+              **media_attributes(properties)
+            }
+          end
+          # Picks the title from `headline`, `title` or `name`, in that priority.
+          #
+          # @return [String, nil]
+          def title(properties)
+            candidates = %i[headline title name].map { |key| properties.delete(key) }
+            first_string(*candidates)
+          end
+          # Extracts the textual attributes and the publication date.
+          #
+          # @return [Hash<Symbol, String, nil>]
+          def text_attributes(properties)
+            heading = title(properties)
+            summary = first_string(properties.delete(:description))
+            body = first_string(properties.delete(:articleBody))
+            abstract = first_string(properties.delete(:abstract))
+            { title: heading, description: summary, schema_object_body: body, abstract: abstract,
+              datePublished: published_at(properties) }
+          end
+          # Wraps the resolved URL in a hash.
+          #
+          # @return [Hash<Symbol, String, nil>]
+          def link_attributes(properties, identifier)
+            { url: url(properties, identifier) }
+          end
+          # Picks the image from `image` or `thumbnailUrl`, in that priority.
+          #
+          # @return [Hash<Symbol, Object>]
+          def media_attributes(properties)
+            primary = properties.delete(:image)
+            thumbnail = properties.delete(:thumbnailUrl)
+            { image: image_value(primary, thumbnail) }
+          end
+          # Resolves the URL from `url`, `mainEntityOfPage` or a URL-shaped identifier.
+          #
+          # @return [String, nil]
+          def url(properties, fallback_id)
+            explicit = properties.delete(:url)
+            main_entity = properties.delete(:mainEntityOfPage)
+            url_value(explicit, main_entity, url_fallback(fallback_id))
+          end
+          # Accepts the identifier as a URL only when it starts with `/` or an http(s) scheme.
+          #
+          # @return [String, nil]
+          def url_fallback(fallback_id)
+            candidate = first_string(fallback_id)
+            candidate if candidate && (candidate.start_with?('/') || candidate.match?(%r{\Ahttps?://}))
+          end
+          # Picks the first available date among the published, created, modified and upload dates.
+          #
+          # @return [String, nil]
+          def published_at(properties)
+            dates = %i[datePublished dateCreated dateModified uploadDate].map { |key| properties.delete(key) }
+            first_string(*dates)
+          end
+          # Copies category-like properties onto `object`, skipping the ones without a value.
+          #
+          # @return [Object, nil] result of the last assignment attempt.
+          def merge_categories!(object, properties)
+            sections = array_value(properties.delete(:categories), properties.delete(:articleSection))
+            assign_if_present(object, :categories, sections)
+            %i[keywords tags].each do |key|
+              assign_if_present(object, key, string_or_array(properties.delete(key)))
+            end
+            assign_if_present(object, :about, normalize_about(properties.delete(:about)))
+          end
+          # Writes `value` under `key` unless it is nil or false.
+          #
+          # @return [Object, nil]
+          def assign_if_present(object, key, value)
+            return unless value
+            object[key] = value
+          end
+        end
+        private_constant :SchemaObjectBuilder
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb ADDED Viewed

@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class Schema
+        ##
+        # Extracts categories from Schema.org structured data.
+        #
+        # Categories come from the `keywords`, `categories` and `tags` fields and from the names
+        # of the subjects listed under `about`.
+        module CategoryExtractor
+          ##
+          # Extracts categories from a schema object.
+          #
+          # @param schema_object [Hash] The schema object
+          # @return [Array<String>] Array of unique category strings
+          def self.call(schema_object)
+            # Build union of all category sources
+            field_categories = extract_field_categories(schema_object)
+            about_categories = extract_about_categories(schema_object)
+            (field_categories | about_categories).to_a
+          end
+          ##
+          # Extracts categories from keywords, categories, and tags fields.
+          #
+          # @param schema_object [Hash] The schema object
+          # @return [Set<String>] Set of category strings
+          def self.extract_field_categories(schema_object)
+            Set.new.tap do |categories|
+              %w[keywords categories tags].each do |field|
+                categories.merge(extract_field_value(schema_object, field))
+              end
+            end
+          end
+          ##
+          # Extracts categories from the about field.
+          #
+          # A single subject given as a Hash is handled like a one-element list, so
+          # `about: { name: 'Tech' }` contributes "Tech" instead of being ignored.
+          #
+          # @param schema_object [Hash] The schema object
+          # @return [Set<String>] Set of category strings
+          def self.extract_about_categories(schema_object)
+            about = schema_object[:about]
+            case about
+            when Array then extract_about_array(about)
+            when Hash then extract_about_array([about])
+            when String then extract_string_categories(about)
+            else Set.new
+            end
+          end
+          ##
+          # Extracts categories from a single field value.
+          #
+          # Array entries are stripped, and Hash entries contribute their `:name`, so they are
+          # not turned into `Hash#to_s` output.
+          #
+          # @param schema_object [Hash] The schema object
+          # @param field [String] The field name
+          # @return [Set<String>] Set of category strings
+          def self.extract_field_value(schema_object, field)
+            value = schema_object[field.to_sym]
+            case value
+            when Array then Set.new(value.filter_map { |item| category_name(item) })
+            when String then extract_string_categories(value)
+            else Set.new
+            end
+          end
+          ##
+          # Extracts categories from an about array.
+          #
+          # Hash entries contribute their `:name`; String entries are taken as they are;
+          # every other entry is ignored.
+          #
+          # @param about [Array] The about array
+          # @return [Set<String>] Set of category strings
+          def self.extract_about_array(about)
+            Set.new.tap do |categories|
+              about.each do |item|
+                if item.is_a?(Hash) && item[:name]
+                  categories.add(item[:name].to_s)
+                elsif item.is_a?(String)
+                  categories.add(item)
+                end
+              end
+            end
+          end
+          ##
+          # Extracts categories from a string by splitting on separators.
+          #
+          # @param string [String] The string to process
+          # @return [Set<String>] Set of category strings
+          def self.extract_string_categories(string)
+            Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
+          end
+          ##
+          # Reduces one array entry to a category name.
+          #
+          # @param item [Object] Hash entries are read via `:name`; anything else is stringified
+          # @return [String, nil] The stripped name, or nil when it is blank
+          def self.category_name(item)
+            raw = item.is_a?(Hash) ? item[:name] : item
+            text = raw.to_s.strip
+            text unless text.empty?
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/schema/item_list.rb CHANGED Viewed

@@ -17,12 +17,12 @@ module Html2rss
           def call
             hashes = [super]
-            return hashes if (elements = @schema_object[:itemListElement]).nil?
+            return hashes unless (elements = @schema_object[:itemListElement])
             elements = [elements] unless elements.is_a?(Array)
             elements.each do |schema_object|
-              hashes << ListItem.new(schema_object, url: @url).call
+              hashes << ListItem.new(schema_object, url: base_url || '').call
             end
             hashes

data/lib/html2rss/auto_source/scraper/schema/list_item.rb CHANGED Viewed

@@ -9,14 +9,14 @@ module Html2rss
         # @see https://schema.org/ListItem
         class ListItem < Thing
           def id =          (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
-          def title =       schema_object.dig(:item, :name) || super || (url ? Utils.titleized_url(url) : nil)
+          def title =       schema_object.dig(:item, :name) || super || url&.titleized
           def description = schema_object.dig(:item, :description) || super
-          # @return [Addressable::URI, nil]
+          # @return [Html2rss::Url, nil]
           def url
             url = schema_object.dig(:item, :url) || super
-            Utils.build_absolute_url_from_relative(url, @url) if url
+            Url.from_relative(url, base_url || url) if url
           end
         end
       end

data/lib/html2rss/auto_source/scraper/schema/thing.rb CHANGED Viewed

@@ -32,11 +32,11 @@ module Html2rss
             TechArticle
           ].to_set.freeze
-          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+          DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
           def initialize(schema_object, url:)
             @schema_object = schema_object
-            @url = url
+            @base_url = normalized_base_url(url)
           end
           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
@@ -49,7 +49,7 @@ module Html2rss
           def id
             return @id if defined?(@id)
-            id = (schema_object[:@id] || url&.path).to_s
+            id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
             return if id.empty?
@@ -63,7 +63,7 @@ module Html2rss
                          .max_by { |string| string.to_s.size }
           end
-          # @return [Addressable::URI, nil] the URL of the schema object
+          # @return [Html2rss::Url, nil] the URL of the schema object
           def url
             url = schema_object[:url]
             if url.to_s.empty?
@@ -71,20 +71,24 @@ module Html2rss
               return
             end
-            Utils.build_absolute_url_from_relative(url, @url)
+            Url.from_relative(url, base_url || url)
           end
           def image
             if (image_url = image_urls.first)
-              Utils.build_absolute_url_from_relative(image_url, @url)
+              Url.from_relative(image_url, base_url || image_url)
             end
           end
           def published_at = schema_object[:datePublished]
-          private
+          def categories
+            return @categories if defined?(@categories)
-          attr_reader :schema_object
+            @categories = CategoryExtractor.call(schema_object)
+          end
+          attr_reader :schema_object, :base_url
           def image_urls
             schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
@@ -97,6 +101,42 @@ module Html2rss
               end
             end
           end
+          def normalized_id(value, reference_url:)
+            text = value.to_s
+            return if text.empty?
+            normalized_url = normalized_id_url(text, reference_url:)
+            return text unless reference_url && normalized_url.host == reference_url.host
+            normalized_id_value(normalized_url)
+          rescue ArgumentError
+            text
+          end
+          def normalized_id_url(text, reference_url:)
+            if text.start_with?('/')
+              Url.from_relative(text, reference_url || text)
+            else
+              Url.from_absolute(text)
+            end
+          end
+          def normalized_id_value(url)
+            path = url.path.to_s
+            return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
+            return path unless path.empty?
+            url.query
+          end
+          def normalized_base_url(url)
+            return if url.to_s.strip.empty?
+            Url.from_absolute(url)
+          rescue ArgumentError
+            nil
+          end
         end
       end
     end

data/lib/html2rss/auto_source/scraper/schema.rb CHANGED Viewed

@@ -19,13 +19,16 @@ module Html2rss
         TAG_SELECTOR = 'script[type="application/ld+json"]'
+        def self.options_key = :schema
         class << self
           def articles?(parsed_body)
-            parsed_body.css(TAG_SELECTOR).any? do |script|
-              (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
-                script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
-              end
-            end
+            parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
+          end
+          def supported_schema_type?(script)
+            supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
+            supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
           end
           ##
@@ -63,7 +66,7 @@ module Html2rss
             elsif ItemList::SUPPORTED_TYPES.member?(type)
               ItemList
             else
-              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
+              Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
               nil
             end
           end
@@ -73,14 +76,15 @@ module Html2rss
           def parse_script_tag(script_tag)
             JSON.parse(script_tag.text, symbolize_names: true)
           rescue JSON::ParserError => error
-            Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+            Log.warn("#{name}: failed to parse JSON", error: error.message)
             []
           end
         end
-        def initialize(parsed_body, url:)
+        def initialize(parsed_body, url:, **opts)
           @parsed_body = parsed_body
           @url = url
+          @opts = opts
         end
         ##