html2rss 0.12.0 → 0.13.0

Files changed (32)
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +9 -6
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
  11. data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
  12. data/lib/html2rss/attribute_post_processors/template.rb +4 -4
  13. data/lib/html2rss/auto_source/article.rb +95 -0
  14. data/lib/html2rss/auto_source/channel.rb +79 -0
  15. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  16. data/lib/html2rss/auto_source/reducer.rb +48 -0
  17. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  18. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  19. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  20. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  23. data/lib/html2rss/auto_source/scraper.rb +33 -0
  24. data/lib/html2rss/auto_source.rb +77 -0
  25. data/lib/html2rss/cli.rb +10 -0
  26. data/lib/html2rss/config/channel.rb +4 -2
  27. data/lib/html2rss/config/selectors.rb +2 -2
  28. data/lib/html2rss/item.rb +8 -2
  29. data/lib/html2rss/utils.rb +5 -10
  30. data/lib/html2rss/version.rb +1 -1
  31. data/lib/html2rss.rb +21 -0
  32. metadata +29 -3
data/lib/html2rss/auto_source/scraper/schema/base.rb ADDED
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ require 'date'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class Schema
+         ##
+         # Base class for Schema.org schema_objects.
+         #
+         # @see https://schema.org/Article
+         class Base
+           DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+           def initialize(schema_object, url:)
+             @schema_object = schema_object
+             @url = url
+           end
+
+           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+           def call
+             DEFAULT_ATTRIBUTES.to_h do |attribute|
+               [attribute, public_send(attribute)]
+             end
+           end
+
+           def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+           def title = schema_object[:title]
+
+           def description
+             [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+               .max_by { |desc| desc.to_s.size }
+           end
+
+           # @return [Addressable::URI, nil] the URL of the schema object
+           def url
+             url = schema_object[:url]
+             if url.to_s.empty?
+               Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+               return
+             end
+
+             Utils.build_absolute_url_from_relative(url, @url)
+           end
+
+           def image = images.first || nil
+           def published_at = schema_object[:datePublished]
+
+           private
+
+           attr_reader :schema_object
+
+           def images
+             Array(schema_object[:image]).compact
+           end
+         end
+       end
+     end
+   end
+ end
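To illustrate the contract of `Base#call`: it maps the schema object's fields onto the `DEFAULT_ATTRIBUTES` keys. A minimal sketch with an invented schema object and URL (not part of the diff):

require 'html2rss'
require 'addressable'

schema_object = {
  :@type => 'Article',
  title: 'Example headline',
  description: 'A short summary.',
  url: '/articles/example',
  datePublished: '2024-01-01T12:00:00Z'
}

article = Html2rss::AutoSource::Scraper::Schema::Base.new(
  schema_object, url: Addressable::URI.parse('https://example.com')
).call
# article[:url]          => absolute https://example.com/articles/example
# article[:id]           => "/articles/example" (falls back to the URL path, since no @id is present)
# article[:published_at] => "2024-01-01T12:00:00Z" (passed through verbatim)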
data/lib/html2rss/auto_source/scraper/schema.rb ADDED
@@ -0,0 +1,122 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles from Schema.org objects, by looking for the objects in:
+       #
+       # 1. <script type="application/ld+json"> "schema" tag.
+       # 2. tbd
+       #
+       # See:
+       # 1. https://schema.org/NewsArticle
+       # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+       class Schema
+         include Enumerable
+
+         TAG_SELECTOR = 'script[type="application/ld+json"]'
+         SCHEMA_OBJECT_TYPES = %w[
+           AdvertiserContentArticle
+           AnalysisNewsArticle
+           APIReference
+           Article
+           AskPublicNewsArticle
+           BackgroundNewsArticle
+           BlogPosting
+           DiscussionForumPosting
+           LiveBlogPosting
+           NewsArticle
+           OpinionNewsArticle
+           Report
+           ReportageNewsArticle
+           ReviewNewsArticle
+           SatiricalArticle
+           ScholarlyArticle
+           SocialMediaPosting
+           TechArticle
+         ].to_set.freeze
+
+         class << self
+           def articles?(parsed_body)
+             parsed_body.css(TAG_SELECTOR).any? do |script|
+               SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+             end
+           end
+
+           ##
+           # Returns a flat array
+           # of all supported schema objects
+           # by recursively traversing the `from` object.
+           #
+           # @param object [Hash, Array]
+           # @return [Array<Hash>] the schema_objects, or an empty array
+           # :reek:DuplicateMethodCall
+           def from(object)
+             case object
+             when Nokogiri::XML::Element
+               from(parse_script_tag(object))
+             when Hash
+               supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+             when Array
+               object.flat_map { |item| from(item) }
+             else
+               []
+             end
+           end
+
+           def supported_schema_object?(object)
+             scraper_for_schema_object(object) ? true : false
+           end
+
+           ##
+           # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+           def scraper_for_schema_object(schema_object)
+             if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+               Base
+             else
+               Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+               nil
+             end
+           end
+
+           private
+
+           def parse_script_tag(script_tag)
+             JSON.parse(script_tag.text, symbolize_names: true)
+           rescue JSON::ParserError => error
+             Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+             []
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         ##
+         # @yield [Hash] Each scraped article_hash
+         # @return [Array<Hash>] the scraped article_hashes
+         def each(&)
+           schema_objects.filter_map do |schema_object|
+             next unless (klass = self.class.scraper_for_schema_object(schema_object))
+             next unless (article_hash = klass.new(schema_object, url:).call)
+
+             yield article_hash
+           end
+         end
+
+         private
+
+         def schema_objects
+           @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+             Schema.from(tag)
+           end
+         end
+
+         attr_reader :parsed_body, :url
+       end
+     end
+   end
+ end
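A small sketch of what `Schema.from` does with an already-parsed `ld+json` payload (hash invented for illustration): it recursively walks hashes and arrays and keeps only objects whose `@type` is listed in `SCHEMA_OBJECT_TYPES`, logging a warning for each unsupported one:

require 'html2rss'

payload = {
  :@context => 'https://schema.org',
  :@graph => [
    { :@type => 'WebSite', name: 'Example' },
    { :@type => 'NewsArticle', title: 'Supported article' }
  ]
}

Html2rss::AutoSource::Scraper::Schema.from(payload)
# => [{ :@type => 'NewsArticle', title: 'Supported article' }]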
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb ADDED
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # ArticleExtractor is responsible for extracting the details of an article.
+         # It focuses on finding a headline first, and from it traverses the DOM
+         # upwards as far as possible to find the other details.
+         class Extractor
+           INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+           HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+           NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                    INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+           def self.visible_text_from_tag(tag, separator: ' ')
+             text = if (children = tag.children).empty?
+                      tag.text.strip
+                    else
+                      children.filter_map do |child|
+                        next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                        visible_text_from_tag(child)
+                      end.join(separator)
+                    end
+
+             return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+             sanitized_text
+           end
+
+           def initialize(article_tag, url:)
+             @article_tag = article_tag
+             @url = url
+             @heading = find_heading
+             @extract_url = find_url
+           end
+
+           # @return [Hash, nil] The scraped article or nil.
+           def call
+             return unless heading
+
+             {
+               title: extract_title,
+               url: extract_url,
+               image: extract_image,
+               description: extract_description,
+               id: generate_id,
+               published_at: extract_published_at
+             }
+           end
+
+           private
+
+           attr_reader :article_tag, :url, :heading, :extract_url
+
+           def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+           # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+           def extract_published_at
+             times = article_tag.css('time[datetime]')
+                                .filter_map do |tag|
+                                  DateTime.parse(tag['datetime'])
+                                rescue ArgumentError, TypeError
+                                  nil
+                                end
+
+             times.min
+           end
+
+           def find_heading
+             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+             smallest_heading = heading_tags.keys.min
+             heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
+           end
+
+           def extract_title
+             @extract_title ||= if heading.children.empty? && heading.text
+                                  visible_text_from_tag(heading)
+                                else
+                                  visible_text_from_tag(
+                                    article_tag.css(HEADING_TAGS.join(','))
+                                               .max_by { |tag| tag.text.size }
+                                  )
+                                end
+           end
+
+           def extract_description
+             text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+             return text if text
+
+             description = visible_text_from_tag(article_tag)
+             return nil unless description
+
+             title_text = extract_title
+             description.gsub!(title_text, '') if title_text
+             description.strip!
+             description.empty? ? nil : description
+           end
+
+           def find_url
+             closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                 selector: 'a[href]:not([href=""])')
+             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+           end
+
+           def extract_image
+             Image.call(article_tag, url:)
+           end
+
+           def generate_id
+             [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path].compact.reject(&:empty?).first
+           end
+         end
+       end
+     end
+   end
+ end
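A rough usage sketch of the extractor's contract, with invented markup (`call` returns `nil` when no heading is found inside the tag):

require 'html2rss'
require 'addressable'

doc = Nokogiri.HTML(<<~HTML)
  <article id="post-1">
    <h2><a href="/posts/1">Post title</a></h2>
    <p>Teaser text of the post.</p>
    <time datetime="2024-05-01">May 1</time>
  </article>
HTML

Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.new(
  doc.at_css('article'), url: Addressable::URI.parse('https://example.com')
).call
# intended result: { title: "Post title", url: <https://example.com/posts/1>,
#                    id: "post-1", published_at: DateTime for 2024-05-01, ... }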
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb ADDED
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # Image is responsible for extracting image URLs from the article_tag.
+         class Image
+           def self.call(article_tag, url:)
+             img_src = from_source(article_tag) ||
+                       from_img(article_tag) ||
+                       from_style(article_tag)
+
+             Utils.build_absolute_url_from_relative(img_src, url) if img_src
+           end
+
+           def self.from_img(article_tag)
+             article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+           end
+
+           ##
+           # Extracts the largest image source from the srcset attribute
+           # of an img tag or a source tag inside a picture tag.
+           #
+           # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+           def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+             hash = article_tag.css('img[srcset], picture > source[srcset]')
+                               .flat_map { |source| source['srcset'].to_s.split(',') }
+                               .filter_map do |line|
+                                 width, url = line.split.reverse
+                                 next if url.nil? || url.start_with?('data:')
+
+                                 width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+                                 [width_value, url.strip]
+                               end.to_h
+
+             hash[hash.keys.max]
+           end
+
+           def self.from_style(article_tag)
+             article_tag.css('[style*="url"]')
+                        .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                        .reject { |src| !src || src.start_with?('data:') }
+                        .max_by(&:size)
+           end
+         end
+       end
+     end
+   end
+ end
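`from_source` builds a `{width => url}` hash from all `srcset` candidates and returns the URL keyed by the largest width. With an invented fragment:

require 'html2rss'

article_tag = Nokogiri.HTML('<div><img srcset="/img/small.jpg 480w, /img/large.jpg 1080w"></div>').at_css('div')
Html2rss::AutoSource::Scraper::SemanticHtml::Image.from_source(article_tag)
# => "/img/large.jpg" (the 1080 entry wins over 480)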
data/lib/html2rss/auto_source/scraper/semantic_html.rb ADDED
@@ -0,0 +1,118 @@
+ # frozen_string_literal: true
+
+ require 'addressable'
+ require 'parallel'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles by looking for common markup tags (article, section, li)
+       # containing an <a href> tag.
+       #
+       # See:
+       # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+       class SemanticHtml
+         include Enumerable
+
+         ##
+         # Map of parent element names to CSS selectors for finding <a href> tags.
+         ANCHOR_TAG_SELECTORS = {
+           'section' => ['section :not(section) a[href]'],
+           'tr' => ['table tr :not(tr) a[href]'],
+           'article' => [
+             'article :not(article) a[href]',
+             'article a[href]'
+           ],
+           'li' => [
+             'ul > li :not(li) a[href]',
+             'ol > li :not(li) a[href]'
+           ]
+         }.freeze
+
+         # Check if the parsed_body contains articles
+         # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
+         # @return [Boolean] True if articles are found, otherwise false.
+         def self.articles?(parsed_body)
+           return false unless parsed_body
+
+           ANCHOR_TAG_SELECTORS.each_value do |selectors|
+             return true if selectors.any? { |selector| parsed_body.at_css(selector) }
+           end
+           false
+         end
+
+         # Finds the closest ancestor tag matching the specified tag name
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param tag_name [String] The tag name to search for
+         # @param stop_tag [String] The tag name to stop searching at
+         # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
+         def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
+           return current_tag if current_tag.name == tag_name
+
+           stop_tags = Set[tag_name, stop_tag]
+
+           while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
+             current_tag = current_tag.parent
+           end
+
+           current_tag
+         end
+
+         # Finds the closest matching selector upwards in the DOM tree
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param selector [String] The CSS selector to search for
+         # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+         def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
+           current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+         end
+
+         # Helper method to find a matching selector upwards
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param selector [String] The CSS selector to search for
+         # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+         def self.find_closest_selector_upwards(current_tag, selector:)
+           while current_tag
+             found = current_tag.at_css(selector)
+             return found if found
+
+             return nil unless current_tag.respond_to?(:parent)
+
+             current_tag = current_tag.parent
+           end
+         end
+
+         # Returns an array of [tag_name, selector] pairs
+         # @return [Array<[String, String]>] Array of tag name and selector pairs
+         def self.anchor_tag_selector_pairs
+           ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
+             selectors.map { |selector| [tag_name, selector] }
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         attr_reader :parsed_body
+
+         ##
+         # @yieldparam [Hash] The scraped article hash
+         # @return [Enumerator] Enumerator for the scraped articles
+         def each
+           return enum_for(:each) unless block_given?
+
+           SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
+             parsed_body.css(selector).each do |selected_tag|
+               article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+               article_hash = Extractor.new(article_tag, url: @url).call
+
+               yield article_hash if article_hash
+             end
+           end
+         end
+       end
+     end
+   end
+ end
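Putting it together, a hypothetical run over a list-based page (markup invented; `each` without a block returns an Enumerator, which `AutoSource#articles` relies on below):

require 'html2rss'
require 'addressable'

doc = Nokogiri.HTML(<<~HTML)
  <ul>
    <li><h3><a href="/a">First</a></h3></li>
    <li><h3><a href="/b">Second</a></h3></li>
  </ul>
HTML

Html2rss::AutoSource::Scraper::SemanticHtml.articles?(doc) # => true

scraper = Html2rss::AutoSource::Scraper::SemanticHtml.new(doc, url: Addressable::URI.parse('https://example.com'))
scraper.each { |article_hash| p article_hash[:url] } # one hash per anchor found under a matching <li>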
data/lib/html2rss/auto_source/scraper.rb ADDED
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # The Scraper module contains all scrapers that can be used to extract articles.
+     # Each scraper should implement a `call` method that returns an array of article hashes.
+     # Each scraper should also implement an `articles?` method that returns true if the scraper
+     # can potentially be used to extract articles from the given HTML.
+     #
+     module Scraper
+       SCRAPERS = [
+         Schema,
+         SemanticHtml
+       ].freeze
+
+       ##
+       # Error raised when no suitable scraper is found.
+       class NoScraperFound < Html2rss::Error; end
+
+       ##
+       # Returns an array of scrapers that claim to find articles in the parsed body.
+       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+       # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
+       def self.from(parsed_body)
+         scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
+         raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
+
+         scrapers
+       end
+     end
+   end
+ end
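`Scraper.from` is the selection point: every scraper whose `articles?` probe succeeds is returned, and `NoScraperFound` is raised when none match. A sketch of a caller (the HTML string is a stand-in):

require 'html2rss'

parsed_body = Nokogiri.HTML('<article><h1><a href="/x">X</a></h1></article>')
begin
  Html2rss::AutoSource::Scraper.from(parsed_body)
  # => e.g. [Html2rss::AutoSource::Scraper::SemanticHtml]
rescue Html2rss::AutoSource::Scraper::NoScraperFound
  warn 'Neither Schema.org objects nor semantic HTML markup detected.'
end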
data/lib/html2rss/auto_source.rb ADDED
@@ -0,0 +1,77 @@
+ # frozen_string_literal: true
+
+ require 'nokogiri'
+ require 'parallel'
+ require 'addressable'
+
+ module Html2rss
+   ##
+   # The AutoSource class is responsible for extracting channel and articles
+   # from a given URL.
+   # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
+   # marking articles, e.g. schema, microdata, open graph, etc.
+   class AutoSource
+     class UnsupportedUrlScheme < Html2rss::Error; end
+     class NoArticlesFound < Html2rss::Error; end
+
+     SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+
+     def initialize(url)
+       unless url.is_a?(String) || url.is_a?(Addressable::URI)
+         raise ArgumentError,
+               'URL must be a String or Addressable::URI'
+       end
+
+       @url = Addressable::URI.parse(url)
+
+       raise ArgumentError, 'URL must be absolute' unless @url.absolute?
+       raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
+     end
+
+     def build
+       raise NoArticlesFound if articles.empty?
+
+       Reducer.call(articles, url:)
+       Cleanup.call(articles, url:, keep_different_domain: true)
+
+       Html2rss::AutoSource::RssBuilder.new(
+         channel:,
+         articles:
+       ).call
+     end
+
+     def articles
+       @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
+         instance = scraper.new(parsed_body, url:)
+
+         articles_in_thread = Parallel.map(instance.each) do |article_hash|
+           Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+
+           Article.new(**article_hash, scraper:)
+         end
+
+         Reducer.call(articles_in_thread, url:)
+
+         articles_in_thread
+       end
+     end
+
+     def channel
+       Channel.new(parsed_body, response:, url:, articles:)
+     end
+
+     private
+
+     attr_reader :url
+
+     def response
+       @response ||= Html2rss::Utils.request_url(url)
+     end
+
+     # Parses the HTML body of the response using Nokogiri.
+     # @return [Nokogiri::HTML::Document]
+     def parsed_body
+       @parsed_body ||= Nokogiri.HTML(response.body).freeze
+     end
+   end
+ end
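End to end (network sketch; `Reducer`, `Cleanup`, `Channel`, `Article` and `RssBuilder` are the new collaborators from this release, listed in the files above but not all shown in this excerpt):

require 'html2rss'

auto_source = Html2rss::AutoSource.new('https://example.com/blog')
rss = auto_source.build # an RSS feed object; raises NoArticlesFound if nothing was scraped
puts rss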
data/lib/html2rss/cli.rb CHANGED
@@ -2,8 +2,13 @@
 
  require_relative '../html2rss'
  require 'thor'
+ require 'addressable'
 
+ ##
+ # The Html2rss namespace / command line interface.
  module Html2rss
+   Log = Logger.new($stderr)
+
    ##
    # The Html2rss command line interface.
    class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
        params = options.to_h { |opt| opt.split('=', 2) }
        puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
      end
+
+     desc 'auto URL', 'automatically sources an RSS feed from the URL'
+     def auto(url)
+       puts Html2rss.auto_source(url)
+     end
   end
 end
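The new subcommand delegates to `Html2rss.auto_source` (added in `lib/html2rss.rb`, +21 above, not shown here). From the shell: `html2rss auto https://example.com`. The programmatic equivalent:

require 'html2rss'

puts Html2rss.auto_source('https://example.com') # the CLI prints the returned feed the same way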
data/lib/html2rss/config/channel.rb CHANGED
@@ -16,7 +16,7 @@ module Html2rss
    # @return [Set<String>] the required parameter names
    def self.required_params_for_config(config)
      config.each_with_object(Set.new) do |(_, value), required_params|
-       required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
+       required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
      end
    end
 
@@ -25,7 +25,9 @@ module Html2rss
    # @param params [Hash]
    def initialize(channel, params: {})
      raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
-     raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
+
+     url = channel[:url]
+     raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI)
 
      @config = process_params(channel, params.transform_keys(&:to_sym))
    end
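For context on the tightened regex: required params are `%<name>s` / `%<name>d` placeholders in channel values that callers must supply at build time. An illustrative call (values invented):

require 'html2rss'

config = { url: 'https://example.com/%<section>s', title: 'Example %<section>s' }
Html2rss::Config::Channel.required_params_for_config(config)
# => #<Set: {"section"}>

Note that the character class `[s|d]` also admits a literal `|`; `[sd]` would express the intent more precisely.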
data/lib/html2rss/config/selectors.rb CHANGED
@@ -35,8 +35,8 @@ module Html2rss
 
      keywords = config[name].slice(*available_keys)
 
-     if (additional_keys = available_keys - keywords.keys).any?
-       warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
+     if (additional_keys = keywords.keys - available_keys).any?
+       Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
      end
 
      Selector.new(keywords)
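The fix flips the direction of the set difference: the old code reported keys that were absent from the selector as "additional". In isolation:

available_keys = %i[selector post_process]
keywords       = { selector: 'a', unknown_key: 1 }

available_keys - keywords.keys # => [:post_process] (old: actually the missing keys)
keywords.keys - available_keys # => [:unknown_key]  (new: the genuinely additional keys)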
data/lib/html2rss/item.rb CHANGED
@@ -23,7 +23,8 @@ module Html2rss
    # @param config [Html2rss::Config] Configuration object.
    # @return [Array<Html2rss::Item>] list of items fetched.
    def self.from_url(url, config)
-     body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
+     body = Utils.request_url(url, headers: config.headers).body
+     body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
 
      Nokogiri.HTML(body)
        .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
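The JSON-to-XML conversion that `request_body_from_url` previously performed internally now happens at the call site, via the gem's `ObjectToXmlConverter`. A hedged sketch of the idea (the exact XML shape emitted by the converter is not shown in this diff):

require 'html2rss'
require 'json'

body = '{"items":[{"title":"Hello"}]}'
xml  = Html2rss::ObjectToXmlConverter.new(JSON.parse(body)).call
# xml now contains element markup derived from the object keys,
# so the config's CSS selectors can address it like any HTML body.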
@@ -47,6 +48,7 @@ module Html2rss
    # @param method_name [Symbol]
    # @param _include_private [true, false]
    # @return [true, false]
+   # :reek:BooleanParameter { enabled: false }
    def respond_to_missing?(method_name, _include_private = false)
      config.selector?(method_name) || super
    end
@@ -110,7 +112,11 @@ module Html2rss
    #
    # @return [Array<String>] list of categories.
    def categories
-     config.category_selector_names.map { |method_name| public_send(method_name) }
+     config.category_selector_names
+           .filter_map do |method_name|
+             category = public_send(method_name)
+             category.strip unless category.to_s.empty?
+           end.uniq
    end
 
    ##
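The effect of the reworked `categories`: blank values are dropped, whitespace trimmed, duplicates removed. In isolation:

['Ruby ', '', 'Ruby', nil].filter_map { |category| category.strip unless category.to_s.empty? }.uniq
# => ["Ruby"]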