html2rss 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/README.md +39 -11
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +9 -6
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
  11. data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
  12. data/lib/html2rss/attribute_post_processors/template.rb +4 -4
  13. data/lib/html2rss/auto_source/article.rb +95 -0
  14. data/lib/html2rss/auto_source/channel.rb +85 -0
  15. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  16. data/lib/html2rss/auto_source/reducer.rb +48 -0
  17. data/lib/html2rss/auto_source/rss_builder.rb +70 -0
  18. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  19. data/lib/html2rss/auto_source/scraper/schema.rb +128 -0
  20. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  23. data/lib/html2rss/auto_source/scraper.rb +33 -0
  24. data/lib/html2rss/auto_source.rb +80 -0
  25. data/lib/html2rss/cli.rb +10 -0
  26. data/lib/html2rss/config/channel.rb +4 -2
  27. data/lib/html2rss/config/selectors.rb +2 -2
  28. data/lib/html2rss/config.rb +1 -4
  29. data/lib/html2rss/item.rb +9 -3
  30. data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
  31. data/lib/html2rss/utils.rb +11 -10
  32. data/lib/html2rss/version.rb +1 -1
  33. data/lib/html2rss.rb +27 -11
  34. metadata +30 -4
data/lib/html2rss/auto_source/scraper/schema/base.rb ADDED
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ require 'date'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class Schema
+         ##
+         # Base class for Schema.org schema_objects.
+         #
+         # @see https://schema.org/Article
+         class Base
+           DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+           def initialize(schema_object, url:)
+             @schema_object = schema_object
+             @url = url
+           end
+
+           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+           def call
+             DEFAULT_ATTRIBUTES.to_h do |attribute|
+               [attribute, public_send(attribute)]
+             end
+           end
+
+           def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+           def title = schema_object[:title]
+
+           def description
+             [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+               .max_by { |desc| desc.to_s.size }
+           end
+
+           # @return [Addressable::URI, nil] the URL of the schema object
+           def url
+             url = schema_object[:url]
+             if url.to_s.empty?
+               Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+               return
+             end
+
+             Utils.build_absolute_url_from_relative(url, @url)
+           end
+
+           def image = images.first || nil
+           def published_at = schema_object[:datePublished]
+
+           private
+
+           attr_reader :schema_object
+
+           def images
+             Array(schema_object[:image]).compact
+           end
+         end
+       end
+     end
+   end
+ end
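
`Base#call` maps each of the `DEFAULT_ATTRIBUTES` onto a public method of the same name. A minimal usage sketch; the schema_object hash and base URL below are invented for illustration:

    require 'addressable'

    schema_object = {
      :@type => 'Article',
      title: 'Example headline',
      description: 'A short teaser.',
      url: '/articles/example',
      datePublished: '2024-06-01'
    }

    base = Html2rss::AutoSource::Scraper::Schema::Base.new(
      schema_object,
      url: Addressable::URI.parse('https://example.com')
    )
    base.call
    # => :url resolves to https://example.com/articles/example,
    #    :id falls back to the URL path ('/articles/example') since there is no :@id,
    #    :image is nil (no :image key), :published_at is '2024-06-01'
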
data/lib/html2rss/auto_source/scraper/schema.rb ADDED
@@ -0,0 +1,128 @@
+ # frozen_string_literal: true
+
+ require 'json'
+ require 'nokogiri'
+ require 'set'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles from Schema.org objects, by looking for the objects in:
+       #
+       # 1. <script type="application/ld+json"> "schema" tag.
+       # 2. tbd
+       #
+       # See:
+       # 1. https://schema.org/NewsArticle
+       # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+       class Schema
+         include Enumerable
+
+         TAG_SELECTOR = 'script[type="application/ld+json"]'
+         SCHEMA_OBJECT_TYPES = %w[
+           AdvertiserContentArticle
+           AnalysisNewsArticle
+           APIReference
+           Article
+           AskPublicNewsArticle
+           BackgroundNewsArticle
+           BlogPosting
+           DiscussionForumPosting
+           LiveBlogPosting
+           NewsArticle
+           OpinionNewsArticle
+           Report
+           ReportageNewsArticle
+           ReviewNewsArticle
+           SatiricalArticle
+           ScholarlyArticle
+           SocialMediaPosting
+           TechArticle
+         ].to_set.freeze
+
+         class << self
+           def articles?(parsed_body)
+             parsed_body.css(TAG_SELECTOR).any? do |script|
+               SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+             end
+           end
+
+           ##
+           # Returns a flat array
+           # of all supported schema objects
+           # by recursively traversing the `from` object.
+           #
+           # @param object [Hash, Array]
+           # @return [Array<Hash>] the schema_objects, or an empty array
+           # :reek:DuplicateMethodCall
+           def from(object)
+             case object
+             when Nokogiri::XML::Element
+               from(parse_script_tag(object))
+             when Hash
+               supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+             when Array
+               object.flat_map { |item| from(item) }
+             else
+               []
+             end
+           end
+
+           def supported_schema_object?(object)
+             scraper_for_schema_object(object) ? true : false
+           end
+
+           ##
+           # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+           def scraper_for_schema_object(schema_object)
+             if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+               Base
+             else
+               Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+               nil
+             end
+           end
+
+           private
+
+           def parse_script_tag(script_tag)
+             JSON.parse(script_tag.text, symbolize_names: true)
+           rescue JSON::ParserError => error
+             Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+             []
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         ##
+         # @yield [Hash] Each scraped article_hash
+         # @return [Array<Hash>] the scraped article_hashes
+         def each(&)
+           return enum_for(:each) unless block_given?
+
+           schema_objects.filter_map do |schema_object|
+             next unless (klass = self.class.scraper_for_schema_object(schema_object))
+             next unless (article_hash = klass.new(schema_object, url:).call)
+
+             yield article_hash
+           end
+         end
+
+         private
+
+         def schema_objects
+           @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+             Schema.from(tag)
+           end
+         end
+
+         attr_reader :parsed_body, :url
+       end
+     end
+   end
+ end
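
`Schema.from` recursively walks any JSON-LD structure and keeps only the hashes whose `@type` is in `SCHEMA_OBJECT_TYPES`; unsupported hashes are traversed into (emitting a `Log.warn` each). A sketch with an invented payload:

    payload = {
      :@context => 'https://schema.org',
      :@type => 'ItemList',
      itemListElement: [
        { :@type => 'NewsArticle', title: 'A' },
        [{ :@type => 'BlogPosting', title: 'B' }]
      ]
    }

    Html2rss::AutoSource::Scraper::Schema.from(payload)
    # => [{ :@type => 'NewsArticle', title: 'A' },
    #     { :@type => 'BlogPosting', title: 'B' }]
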
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb ADDED
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+
+ require 'set'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # ArticleExtractor is responsible for extracting the details of an article.
+         # It focuses on finding a headline first, and from it traverses the DOM
+         # upwards as far as possible to find the other details.
+         class Extractor
+           INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+           HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+           NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                    INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+           def self.visible_text_from_tag(tag, separator: ' ')
+             text = if (children = tag.children).empty?
+                      tag.text.strip
+                    else
+                      children.filter_map do |child|
+                        next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                        visible_text_from_tag(child)
+                      end.join(separator)
+                    end
+
+             return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+             sanitized_text
+           end
+
+           def initialize(article_tag, url:)
+             @article_tag = article_tag
+             @url = url
+             @heading = find_heading
+             @extract_url = find_url
+           end
+
+           # @return [Hash, nil] The scraped article or nil.
+           def call
+             return unless heading
+
+             {
+               title: extract_title,
+               url: extract_url,
+               image: extract_image,
+               description: extract_description,
+               id: generate_id,
+               published_at: extract_published_at
+             }
+           end
+
+           private
+
+           attr_reader :article_tag, :url, :heading, :extract_url
+
+           def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+           # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+           def extract_published_at
+             times = article_tag.css('time[datetime]')
+                                .filter_map do |tag|
+               DateTime.parse(tag['datetime'])
+             rescue ArgumentError, TypeError
+               nil
+             end
+
+             times.min
+           end
+
+           def find_heading
+             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+             smallest_heading = heading_tags.keys.min
+             heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
+           end
+
+           def extract_title
+             @extract_title ||= if heading.children.empty? && heading.text
+                                  visible_text_from_tag(heading)
+                                else
+                                  visible_text_from_tag(
+                                    article_tag.css(HEADING_TAGS.join(','))
+                                               .max_by { |tag| tag.text.size }
+                                  )
+                                end
+           end
+
+           def extract_description
+             text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+             return text if text
+
+             description = visible_text_from_tag(article_tag)
+             return nil unless description
+
+             title_text = extract_title
+             description.gsub!(title_text, '') if title_text
+             description.strip!
+             description.empty? ? nil : description
+           end
+
+           def find_url
+             closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                 selector: 'a[href]:not([href=""])')
+             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+           end
+
+           def extract_image
+             Image.call(article_tag, url:)
+           end
+
+           def generate_id
+             [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path].compact.reject(&:empty?).first
+           end
+         end
+       end
+     end
+   end
+ end
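
End to end, the extractor anchors on the smallest heading level it can find, then resolves the title, link, image, description and timestamp relative to it. A sketch on an invented fragment:

    require 'nokogiri'
    require 'addressable'

    html = <<~HTML
      <article>
        <h2><a href="/posts/42#comments">Hello world</a></h2>
        <p>Teaser text.</p>
        <time datetime="2024-06-01T10:00:00Z">June 1</time>
      </article>
    HTML

    article_tag = Nokogiri::HTML(html).at_css('article')
    Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.new(
      article_tag, url: Addressable::URI.parse('https://example.com')
    ).call
    # => title: "Hello world", url: https://example.com/posts/42 (fragment stripped),
    #    published_at: the parsed <time> value, description: the non-headline text
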
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb ADDED
@@ -0,0 +1,54 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       class SemanticHtml
+         ##
+         # Image is responsible for extracting image URLs from the article_tag.
+         class Image
+           def self.call(article_tag, url:)
+             img_src = from_source(article_tag) ||
+                       from_img(article_tag) ||
+                       from_style(article_tag)
+
+             Utils.build_absolute_url_from_relative(img_src, url) if img_src
+           end
+
+           def self.from_img(article_tag)
+             article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+           end
+
+           ##
+           # Extracts the largest image source from the srcset attribute
+           # of an img tag or a source tag inside a picture tag.
+           #
+           # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+           def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+             hash = article_tag.css('img[srcset], picture > source[srcset]')
+                               .flat_map { |source| source['srcset'].to_s.split(',') }
+                               .filter_map do |line|
+               width, url = line.split.reverse
+               next if url.nil? || url.start_with?('data:')
+
+               width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+               [width_value, url.strip]
+             end.to_h
+
+             hash[hash.keys.max]
+           end
+
+           def self.from_style(article_tag)
+             article_tag.css('[style*="url"]')
+                        .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                        .reject { |src| !src || src.start_with?('data:') }
+                        .max_by(&:size)
+           end
+         end
+       end
+     end
+   end
+ end
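
`from_source` keys each srcset candidate by its numeric width descriptor and returns the URL with the largest key. A sketch with invented markup:

    require 'nokogiri'

    html = '<article><img srcset="small.jpg 480w, large.jpg 1024w"></article>'
    article_tag = Nokogiri::HTML(html).at_css('article')

    Html2rss::AutoSource::Scraper::SemanticHtml::Image.from_source(article_tag)
    # => "large.jpg" — 1024 is the largest width key in the parsed hash
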
data/lib/html2rss/auto_source/scraper/semantic_html.rb ADDED
@@ -0,0 +1,118 @@
+ # frozen_string_literal: true
+
+ require 'addressable'
+ require 'parallel'
+
+ module Html2rss
+   class AutoSource
+     module Scraper
+       ##
+       # Scrapes articles by looking for common markup tags (article, section, li)
+       # containing an <a href> tag.
+       #
+       # See:
+       # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+       class SemanticHtml
+         include Enumerable
+
+         ##
+         # Map of parent element names to CSS selectors for finding <a href> tags.
+         ANCHOR_TAG_SELECTORS = {
+           'section' => ['section :not(section) a[href]'],
+           'tr' => ['table tr :not(tr) a[href]'],
+           'article' => [
+             'article :not(article) a[href]',
+             'article a[href]'
+           ],
+           'li' => [
+             'ul > li :not(li) a[href]',
+             'ol > li :not(li) a[href]'
+           ]
+         }.freeze
+
+         # Check if the parsed_body contains articles
+         # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
+         # @return [Boolean] True if articles are found, otherwise false.
+         def self.articles?(parsed_body)
+           return false unless parsed_body
+
+           ANCHOR_TAG_SELECTORS.each_value do |selectors|
+             return true if selectors.any? { |selector| parsed_body.at_css(selector) }
+           end
+           false
+         end
+
+         # Finds the closest ancestor tag matching the specified tag name
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param tag_name [String] The tag name to search for
+         # @param stop_tag [String] The tag name to stop searching at
+         # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
+         def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
+           return current_tag if current_tag.name == tag_name
+
+           stop_tags = Set[tag_name, stop_tag]
+
+           while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
+             current_tag = current_tag.parent
+           end
+
+           current_tag
+         end
+
+         # Finds the closest matching selector upwards in the DOM tree
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param selector [String] The CSS selector to search for
+         # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+         def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
+           current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+         end
+
+         # Helper method to find a matching selector upwards
+         # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+         # @param selector [String] The CSS selector to search for
+         # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+         def self.find_closest_selector_upwards(current_tag, selector:)
+           while current_tag
+             found = current_tag.at_css(selector)
+             return found if found
+
+             return nil unless current_tag.respond_to?(:parent)
+
+             current_tag = current_tag.parent
+           end
+         end
+
+         # Returns an array of [tag_name, selector] pairs
+         # @return [Array<[String, String]>] Array of tag name and selector pairs
+         def self.anchor_tag_selector_pairs
+           ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
+             selectors.map { |selector| [tag_name, selector] }
+           end
+         end
+
+         def initialize(parsed_body, url:)
+           @parsed_body = parsed_body
+           @url = url
+         end
+
+         attr_reader :parsed_body
+
+         ##
+         # @yieldparam [Hash] The scraped article hash
+         # @return [Enumerator] Enumerator for the scraped articles
+         def each
+           return enum_for(:each) unless block_given?
+
+           SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
+             parsed_body.css(selector).each do |selected_tag|
+               article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+               article_hash = Extractor.new(article_tag, url: @url).call
+
+               yield article_hash if article_hash
+             end
+           end
+         end
+       end
+     end
+   end
+ end
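
`find_closest_selector` first looks for the selector among the current tag's descendants, then walks the ancestor chain, searching each ancestor's subtree in turn. A sketch with invented markup:

    require 'nokogiri'

    html = '<ul><li><span class="t">Title</span> <a href="/item-1">more</a></li></ul>'
    span = Nokogiri::HTML(html).at_css('span.t')

    anchor = Html2rss::AutoSource::Scraper::SemanticHtml.find_closest_selector(span)
    anchor['href'] # => "/item-1" (found on the parent <li>, not inside the <span>)
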
data/lib/html2rss/auto_source/scraper.rb ADDED
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class AutoSource
+     ##
+     # The Scraper module contains all scrapers that can be used to extract articles.
+     # Each scraper should implement a `call` method that returns an array of article hashes.
+     # Each scraper should also implement an `articles?` method that returns true if the scraper
+     # can potentially be used to extract articles from the given HTML.
+     #
+     module Scraper
+       SCRAPERS = [
+         Schema,
+         SemanticHtml
+       ].freeze
+
+       ##
+       # Error raised when no suitable scraper is found.
+       class NoScraperFound < Html2rss::Error; end
+
+       ##
+       # Returns an array of scrapers that claim to find articles in the parsed body.
+       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+       # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
+       def self.from(parsed_body)
+         scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
+         raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
+
+         scrapers
+       end
+     end
+   end
+ end
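
`Scraper.from` returns every scraper whose `articles?` check passes, so a page carrying both JSON-LD and semantic markup is handled by both. A sketch:

    require 'nokogiri'

    body = Nokogiri::HTML('<article><a href="/a">A</a></article>')
    Html2rss::AutoSource::Scraper.from(body)
    # => [Html2rss::AutoSource::Scraper::SemanticHtml]

    Html2rss::AutoSource::Scraper.from(Nokogiri::HTML('<p>plain page</p>'))
    # raises Html2rss::AutoSource::Scraper::NoScraperFound
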
data/lib/html2rss/auto_source.rb ADDED
@@ -0,0 +1,80 @@
+ # frozen_string_literal: true
+
+ require 'nokogiri'
+ require 'parallel'
+ require 'addressable'
+
+ module Html2rss
+   ##
+   # The AutoSource class is responsible for extracting channel and articles
+   # from a given URL.
+   # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
+   # marking articles, e.g. schema, microdata, open graph, etc.
+   class AutoSource
+     class UnsupportedUrlScheme < Html2rss::Error; end
+     class NoArticlesFound < Html2rss::Error; end
+
+     SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+
+     ##
+     # @param url [Addressable::URI] The URL to extract articles from.
+     # @param body [String] The body of the response.
+     # @param headers [Hash] The headers of the response.
+     def initialize(url, body:, headers: {})
+       raise ArgumentError, 'URL must be an Addressable::URI' unless url.is_a?(Addressable::URI)
+       raise ArgumentError, 'URL must be absolute' unless url.absolute?
+       raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
+
+       @url = url
+       @body = body
+       @headers = headers
+     end
+
+     def build
+       raise NoArticlesFound if articles.empty?
+
+       Reducer.call(articles, url:)
+       Cleanup.call(articles, url:, keep_different_domain: true)
+
+       channel.articles = articles
+
+       Html2rss::AutoSource::RssBuilder.new(
+         channel:,
+         articles:
+       ).call
+     end
+
+     def articles
+       @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
+         instance = scraper.new(parsed_body, url:)
+
+         articles_in_thread = Parallel.map(instance.each) do |article_hash|
+           Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+
+           Article.new(**article_hash, scraper:)
+         end
+
+         Reducer.call(articles_in_thread, url:)
+
+         articles_in_thread
+       end
+     end
+
+     def channel
+       @channel ||= Channel.new(parsed_body, headers: @headers, url:)
+     end
+
+     private
+
+     attr_reader :url
+
+     # @return [Nokogiri::HTML::Document]
+     def parsed_body
+       @parsed_body ||= Nokogiri.HTML(@body)
+                                .tap do |doc|
+         # Remove comments from the document
+         doc.xpath('//comment()').each(&:remove)
+       end.freeze
+     end
+   end
+ end
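
Putting it together: `AutoSource` takes an already-fetched response and turns it into RSS. A usage sketch; the HTTP fetch below is illustrative (any client returning the body string works):

    require 'html2rss'
    require 'addressable'
    require 'net/http'

    url = Addressable::URI.parse('https://example.com/blog')
    body = Net::HTTP.get(URI(url.to_s)) # illustrative fetch; headers omitted

    auto_source = Html2rss::AutoSource.new(url, body:)
    puts auto_source.build # RSS XML, or raises NoArticlesFound
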
data/lib/html2rss/cli.rb CHANGED
@@ -2,8 +2,13 @@
 
  require_relative '../html2rss'
  require 'thor'
+ require 'addressable'
 
+ ##
+ # The Html2rss namespace / command line interface.
  module Html2rss
+   Log = Logger.new($stderr)
+
    ##
    # The Html2rss command line interface.
    class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
        params = options.to_h { |opt| opt.split('=', 2) }
        puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
      end
+
+     desc 'auto URL', 'automatically sources an RSS feed from the URL'
+     def auto(url)
+       puts Html2rss.auto_source(url)
+     end
    end
  end
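
The new `auto` subcommand forwards the URL string straight to `Html2rss.auto_source`, so the same entry point is callable from Ruby. A sketch (placeholder URL):

    # Roughly what `html2rss auto https://example.com/news` does:
    require 'html2rss'
    puts Html2rss.auto_source('https://example.com/news')
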
data/lib/html2rss/config/channel.rb CHANGED
@@ -16,7 +16,7 @@ module Html2rss
    # @return [Set<String>] the required parameter names
    def self.required_params_for_config(config)
      config.each_with_object(Set.new) do |(_, value), required_params|
-       required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
+       required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
      end
    end
 
@@ -25,7 +25,9 @@ module Html2rss
    # @param params [Hash]
    def initialize(channel, params: {})
      raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
-     raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
+
+     url = channel[:url]
+     raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI)
 
      @config = process_params(channel, params.transform_keys(&:to_sym))
    end
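
Two behavioural notes on these hunks, sketched below with invented values. The tightened regex now requires an explicit conversion character after the placeholder (note that the character class `[s|d]` matches `s`, `d`, or a literal `|`), and the `:url` guard now accepts an `Addressable::URI` as well as a `String`:

    '%<section>s page %<page>d'.scan(/%<(\w+)>[s|d]/).flatten
    # => ["section", "page"]
    '%<section>'.scan(/%<(\w+)>[s|d]/).flatten
    # => [] — bare placeholders without a conversion are no longer counted

    # Both channel hashes now pass the :url check:
    { url: 'https://example.com' }
    { url: Addressable::URI.parse('https://example.com') }
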
data/lib/html2rss/config/selectors.rb CHANGED
@@ -35,8 +35,8 @@ module Html2rss
 
      keywords = config[name].slice(*available_keys)
 
-     if (additional_keys = available_keys - keywords.keys).any?
-       warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
+     if (additional_keys = keywords.keys - available_keys).any?
+       Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
      end
 
      Selector.new(keywords)
data/lib/html2rss/config.rb CHANGED
@@ -18,9 +18,6 @@ module Html2rss
    # Thrown when the feed config does not contain a value at `:channel`.
    class ChannelMissing < Html2rss::Error; end
 
-   # Struct to store XML Stylesheet attributes
-   Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
-
    def_delegator :@channel, :author
    def_delegator :@channel, :ttl
    def_delegator :@channel, :title
@@ -75,7 +72,7 @@
    #
    # @return [Array<Stylesheet>] Array of Stylesheet structs.
    def stylesheets
-     @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
+     @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
    end
 
    # Provides read-only access to the channel object.
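
The `Stylesheet` struct now lives at `Html2rss::RssBuilder::Stylesheet` and is built via a keyword splat, so each attributes hash must use keyword keys. A sketch, assuming the new class keeps the `href`/`type`/`media` attributes of the removed struct:

    attributes = { href: '/style.xsl', type: 'text/xsl', media: 'all' }
    Html2rss::RssBuilder::Stylesheet.new(**attributes)
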