html2rss 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +74 -0
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
  11. data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
  12. data/lib/html2rss/attribute_post_processors/template.rb +19 -11
  13. data/lib/html2rss/attribute_post_processors.rb +8 -0
  14. data/lib/html2rss/auto_source/article.rb +95 -0
  15. data/lib/html2rss/auto_source/channel.rb +79 -0
  16. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  17. data/lib/html2rss/auto_source/reducer.rb +48 -0
  18. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  19. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  20. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  23. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  24. data/lib/html2rss/auto_source/scraper.rb +33 -0
  25. data/lib/html2rss/auto_source.rb +77 -0
  26. data/lib/html2rss/cli.rb +10 -0
  27. data/lib/html2rss/config/channel.rb +4 -2
  28. data/lib/html2rss/config/selectors.rb +13 -2
  29. data/lib/html2rss/item.rb +8 -2
  30. data/lib/html2rss/utils.rb +5 -10
  31. data/lib/html2rss/version.rb +1 -1
  32. data/lib/html2rss.rb +21 -0
  33. metadata +30 -3
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable'
4
+ require 'parallel'
5
+
6
+ module Html2rss
7
+ class AutoSource
8
+ module Scraper
9
+ ##
10
+ # Scrapes articles by looking for common markup tags (article, section, li)
11
+ # containing an <a href> tag.
12
+ #
13
+ # See:
14
+ # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
15
+ class SemanticHtml
16
+ include Enumerable
17
+
18
+ ##
19
+ # Maps a container tag name to the CSS selectors that find <a href> tags nested inside such a container; #each passes the key to find_tag_in_ancestors for every anchor its selectors match.
20
+ ANCHOR_TAG_SELECTORS = {
21
+ 'section' => ['section :not(section) a[href]'],
22
+ 'tr' => ['table tr :not(tr) a[href]'],
23
+ 'article' => [
24
+ 'article :not(article) a[href]',
25
+ 'article a[href]'
26
+ ],
27
+ 'li' => [
28
+ 'ul > li :not(li) a[href]',
29
+ 'ol > li :not(li) a[href]'
30
+ ]
31
+ }.freeze
32
+
33
+ # Checks whether parsed_body contains an element matched by any selector in ANCHOR_TAG_SELECTORS.
34
+ # @param parsed_body [Nokogiri::HTML::Document, nil] The parsed HTML document
35
+ # @return [Boolean] true on the first selector match; false if nothing matches or parsed_body is nil/false.
36
+ def self.articles?(parsed_body)
37
+ return false unless parsed_body
38
+
39
+ ANCHOR_TAG_SELECTORS.each_value do |selectors|
40
+ return true if selectors.any? { |selector| parsed_body.at_css(selector) }
41
+ end
42
+ false
43
+ end
44
+
45
+ # Walks up from current_tag to the closest ancestor (or current_tag itself) named tag_name.
46
+ # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
47
+ # @param tag_name [String] The tag name to search for
48
+ # @param stop_tag [String] The tag name at which the upward walk gives up
49
+ # @return [Nokogiri::XML::Node] current_tag if it matches; otherwise whatever the walk stopped at: the tag_name ancestor, the stop_tag node, or the first object that does not respond to #parent
50
+ def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
51
+ return current_tag if current_tag.name == tag_name
52
+
53
+ stop_tags = Set[tag_name, stop_tag]
54
+
55
+ while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
56
+ current_tag = current_tag.parent
57
+ end
58
+
59
+ current_tag
60
+ end
61
+
62
+ # Finds the first element matching selector inside current_tag, falling back to a search upwards in the DOM tree
63
+ # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
64
+ # @param selector [String] The CSS selector to search for (default: an <a> tag with a non-empty href)
65
+ # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
66
+ def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
67
+ current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
68
+ end
69
+
70
+ # Helper: runs at_css(selector) on current_tag and then on each successive parent until something matches
71
+ # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
72
+ # @param selector [String] The CSS selector to search for
73
+ # @return [Nokogiri::XML::Node, nil] The first match found, or nil once the parent chain is exhausted
74
+ def self.find_closest_selector_upwards(current_tag, selector:)
75
+ while current_tag
76
+ found = current_tag.at_css(selector)
77
+ return found if found
78
+
79
+ return nil unless current_tag.respond_to?(:parent)
80
+
81
+ current_tag = current_tag.parent
82
+ end
83
+ end
84
+
85
+ # Flattens ANCHOR_TAG_SELECTORS into [tag_name, selector] pairs, one pair per selector
86
+ # @return [Array<[String, String]>] Array of tag name and selector pairs
87
+ def self.anchor_tag_selector_pairs
88
+ ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
89
+ selectors.map { |selector| [tag_name, selector] }
90
+ end
91
+ end
92
+
93
+ def initialize(parsed_body, url:) # parsed_body is the document #each searches with the anchor selectors
94
+ @parsed_body = parsed_body
95
+ @url = url # handed to Extractor.new in #each
96
+ end
97
+
98
+ attr_reader :parsed_body
99
+
100
+ ##
101
+ # @yieldparam [Hash] The article hash extracted for a matched anchor (falsy Extractor results are skipped)
102
+ # @return [Enumerator] Enumerator for the scraped articles, returned when no block is given
103
+ def each
104
+ return enum_for(:each) unless block_given?
105
+
106
+ SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
107
+ parsed_body.css(selector).each do |selected_tag| # NOTE(review): runs once per matching anchor, so a container holding several anchors is extracted several times; presumably de-duplicated downstream, confirm
108
+ article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name) # container element the anchor belongs to
109
+ article_hash = Extractor.new(article_tag, url: @url).call
110
+
111
+ yield article_hash if article_hash
112
+ end
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
118
+ end
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ ##
6
+ # The Scraper module contains all scrapers that can be used to extract articles.
7
+ # Each scraper is instantiated with `new(parsed_body, url:)` and should implement an `each` method that yields article hashes.
8
+ # Each scraper should also implement an `articles?` method that returns true if the scraper
9
+ # can potentially be used to extract articles from the given HTML.
10
+ #
11
+ module Scraper
12
+ SCRAPERS = [
13
+ Schema,
14
+ SemanticHtml
15
+ ].freeze
16
+
17
+ ##
18
+ # Error raised when no suitable scraper is found.
19
+ class NoScraperFound < Html2rss::Error; end
20
+
21
+ ##
22
+ # Returns the scrapers from SCRAPERS whose `articles?` is truthy for the parsed body.
23
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
24
+ # @return [Array<Class>] An array of scraper classes that can handle the parsed body; raises NoScraperFound when that array would be empty.
25
+ def self.from(parsed_body)
26
+ scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
27
+ raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
28
+
29
+ scrapers
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'parallel'
5
+ require 'addressable'
6
+
7
+ module Html2rss
8
+ ##
9
+ # The AutoSource class is responsible for extracting channel and articles
10
+ # from a given URL.
11
+ # It uses a set of scrapers (see Scraper::SCRAPERS) to extract articles, utilizing popular ways of
12
+ # marking articles, e.g. schema, microdata, open graph, etc.
13
+ class AutoSource
14
+ class UnsupportedUrlScheme < Html2rss::Error; end
15
+ class NoArticlesFound < Html2rss::Error; end
16
+
17
+ SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
18
+
19
+ def initialize(url) # url: String or Addressable::URI; must be absolute and use http or https
20
+ unless url.is_a?(String) || url.is_a?(Addressable::URI)
21
+ raise ArgumentError,
22
+ 'URL must be a String or Addressable::URI'
23
+ end
24
+
25
+ @url = Addressable::URI.parse(url)
26
+
27
+ raise ArgumentError, 'URL must be absolute' unless @url.absolute?
28
+ raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
29
+ end
30
+
31
+ def build # Builds the feed via RssBuilder; raises NoArticlesFound when no articles were scraped
32
+ raise NoArticlesFound if articles.empty?
33
+
34
+ Reducer.call(articles, url:) # NOTE(review): return value is discarded; presumably Reducer mutates `articles` in place, confirm
35
+ Cleanup.call(articles, url:, keep_different_domain: true) # NOTE(review): return value is discarded here as well; confirm Cleanup works in place
36
+
37
+ Html2rss::AutoSource::RssBuilder.new(
38
+ channel:,
39
+ articles:
40
+ ).call
41
+ end
42
+
43
+ def articles # Memoized Article list built from the output of every scraper returned by Scraper.from
44
+ @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
45
+ instance = scraper.new(parsed_body, url:)
46
+
47
+ articles_in_thread = Parallel.map(instance.each) do |article_hash| # NOTE(review): despite the variable name, Parallel.map uses processes unless in_threads: is given; confirm intent
48
+ Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
49
+
50
+ Article.new(**article_hash, scraper:)
51
+ end
52
+
53
+ Reducer.call(articles_in_thread, url:) # NOTE(review): return value is discarded; presumably reduces in place, confirm
54
+
55
+ articles_in_thread
56
+ end
57
+ end
58
+
59
+ def channel # Builds a new Channel on every call (not memoized)
60
+ Channel.new(parsed_body, response:, url:, articles:)
61
+ end
62
+
63
+ private
64
+
65
+ attr_reader :url
66
+
67
+ def response # Memoized result of Html2rss::Utils.request_url for the URL
68
+ @response ||= Html2rss::Utils.request_url(url)
69
+ end
70
+
71
+ # Parses the HTML body of the response using Nokogiri; the document is memoized and frozen.
72
+ # @return [Nokogiri::HTML::Document] the frozen, parsed document
73
+ def parsed_body
74
+ @parsed_body ||= Nokogiri.HTML(response.body).freeze
75
+ end
76
+ end
77
+ end
data/lib/html2rss/cli.rb CHANGED
@@ -2,8 +2,13 @@
2
2
 
3
3
  require_relative '../html2rss'
4
4
  require 'thor'
5
+ require 'addressable'
5
6
 
7
+ ##
8
+ # The Html2rss namespace / command line interface.
6
9
  module Html2rss
10
+ Log = Logger.new($stderr) # NOTE(review): lib/html2rss.rb (required above) already defines Html2rss::Log with a level and formatter; this reassigns the constant and the new logger gets neither, confirm intent
11
+
7
12
  ##
8
13
  # The Html2rss command line interface.
9
14
  class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
25
30
  params = options.to_h { |opt| opt.split('=', 2) }
26
31
  puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
27
32
  end
33
+
34
+ desc 'auto URL', 'automatically sources an RSS feed from the URL'
35
+ def auto(url) # Prints the feed returned by Html2rss.auto_source(url) to stdout
36
+ puts Html2rss.auto_source(url)
37
+ end
28
38
  end
29
39
  end
@@ -16,7 +16,7 @@ module Html2rss
16
16
  # @return [Set<String>] the required parameter names
17
17
  def self.required_params_for_config(config)
18
18
  config.each_with_object(Set.new) do |(_, value), required_params|
19
- required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
19
+ required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
20
20
  end
21
21
  end
22
22
 
@@ -25,7 +25,9 @@ module Html2rss
25
25
  # @param params [Hash]
26
26
  def initialize(channel, params: {})
27
27
  raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
28
- raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
28
+
29
+ url = channel[:url]
30
+ raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI) # also raised when :url is present but is neither a String nor an Addressable::URI
29
31
 
30
32
  @config = process_params(channel, params.transform_keys(&:to_sym))
31
33
  end
@@ -10,6 +10,9 @@ module Html2rss
10
10
  # Struct to represent a selector with associated attributes for extraction and processing.
11
11
  Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
12
12
 
13
+ # raised when an invalid selector name is used
14
+ class InvalidSelectorName < Html2rss::Error; end
15
+
13
16
  ##
14
17
  # @param config [Hash<Symbol, Object>]
15
18
  def initialize(config)
@@ -28,9 +31,15 @@ module Html2rss
28
31
  # @param name [Symbol]
29
32
  # @return [Selector]
30
33
  def selector(name)
31
- raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
34
+ raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
35
+
36
+ keywords = config[name].slice(*available_keys) # keeps only the keys that are Selector members
32
37
 
33
- Selector.new(config[name])
38
+ if (additional_keys = keywords.keys - available_keys).any? # NOTE(review): keywords was already sliced to available_keys, so this difference is always empty and the warning never fires; config[name].keys was presumably intended
39
+ Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
40
+ end
41
+
42
+ Selector.new(keywords)
34
43
  end
35
44
 
36
45
  ##
@@ -86,6 +95,8 @@ module Html2rss
86
95
  array.map!(&:to_sym)
87
96
  end.to_set
88
97
  end
98
+
99
+ def available_keys = @available_keys ||= Selector.members # memoized Selector struct member names, used by #selector to filter config keys
89
100
  end
90
101
  end
91
102
  end
data/lib/html2rss/item.rb CHANGED
@@ -23,7 +23,8 @@ module Html2rss
23
23
  # @param config [Html2rss::Config] Configuration object.
24
24
  # @return [Array<Html2rss::Item>] list of items fetched.
25
25
  def self.from_url(url, config)
26
- body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
26
+ body = Utils.request_url(url, headers: config.headers).body
27
+ body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
27
28
 
28
29
  Nokogiri.HTML(body)
29
30
  .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
@@ -47,6 +48,7 @@ module Html2rss
47
48
  # @param method_name [Symbol]
48
49
  # @param _include_private [true, false]
49
50
  # @return [true, false]
51
+ # :reek:BooleanParameter { enabled: false }
50
52
  def respond_to_missing?(method_name, _include_private = false)
51
53
  config.selector?(method_name) || super
52
54
  end
@@ -110,7 +112,11 @@ module Html2rss
110
112
  #
111
113
  # @return [Array<String>] list of unique, stripped categories; nil and empty values are dropped.
112
114
  def categories
113
- config.category_selector_names.map { |method_name| public_send(method_name) }
115
+ config.category_selector_names
116
+ .filter_map do |method_name|
117
+ category = public_send(method_name)
118
+ category.strip unless category.to_s.empty? # NOTE(review): a whitespace-only value passes this check and becomes "" after strip
119
+ end.uniq
114
120
  end
115
121
 
116
122
  ##
@@ -31,12 +31,12 @@ module Html2rss
31
31
  ##
32
32
  # Collapses each whitespace run to a single space, strips the ends, then parses and normalizes the given url.
33
33
  # @param url [String, nil]
34
- # @return [String, nil] sanitized and normalized URL, or nil if input is empty
34
+ # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
35
35
  def self.sanitize_url(url)
36
36
  url = url.to_s.gsub(/\s+/, ' ').strip
37
37
  return if url.empty?
38
38
 
39
- Addressable::URI.parse(url).normalize.to_s
39
+ Addressable::URI.parse(url).normalize
40
40
  end
41
41
 
42
42
  ##
@@ -71,18 +71,13 @@ module Html2rss
71
71
 
72
72
  ##
73
73
  # @param url [String, Addressable::URI]
74
- # @param convert_json_to_xml [true, false] Should JSON be converted to XML
75
74
  # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [String] body of the HTTP response
77
- def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
78
- response = Faraday.new(url:, headers:) do |faraday|
75
+ # @return [Faraday::Response] the HTTP response of the GET request (redirects are followed)
76
+ def self.request_url(url, headers: {})
77
+ Faraday.new(url:, headers:) do |faraday|
79
78
  faraday.use Faraday::FollowRedirects::Middleware
80
79
  faraday.adapter Faraday.default_adapter
81
80
  end.get
82
-
83
- body = response.body
84
-
85
- convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
86
81
  end
87
82
 
88
83
  ##
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.11.0'
6
+ VERSION = '0.13.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
8
  require 'yaml'
9
+ require 'logger'
9
10
 
10
11
  ##
11
12
  # The Html2rss namespace.
12
13
  module Html2rss
14
+ ##
15
+ # The logger instance. It writes to $stdout; the level comes from the LOG_LEVEL environment variable (default: warn).
16
+ Log = Logger.new($stdout)
17
+
18
+ Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
19
+
20
+ Log.formatter = proc do |severity, datetime, _progname, msg|
21
+ "#{datetime} [#{severity}] #{msg}\n"
22
+ end
23
+
13
24
  ##
14
25
  # The Html2rss::Error base class.
15
26
  class Error < StandardError; end
@@ -91,5 +102,15 @@ module Html2rss
91
102
  end
92
103
  end
93
104
 
105
+ ##
106
+ # Scrapes the provided URL and returns an RSS object.
107
+ # No need for a "feed config".
108
+ #
109
+ # @param url [String, Addressable::URI] the absolute http(s) URL to automatically source the feed from
110
+ # @return [RSS::Rss]
111
+ def self.auto_source(url)
112
+ Html2rss::AutoSource.new(url).build
113
+ end
114
+
94
115
  private_class_method :load_yaml, :find_feed_config
95
116
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-09 00:00:00.000000000 Z
11
+ date: 2024-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -106,6 +106,20 @@ dependencies:
106
106
  - - "<"
107
107
  - !ruby/object:Gem::Version
108
108
  version: '2.0'
109
+ - !ruby/object:Gem::Dependency
110
+ name: parallel
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
109
123
  - !ruby/object:Gem::Dependency
110
124
  name: regexp_parser
111
125
  requirement: !ruby/object:Gem::Requirement
@@ -219,6 +233,7 @@ files:
219
233
  - html2rss.gemspec
220
234
  - lib/html2rss.rb
221
235
  - lib/html2rss/attribute_post_processors.rb
236
+ - lib/html2rss/attribute_post_processors/base.rb
222
237
  - lib/html2rss/attribute_post_processors/gsub.rb
223
238
  - lib/html2rss/attribute_post_processors/html_to_markdown.rb
224
239
  - lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
@@ -229,6 +244,18 @@ files:
229
244
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
230
245
  - lib/html2rss/attribute_post_processors/substring.rb
231
246
  - lib/html2rss/attribute_post_processors/template.rb
247
+ - lib/html2rss/auto_source.rb
248
+ - lib/html2rss/auto_source/article.rb
249
+ - lib/html2rss/auto_source/channel.rb
250
+ - lib/html2rss/auto_source/cleanup.rb
251
+ - lib/html2rss/auto_source/reducer.rb
252
+ - lib/html2rss/auto_source/rss_builder.rb
253
+ - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/schema.rb
255
+ - lib/html2rss/auto_source/scraper/schema/base.rb
256
+ - lib/html2rss/auto_source/scraper/semantic_html.rb
257
+ - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
258
+ - lib/html2rss/auto_source/scraper/semantic_html/image.rb
232
259
  - lib/html2rss/cli.rb
233
260
  - lib/html2rss/config.rb
234
261
  - lib/html2rss/config/channel.rb
@@ -252,7 +279,7 @@ licenses:
252
279
  - MIT
253
280
  metadata:
254
281
  allowed_push_host: https://rubygems.org
255
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.11.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
256
283
  rubygems_mfa_required: 'true'
257
284
  post_install_message:
258
285
  rdoc_options: []