html2rss 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +39 -11
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +9 -6
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
  11. data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
  12. data/lib/html2rss/attribute_post_processors/template.rb +4 -4
  13. data/lib/html2rss/auto_source/article.rb +95 -0
  14. data/lib/html2rss/auto_source/channel.rb +85 -0
  15. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  16. data/lib/html2rss/auto_source/reducer.rb +48 -0
  17. data/lib/html2rss/auto_source/rss_builder.rb +70 -0
  18. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  19. data/lib/html2rss/auto_source/scraper/schema.rb +128 -0
  20. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  23. data/lib/html2rss/auto_source/scraper.rb +33 -0
  24. data/lib/html2rss/auto_source.rb +80 -0
  25. data/lib/html2rss/cli.rb +10 -0
  26. data/lib/html2rss/config/channel.rb +4 -2
  27. data/lib/html2rss/config/selectors.rb +2 -2
  28. data/lib/html2rss/config.rb +1 -4
  29. data/lib/html2rss/item.rb +9 -3
  30. data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
  31. data/lib/html2rss/utils.rb +11 -10
  32. data/lib/html2rss/version.rb +1 -1
  33. data/lib/html2rss.rb +27 -11
  34. metadata +30 -4
data/lib/html2rss/item.rb CHANGED
@@ -19,11 +19,12 @@ module Html2rss
19
19
  ##
20
20
  # Fetches items from a given URL using configuration settings.
21
21
  #
22
- # @param url [String] URL to fetch items from.
22
+ # @param url [Addressable::URI] URL to fetch items from.
23
23
  # @param config [Html2rss::Config] Configuration object.
24
24
  # @return [Array<Html2rss::Item>] list of items fetched.
25
25
  def self.from_url(url, config)
26
- body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
26
+ body = Utils.request_url(url, headers: config.headers).body
27
+ body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
27
28
 
28
29
  Nokogiri.HTML(body)
29
30
  .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
@@ -47,6 +48,7 @@ module Html2rss
47
48
  # @param method_name [Symbol]
48
49
  # @param _include_private [true, false]
49
50
  # @return [true, false]
51
+ # :reek:BooleanParameter { enabled: false }
50
52
  def respond_to_missing?(method_name, _include_private = false)
51
53
  config.selector?(method_name) || super
52
54
  end
@@ -110,7 +112,11 @@ module Html2rss
110
112
  #
111
113
  # @return [Array<String>] list of categories.
112
114
  def categories
113
- config.category_selector_names.map { |method_name| public_send(method_name) }
115
+ config.category_selector_names
116
+ .filter_map do |method_name|
117
+ category = public_send(method_name)
118
+ category.strip unless category.to_s.empty?
119
+ end.uniq
114
120
  end
115
121
 
116
122
  ##
@@ -3,35 +3,50 @@
3
3
  module Html2rss
4
4
  module RssBuilder
5
5
  ##
6
- # Adds XML stylesheet tags (with the provided maker).
6
+ # Represents a stylesheet.
7
7
  class Stylesheet
8
- ##
9
- # Adds the stylesheet XML tags to the RSS.
10
- #
11
- # @param maker [RSS::Maker::RSS20] RSS maker object.
12
- # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
13
- # @return [nil]
14
- def self.add(maker, stylesheets)
15
- stylesheets.each do |stylesheet|
16
- add_stylesheet(maker, stylesheet)
8
+ class << self
9
+ ##
10
+ # Adds the stylesheet XML tags to the RSS.
11
+ #
12
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
13
+ # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
14
+ # @return [nil]
15
+ def add(maker, stylesheets)
16
+ stylesheets.each do |stylesheet|
17
+ add_stylesheet(maker, stylesheet)
18
+ end
17
19
  end
18
- end
19
20
 
20
- ##
21
- # Adds a single Stylesheet to the RSS.
22
- #
23
- # @param maker [RSS::Maker::RSS20] RSS maker object.
24
- # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
25
- # @return [nil]
26
- def self.add_stylesheet(maker, stylesheet)
27
- maker.xml_stylesheets.new_xml_stylesheet do |xss|
28
- xss.href = stylesheet.href
29
- xss.type = stylesheet.type
30
- xss.media = stylesheet.media
21
+ private
22
+
23
+ ##
24
+ # Adds a single Stylesheet to the RSS.
25
+ #
26
+ # @param maker [RSS::Maker::RSS20] RSS maker object.
27
+ # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
28
+ # @return [nil]
29
+ def add_stylesheet(maker, stylesheet)
30
+ maker.xml_stylesheets.new_xml_stylesheet do |xss|
31
+ xss.href = stylesheet.href
32
+ xss.type = stylesheet.type
33
+ xss.media = stylesheet.media
34
+ end
31
35
  end
32
36
  end
33
37
 
34
- private_class_method :add_stylesheet
38
+ TYPES = ['text/css', 'text/xsl'].freeze
39
+
40
+ def initialize(href:, type:, media: 'all')
41
+ raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
42
+ raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
43
+ raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
44
+
45
+ @href = href
46
+ @type = type
47
+ @media = media
48
+ end
49
+ attr_reader :href, :type, :media
35
50
  end
36
51
  end
37
52
  end
@@ -31,12 +31,12 @@ module Html2rss
31
31
  ##
32
32
  # Collapses whitespace runs into single spaces, strips the ends, then parses and normalizes the given url.
33
33
  # @param url [String]
34
- # @return [String, nil] sanitized and normalized URL, or nil if input is empty
34
+ # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
35
35
  def self.sanitize_url(url)
36
36
  url = url.to_s.gsub(/\s+/, ' ').strip
37
37
  return if url.empty?
38
38
 
39
- Addressable::URI.parse(url).normalize.to_s
39
+ Addressable::URI.parse(url).normalize
40
40
  end
41
41
 
42
42
  ##
@@ -44,6 +44,7 @@ module Html2rss
44
44
  #
45
45
  # @param time_zone [String]
46
46
  # @param default_time_zone [String]
47
+ # @yield block to execute with the given time zone
47
48
  # @return [Object] whatever the given block returns
48
49
  def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
49
50
  raise ArgumentError, 'a block is required' unless block_given?
@@ -71,18 +72,18 @@ module Html2rss
71
72
 
72
73
  ##
73
74
  # @param url [String, Addressable::URI]
74
- # @param convert_json_to_xml [true, false] Should JSON be converted to XML
75
75
  # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [String] body of the HTTP response
77
- def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
78
- response = Faraday.new(url:, headers:) do |faraday|
76
+ # @return [Faraday::Response] the HTTP response (call `#body` on it for the response body)
77
+ def self.request_url(url, headers: {})
78
+ url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
79
+
80
+ raise ArgumentError, 'URL must be absolute' unless url.absolute?
81
+ raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
82
+
83
+ Faraday.new(url:, headers:) do |faraday|
79
84
  faraday.use Faraday::FollowRedirects::Middleware
80
85
  faraday.adapter Faraday.default_adapter
81
86
  end.get
82
-
83
- body = response.body
84
-
85
- convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
86
87
  end
87
88
 
88
89
  ##
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.12.0'
6
+ VERSION = '0.14.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
8
  require 'yaml'
9
+ require 'logger'
9
10
 
10
11
  ##
11
12
  # The Html2rss namespace.
12
13
  module Html2rss
14
+ ##
15
+ # The logger instance.
16
+ Log = Logger.new($stdout)
17
+
18
+ Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
19
+
20
+ Log.formatter = proc do |severity, datetime, _progname, msg|
21
+ "#{datetime} [#{severity}] #{msg}\n"
22
+ end
23
+
13
24
  ##
14
25
  # The Html2rss::Error base class.
15
26
  class Error < StandardError; end
@@ -32,7 +43,7 @@ module Html2rss
32
43
  # @param params [Hash] Dynamic parameters for the feed configuration.
33
44
  # @return [RSS::Rss] RSS object generated from the configuration.
34
45
  def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
35
- yaml = load_yaml(file)
46
+ yaml = YAML.safe_load_file(file, symbolize_names: true)
36
47
  feeds = yaml[CONFIG_KEY_FEEDS] || {}
37
48
 
38
49
  feed_config = find_feed_config(yaml, feeds, name, global_config)
@@ -62,15 +73,6 @@ module Html2rss
62
73
  RssBuilder.build(config)
63
74
  end
64
75
 
65
- ##
66
- # Loads and parses the YAML file.
67
- #
68
- # @param file [String] Path to the YAML file.
69
- # @return [Hash] Parsed YAML content.
70
- def self.load_yaml(file)
71
- YAML.safe_load_file(file, symbolize_names: true)
72
- end
73
-
74
76
  ##
75
77
  # Builds the feed configuration based on the provided parameters.
76
78
  #
@@ -91,5 +93,19 @@ module Html2rss
91
93
  end
92
94
  end
93
95
 
94
- private_class_method :load_yaml, :find_feed_config
96
+ ##
97
+ # Scrapes the provided URL and returns an RSS object.
98
+ # No need for a "feed config".
99
+ #
100
+ # @param url [String] the URL to automatically source the feed from
101
+ # @return [RSS::Rss]
102
+ def self.auto_source(url)
103
+ url = Addressable::URI.parse(url)
104
+
105
+ response = Html2rss::Utils.request_url(url)
106
+
107
+ Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
108
+ end
109
+
110
+ private_class_method :find_feed_config
95
111
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-10 00:00:00.000000000 Z
11
+ date: 2024-10-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -106,6 +106,20 @@ dependencies:
106
106
  - - "<"
107
107
  - !ruby/object:Gem::Version
108
108
  version: '2.0'
109
+ - !ruby/object:Gem::Dependency
110
+ name: parallel
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
109
123
  - !ruby/object:Gem::Dependency
110
124
  name: regexp_parser
111
125
  requirement: !ruby/object:Gem::Requirement
@@ -230,6 +244,18 @@ files:
230
244
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
231
245
  - lib/html2rss/attribute_post_processors/substring.rb
232
246
  - lib/html2rss/attribute_post_processors/template.rb
247
+ - lib/html2rss/auto_source.rb
248
+ - lib/html2rss/auto_source/article.rb
249
+ - lib/html2rss/auto_source/channel.rb
250
+ - lib/html2rss/auto_source/cleanup.rb
251
+ - lib/html2rss/auto_source/reducer.rb
252
+ - lib/html2rss/auto_source/rss_builder.rb
253
+ - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/schema.rb
255
+ - lib/html2rss/auto_source/scraper/schema/base.rb
256
+ - lib/html2rss/auto_source/scraper/semantic_html.rb
257
+ - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
258
+ - lib/html2rss/auto_source/scraper/semantic_html/image.rb
233
259
  - lib/html2rss/cli.rb
234
260
  - lib/html2rss/config.rb
235
261
  - lib/html2rss/config/channel.rb
@@ -253,7 +279,7 @@ licenses:
253
279
  - MIT
254
280
  metadata:
255
281
  allowed_push_host: https://rubygems.org
256
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
257
283
  rubygems_mfa_required: 'true'
258
284
  post_install_message:
259
285
  rdoc_options: []
@@ -270,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
270
296
  - !ruby/object:Gem::Version
271
297
  version: '0'
272
298
  requirements: []
273
- rubygems_version: 3.5.11
299
+ rubygems_version: 3.5.16
274
300
  signing_key:
275
301
  specification_version: 4
276
302
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors