html2rss 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +9 -6
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
  11. data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
  12. data/lib/html2rss/attribute_post_processors/template.rb +4 -4
  13. data/lib/html2rss/auto_source/article.rb +95 -0
  14. data/lib/html2rss/auto_source/channel.rb +79 -0
  15. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  16. data/lib/html2rss/auto_source/reducer.rb +48 -0
  17. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  18. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  19. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  20. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  23. data/lib/html2rss/auto_source/scraper.rb +33 -0
  24. data/lib/html2rss/auto_source.rb +77 -0
  25. data/lib/html2rss/cli.rb +10 -0
  26. data/lib/html2rss/config/channel.rb +4 -2
  27. data/lib/html2rss/config/selectors.rb +2 -2
  28. data/lib/html2rss/item.rb +8 -2
  29. data/lib/html2rss/utils.rb +5 -10
  30. data/lib/html2rss/version.rb +1 -1
  31. data/lib/html2rss.rb +21 -0
  32. metadata +29 -3
@@ -31,12 +31,12 @@ module Html2rss
31
31
  ##
32
32
  # Collapses whitespace runs into single spaces, strips the ends, then parses and normalizes the given url.
33
33
  # @param url [String]
34
- # @return [String, nil] sanitized and normalized URL, or nil if input is empty
34
+ # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
35
35
  def self.sanitize_url(url)
36
36
  url = url.to_s.gsub(/\s+/, ' ').strip
37
37
  return if url.empty?
38
38
 
39
- Addressable::URI.parse(url).normalize.to_s
39
+ Addressable::URI.parse(url).normalize
40
40
  end
41
41
 
42
42
  ##
@@ -71,18 +71,13 @@ module Html2rss
71
71
 
72
72
  ##
73
73
  # @param url [String, Addressable::URI]
74
- # @param convert_json_to_xml [true, false] Should JSON be converted to XML
75
74
  # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [String] body of the HTTP response
77
- def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
78
- response = Faraday.new(url:, headers:) do |faraday|
75
+ # @return [Faraday::Response] the HTTP response (call #body on it to get the response body)
76
+ def self.request_url(url, headers: {})
77
+ Faraday.new(url:, headers:) do |faraday|
79
78
  faraday.use Faraday::FollowRedirects::Middleware
80
79
  faraday.adapter Faraday.default_adapter
81
80
  end.get
82
-
83
- body = response.body
84
-
85
- convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
86
81
  end
87
82
 
88
83
  ##
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.12.0'
6
+ VERSION = '0.13.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
8
  require 'yaml'
9
+ require 'logger'
9
10
 
10
11
  ##
11
12
  # The Html2rss namespace.
12
13
  module Html2rss
14
+ ##
15
+ # The logger instance.
16
+ Log = Logger.new($stdout)
17
+
18
+ Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
19
+
20
+ Log.formatter = proc do |severity, datetime, _progname, msg|
21
+ "#{datetime} [#{severity}] #{msg}\n"
22
+ end
23
+
13
24
  ##
14
25
  # The Html2rss::Error base class.
15
26
  class Error < StandardError; end
@@ -91,5 +102,15 @@ module Html2rss
91
102
  end
92
103
  end
93
104
 
105
+ ##
106
+ # Scrapes the provided URL and returns an RSS object.
107
+ # No need for a "feed config".
108
+ #
109
+ # @param url [String] the URL to automatically source the feed from
110
+ # @return [RSS::Rss]
111
+ def self.auto_source(url)
112
+ Html2rss::AutoSource.new(url).build
113
+ end
114
+
94
115
  private_class_method :load_yaml, :find_feed_config
95
116
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-10 00:00:00.000000000 Z
11
+ date: 2024-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -106,6 +106,20 @@ dependencies:
106
106
  - - "<"
107
107
  - !ruby/object:Gem::Version
108
108
  version: '2.0'
109
+ - !ruby/object:Gem::Dependency
110
+ name: parallel
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
109
123
  - !ruby/object:Gem::Dependency
110
124
  name: regexp_parser
111
125
  requirement: !ruby/object:Gem::Requirement
@@ -230,6 +244,18 @@ files:
230
244
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
231
245
  - lib/html2rss/attribute_post_processors/substring.rb
232
246
  - lib/html2rss/attribute_post_processors/template.rb
247
+ - lib/html2rss/auto_source.rb
248
+ - lib/html2rss/auto_source/article.rb
249
+ - lib/html2rss/auto_source/channel.rb
250
+ - lib/html2rss/auto_source/cleanup.rb
251
+ - lib/html2rss/auto_source/reducer.rb
252
+ - lib/html2rss/auto_source/rss_builder.rb
253
+ - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/schema.rb
255
+ - lib/html2rss/auto_source/scraper/schema/base.rb
256
+ - lib/html2rss/auto_source/scraper/semantic_html.rb
257
+ - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
258
+ - lib/html2rss/auto_source/scraper/semantic_html/image.rb
233
259
  - lib/html2rss/cli.rb
234
260
  - lib/html2rss/config.rb
235
261
  - lib/html2rss/config/channel.rb
@@ -253,7 +279,7 @@ licenses:
253
279
  - MIT
254
280
  metadata:
255
281
  allowed_push_host: https://rubygems.org
256
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
257
283
  rubygems_mfa_required: 'true'
258
284
  post_install_message:
259
285
  rdoc_options: []