html2rss 0.12.0 → 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (32) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +38 -10
  3. data/html2rss.gemspec +1 -0
  4. data/lib/html2rss/attribute_post_processors/base.rb +9 -6
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
  7. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
  8. data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
  9. data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
  10. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
  11. data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
  12. data/lib/html2rss/attribute_post_processors/template.rb +4 -4
  13. data/lib/html2rss/auto_source/article.rb +95 -0
  14. data/lib/html2rss/auto_source/channel.rb +79 -0
  15. data/lib/html2rss/auto_source/cleanup.rb +76 -0
  16. data/lib/html2rss/auto_source/reducer.rb +48 -0
  17. data/lib/html2rss/auto_source/rss_builder.rb +68 -0
  18. data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
  19. data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
  20. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
  21. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
  22. data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
  23. data/lib/html2rss/auto_source/scraper.rb +33 -0
  24. data/lib/html2rss/auto_source.rb +77 -0
  25. data/lib/html2rss/cli.rb +10 -0
  26. data/lib/html2rss/config/channel.rb +4 -2
  27. data/lib/html2rss/config/selectors.rb +2 -2
  28. data/lib/html2rss/item.rb +8 -2
  29. data/lib/html2rss/utils.rb +5 -10
  30. data/lib/html2rss/version.rb +1 -1
  31. data/lib/html2rss.rb +21 -0
  32. metadata +29 -3
@@ -31,12 +31,12 @@ module Html2rss
31
31
  ##
32
32
  # Collapses whitespace runs to a single space, strips the ends, then parses and normalizes the given url.
33
33
  # @param url [String]
34
- # @return [String, nil] sanitized and normalized URL, or nil if input is empty
34
+ # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
35
35
  def self.sanitize_url(url)
36
36
  url = url.to_s.gsub(/\s+/, ' ').strip
37
37
  return if url.empty?
38
38
 
39
- Addressable::URI.parse(url).normalize.to_s
39
+ Addressable::URI.parse(url).normalize
40
40
  end
41
41
 
42
42
  ##
@@ -71,18 +71,13 @@ module Html2rss
71
71
 
72
72
  ##
73
73
  # @param url [String, Addressable::URI]
74
- # @param convert_json_to_xml [true, false] Should JSON be converted to XML
75
74
  # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [String] body of the HTTP response
77
- def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
78
- response = Faraday.new(url:, headers:) do |faraday|
75
+ # @return [Faraday::Response] the HTTP response (call #body on it for the payload)
76
+ def self.request_url(url, headers: {})
77
+ Faraday.new(url:, headers:) do |faraday|
79
78
  faraday.use Faraday::FollowRedirects::Middleware
80
79
  faraday.adapter Faraday.default_adapter
81
80
  end.get
82
-
83
- body = response.body
84
-
85
- convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
86
81
  end
87
82
 
88
83
  ##
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.12.0'
6
+ VERSION = '0.13.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
8
  require 'yaml'
9
+ require 'logger'
9
10
 
10
11
  ##
11
12
  # The Html2rss namespace.
12
13
  module Html2rss
14
+ ##
15
+ # The logger instance.
16
+ Log = Logger.new($stdout)
17
+
18
+ Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
19
+
20
+ Log.formatter = proc do |severity, datetime, _progname, msg|
21
+ "#{datetime} [#{severity}] #{msg}\n"
22
+ end
23
+
13
24
  ##
14
25
  # The Html2rss::Error base class.
15
26
  class Error < StandardError; end
@@ -91,5 +102,15 @@ module Html2rss
91
102
  end
92
103
  end
93
104
 
105
+ ##
106
+ # Scrapes the provided URL and returns an RSS object.
107
+ # No need for a "feed config".
108
+ #
109
+ # @param url [String] the URL to automatically source the feed from
110
+ # @return [RSS::Rss]
111
+ def self.auto_source(url)
112
+ Html2rss::AutoSource.new(url).build
113
+ end
114
+
94
115
  private_class_method :load_yaml, :find_feed_config
95
116
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.12.0
4
+ version: 0.13.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-08-10 00:00:00.000000000 Z
11
+ date: 2024-08-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -106,6 +106,20 @@ dependencies:
106
106
  - - "<"
107
107
  - !ruby/object:Gem::Version
108
108
  version: '2.0'
109
+ - !ruby/object:Gem::Dependency
110
+ name: parallel
111
+ requirement: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ type: :runtime
117
+ prerelease: false
118
+ version_requirements: !ruby/object:Gem::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: '0'
109
123
  - !ruby/object:Gem::Dependency
110
124
  name: regexp_parser
111
125
  requirement: !ruby/object:Gem::Requirement
@@ -230,6 +244,18 @@ files:
230
244
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
231
245
  - lib/html2rss/attribute_post_processors/substring.rb
232
246
  - lib/html2rss/attribute_post_processors/template.rb
247
+ - lib/html2rss/auto_source.rb
248
+ - lib/html2rss/auto_source/article.rb
249
+ - lib/html2rss/auto_source/channel.rb
250
+ - lib/html2rss/auto_source/cleanup.rb
251
+ - lib/html2rss/auto_source/reducer.rb
252
+ - lib/html2rss/auto_source/rss_builder.rb
253
+ - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/schema.rb
255
+ - lib/html2rss/auto_source/scraper/schema/base.rb
256
+ - lib/html2rss/auto_source/scraper/semantic_html.rb
257
+ - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
258
+ - lib/html2rss/auto_source/scraper/semantic_html/image.rb
233
259
  - lib/html2rss/cli.rb
234
260
  - lib/html2rss/config.rb
235
261
  - lib/html2rss/config/channel.rb
@@ -253,7 +279,7 @@ licenses:
253
279
  - MIT
254
280
  metadata:
255
281
  allowed_push_host: https://rubygems.org
256
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
282
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
257
283
  rubygems_mfa_required: 'true'
258
284
  post_install_message:
259
285
  rdoc_options: []