html2rss 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +112 -44
  3. data/html2rss.gemspec +3 -2
  4. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
  5. data/lib/html2rss/auto_source/article.rb +37 -5
  6. data/lib/html2rss/auto_source/channel.rb +21 -28
  7. data/lib/html2rss/auto_source/cleanup.rb +0 -16
  8. data/lib/html2rss/auto_source/rss_builder.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/html.rb +21 -12
  10. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
  11. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
  12. data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
  13. data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
  14. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
  15. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
  17. data/lib/html2rss/auto_source.rb +0 -7
  18. data/lib/html2rss/cli.rb +11 -4
  19. data/lib/html2rss/config/channel.rb +7 -1
  20. data/lib/html2rss/config/selectors.rb +2 -1
  21. data/lib/html2rss/config.rb +1 -0
  22. data/lib/html2rss/item.rb +7 -2
  23. data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
  24. data/lib/html2rss/request_service/context.rb +46 -0
  25. data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
  26. data/lib/html2rss/request_service/puppet_commander.rb +61 -0
  27. data/lib/html2rss/request_service/response.rb +27 -0
  28. data/lib/html2rss/request_service/strategy.rb +28 -0
  29. data/lib/html2rss/request_service.rb +97 -0
  30. data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
  31. data/lib/html2rss/utils.rb +23 -26
  32. data/lib/html2rss/version.rb +1 -1
  33. data/lib/html2rss.rb +5 -5
  34. metadata +31 -11
  35. data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # To be used by strategies to provide their response.
7
+ class Response
8
+ ##
9
+ # @param body [String] the body of the response
10
+ # @param headers [Hash] the headers of the response
11
+ def initialize(body:, headers: {})
12
+ @body = body
13
+
14
+ headers = headers.dup
15
+ headers.transform_keys!(&:to_s)
16
+
17
+ @headers = headers
18
+ end
19
+
20
+ # @return [String] the body of the response
21
+ attr_reader :body
22
+
23
+ # @return [Hash<String, Object>] the headers of the response
24
+ attr_reader :headers
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # Defines the interface of every request strategy.
7
+ class Strategy
8
+ ##
9
+ # @param ctx [Context] the context for the request
10
+ def initialize(ctx)
11
+ @ctx = ctx
12
+ end
13
+
14
+ ##
15
+ # Executes the request.
16
+ # @return [Response] the response from the strategy
17
+ # @raise [NotImplementedError] if the method is not implemented by the subclass
18
+ def execute
19
+ raise NotImplementedError, 'Subclass must implement #execute'
20
+ end
21
+
22
+ private
23
+
24
+ # @return [Context] the context for the request
25
+ attr_reader :ctx
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'singleton'
4
+ require 'forwardable'
5
+
6
+ module Html2rss
7
+ ##
8
+ # Requests website URLs to retrieve their HTML for further processing.
9
+ # Provides strategies, i.e. to integrate Browserless.io.
10
+ class RequestService
11
+ include Singleton
12
+
13
+ class UnknownStrategy < Html2rss::Error; end
14
+ class InvalidUrl < Html2rss::Error; end
15
+ class UnsupportedUrlScheme < Html2rss::Error; end
16
+
17
+ class << self
18
+ extend Forwardable
19
+
20
+ %i[default_strategy_name
21
+ default_strategy_name=
22
+ strategy_names
23
+ register_strategy
24
+ unregister_strategy
25
+ strategy_registered?
26
+ execute].each do |method|
27
+ def_delegator :instance, method
28
+ end
29
+ end
30
+
31
+ def initialize
32
+ @strategies = {
33
+ faraday: FaradayStrategy,
34
+ browserless: BrowserlessStrategy
35
+ }
36
+ @default_strategy_name = :faraday
37
+ end
38
+
39
+ # @return [Symbol] the default strategy name
40
+ attr_reader :default_strategy_name
41
+
42
+ ##
43
+ # Sets the default strategy.
44
+ # @param strategy [Symbol] the name of the strategy
45
+ # @raise [UnknownStrategy] if the strategy is not registered
46
+ def default_strategy_name=(strategy)
47
+ raise UnknownStrategy unless strategy_registered?(strategy)
48
+
49
+ @default_strategy_name = strategy.to_sym
50
+ end
51
+
52
+ # @return [Array<String>] the names of the registered strategies
53
+ def strategy_names = @strategies.keys.map(&:to_s)
54
+
55
+ ##
56
+ # Registers a new strategy.
57
+ # @param name [Symbol] the name of the strategy
58
+ # @param strategy_class [Class] the class of the strategy
59
+ def register_strategy(name, strategy_class)
60
+ raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
61
+
62
+ @strategies[name.to_sym] = strategy_class
63
+ end
64
+
65
+ ##
66
+ # Checks if a strategy is registered.
67
+ # @param name [Symbol] the name of the strategy
68
+ # @return [Boolean] true if the strategy is registered, false otherwise
69
+ def strategy_registered?(name)
70
+ @strategies.key?(name.to_sym)
71
+ end
72
+
73
+ ##
74
+ # Unregisters a strategy.
75
+ # @param name [Symbol] the name of the strategy
76
+ # @return [Boolean] true if the strategy was unregistered, false otherwise
77
+ def unregister_strategy(name)
78
+ raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
79
+
80
+ !!@strategies.delete(name.to_sym)
81
+ end
82
+
83
+ ##
84
+ # Executes the request.
85
+ # @param ctx [Context] the context for the request
86
+ # @param strategy [Symbol] the strategy to use
87
+ # @return [Response] the response from the strategy
88
+ # @raise [UnknownStrategy] if the strategy is not known
89
+ def execute(ctx, strategy: default_strategy_name)
90
+ strategy_class = @strategies.fetch(strategy) do
91
+ raise UnknownStrategy,
92
+ "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
93
+ end
94
+ strategy_class.new(ctx).execute
95
+ end
96
+ end
97
+ end
@@ -47,6 +47,13 @@ module Html2rss
47
47
  @media = media
48
48
  end
49
49
  attr_reader :href, :type, :media
50
+
51
+ # @return [String] the XML representation of the stylesheet
52
+ def to_xml
53
+ <<~XML
54
+ <?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>
55
+ XML
56
+ end
50
57
  end
51
58
  end
52
59
  end
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable/uri'
4
- require 'faraday'
5
- require 'faraday/follow_redirects'
6
4
  require 'json'
7
5
  require 'regexp_parser'
8
6
  require 'tzinfo'
@@ -15,11 +13,10 @@ module Html2rss
15
13
  module Utils
16
14
  ##
17
15
  # @param url [String, Addressable::URI]
18
- # @param base_url [String]
16
+ # @param base_url [String, Addressable::URI]
19
17
  # @return [Addressable::URI]
20
18
  def self.build_absolute_url_from_relative(url, base_url)
21
- url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
22
-
19
+ url = Addressable::URI.parse(url)
23
20
  return url if url.absolute?
24
21
 
25
22
  base_uri = Addressable::URI.parse(base_url)
@@ -59,31 +56,31 @@ module Html2rss
59
56
  end
60
57
 
61
58
  ##
62
- # Builds a titleized representation of the URL.
63
- # @param url [String, Addressable::URI]
59
+ # Builds a titleized representation of the URL with prefixed host.
60
+ # @param url [Addressable::URI]
64
61
  # @return [String]
65
- def self.titleized_url(url)
66
- uri = Addressable::URI.parse(url)
67
- host = uri.host
62
+ def self.titleized_channel_url(url)
63
+ nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
64
+ host = url.host
68
65
 
69
- nicer_path = uri.path.split('/').reject(&:empty?)
70
66
  nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
71
67
  end
72
68
 
73
69
  ##
74
- # @param url [String, Addressable::URI]
75
- # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [Faraday::Response] body of the HTTP response
77
- def self.request_url(url, headers: {})
78
- url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
79
-
80
- raise ArgumentError, 'URL must be absolute' unless url.absolute?
81
- raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
82
-
83
- Faraday.new(url:, headers:) do |faraday|
84
- faraday.use Faraday::FollowRedirects::Middleware
85
- faraday.adapter Faraday.default_adapter
86
- end.get
70
+ # Builds a titleized representation of the URL.
71
+ # @param url [Addressable::URI]
72
+ # @return [String]
73
+ def self.titleized_url(url)
74
+ return '' if url.path.empty?
75
+
76
+ nicer_path = CGI.unescapeURIComponent(url.path)
77
+ .split('/')
78
+ .flat_map do |part|
79
+ part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
80
+ end
81
+
82
+ nicer_path.map!(&:capitalize)
83
+ File.basename nicer_path.join(' '), '.*'
87
84
  end
88
85
 
89
86
  ##
@@ -104,10 +101,10 @@ module Html2rss
104
101
  ##
105
102
  # Guesses the content type based on the file extension of the URL.
106
103
  #
107
- # @param url [String, Addressable::URI]
104
+ # @param url [Addressable::URI]
108
105
  # @return [String] guessed content type, defaults to 'application/octet-stream'
109
106
  def self.guess_content_type_from_url(url)
110
- url = url.to_s.split('?').first
107
+ url = url.path.split('?').first
111
108
 
112
109
  content_type = MIME::Types.type_for(File.extname(url).delete('.'))
113
110
  content_type.first&.to_s || 'application/octet-stream'
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.15.0'
6
+ VERSION = '0.17.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -99,13 +99,13 @@ module Html2rss
99
99
  # No need for a "feed config".
100
100
  #
101
101
  # @param url [String] the URL to automatically source the feed from
102
+ # @param strategy [Symbol] the request strategy to use
102
103
  # @return [RSS::Rss]
103
- def self.auto_source(url)
104
- url = Addressable::URI.parse(url)
104
+ def self.auto_source(url, strategy: :faraday)
105
+ ctx = RequestService::Context.new(url:, headers: {})
106
+ response = RequestService.execute(ctx, strategy:)
105
107
 
106
- response = Html2rss::Utils.request_url(url)
107
-
108
- Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
108
+ Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
109
109
  end
110
110
 
111
111
  private_class_method :find_feed_config
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.0
4
+ version: 0.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2024-10-30 00:00:00.000000000 Z
10
+ date: 2025-01-18 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: addressable
@@ -120,6 +119,20 @@ dependencies:
120
119
  - - ">="
121
120
  - !ruby/object:Gem::Version
122
121
  version: '0'
122
+ - !ruby/object:Gem::Dependency
123
+ name: puppeteer-ruby
124
+ requirement: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - ">="
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ type: :runtime
130
+ prerelease: false
131
+ version_requirements: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
123
136
  - !ruby/object:Gem::Dependency
124
137
  name: regexp_parser
125
138
  requirement: !ruby/object:Gem::Requirement
@@ -140,14 +153,14 @@ dependencies:
140
153
  requirements:
141
154
  - - "~>"
142
155
  - !ruby/object:Gem::Version
143
- version: '2.0'
156
+ version: '3.0'
144
157
  type: :runtime
145
158
  prerelease: false
146
159
  version_requirements: !ruby/object:Gem::Requirement
147
160
  requirements:
148
161
  - - "~>"
149
162
  - !ruby/object:Gem::Version
150
- version: '2.0'
163
+ version: '3.0'
151
164
  - !ruby/object:Gem::Dependency
152
165
  name: rss
153
166
  requirement: !ruby/object:Gem::Requirement
@@ -253,7 +266,9 @@ files:
253
266
  - lib/html2rss/auto_source/scraper.rb
254
267
  - lib/html2rss/auto_source/scraper/html.rb
255
268
  - lib/html2rss/auto_source/scraper/schema.rb
256
- - lib/html2rss/auto_source/scraper/schema/base.rb
269
+ - lib/html2rss/auto_source/scraper/schema/item_list.rb
270
+ - lib/html2rss/auto_source/scraper/schema/list_item.rb
271
+ - lib/html2rss/auto_source/scraper/schema/thing.rb
257
272
  - lib/html2rss/auto_source/scraper/semantic_html.rb
258
273
  - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
259
274
  - lib/html2rss/auto_source/scraper/semantic_html/image.rb
@@ -269,6 +284,13 @@ files:
269
284
  - lib/html2rss/item_extractors/static.rb
270
285
  - lib/html2rss/item_extractors/text.rb
271
286
  - lib/html2rss/object_to_xml_converter.rb
287
+ - lib/html2rss/request_service.rb
288
+ - lib/html2rss/request_service/browserless_strategy.rb
289
+ - lib/html2rss/request_service/context.rb
290
+ - lib/html2rss/request_service/faraday_strategy.rb
291
+ - lib/html2rss/request_service/puppet_commander.rb
292
+ - lib/html2rss/request_service/response.rb
293
+ - lib/html2rss/request_service/strategy.rb
272
294
  - lib/html2rss/rss_builder.rb
273
295
  - lib/html2rss/rss_builder/channel.rb
274
296
  - lib/html2rss/rss_builder/item.rb
@@ -280,9 +302,8 @@ licenses:
280
302
  - MIT
281
303
  metadata:
282
304
  allowed_push_host: https://rubygems.org
283
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
305
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.17.0
284
306
  rubygems_mfa_required: 'true'
285
- post_install_message:
286
307
  rdoc_options: []
287
308
  require_paths:
288
309
  - lib
@@ -290,15 +311,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
290
311
  requirements:
291
312
  - - ">="
292
313
  - !ruby/object:Gem::Version
293
- version: '3.1'
314
+ version: '3.2'
294
315
  required_rubygems_version: !ruby/object:Gem::Requirement
295
316
  requirements:
296
317
  - - ">="
297
318
  - !ruby/object:Gem::Version
298
319
  version: '0'
299
320
  requirements: []
300
- rubygems_version: 3.5.16
301
- signing_key:
321
+ rubygems_version: 3.6.2
302
322
  specification_version: 4
303
323
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
304
324
  to extract item.
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'date'
4
-
5
- module Html2rss
6
- class AutoSource
7
- module Scraper
8
- class Schema
9
- ##
10
- # Base class for Schema.org schema_objects.
11
- #
12
- # @see https://schema.org/Article
13
- class Base
14
- DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
15
-
16
- def initialize(schema_object, url:)
17
- @schema_object = schema_object
18
- @url = url
19
- end
20
-
21
- # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
22
- def call
23
- DEFAULT_ATTRIBUTES.to_h do |attribute|
24
- [attribute, public_send(attribute)]
25
- end
26
- end
27
-
28
- def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
29
- def title = schema_object[:title]
30
-
31
- def description
32
- [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
33
- .max_by { |desc| desc.to_s.size }
34
- end
35
-
36
- # @return [Addressable::URI, nil] the URL of the schema object
37
- def url
38
- url = schema_object[:url]
39
- if url.to_s.empty?
40
- Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
41
- return
42
- end
43
-
44
- Utils.build_absolute_url_from_relative(url, @url)
45
- end
46
-
47
- def image = images.first || nil
48
- def published_at = schema_object[:datePublished]
49
-
50
- private
51
-
52
- attr_reader :schema_object
53
-
54
- def images
55
- Array(schema_object[:image]).compact
56
- end
57
- end
58
- end
59
- end
60
- end
61
- end