html2rss 0.14.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +113 -44
  3. data/html2rss.gemspec +3 -2
  4. data/lib/html2rss/auto_source/article.rb +37 -5
  5. data/lib/html2rss/auto_source/channel.rb +21 -28
  6. data/lib/html2rss/auto_source/cleanup.rb +0 -16
  7. data/lib/html2rss/auto_source/rss_builder.rb +1 -1
  8. data/lib/html2rss/auto_source/scraper/html.rb +96 -0
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
  13. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +51 -38
  14. data/lib/html2rss/auto_source/scraper.rb +1 -0
  15. data/lib/html2rss/auto_source.rb +0 -7
  16. data/lib/html2rss/cli.rb +11 -4
  17. data/lib/html2rss/config/channel.rb +7 -1
  18. data/lib/html2rss/config/selectors.rb +2 -1
  19. data/lib/html2rss/config.rb +1 -0
  20. data/lib/html2rss/item.rb +7 -2
  21. data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
  22. data/lib/html2rss/request_service/context.rb +46 -0
  23. data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
  24. data/lib/html2rss/request_service/puppet_commander.rb +61 -0
  25. data/lib/html2rss/request_service/response.rb +27 -0
  26. data/lib/html2rss/request_service/strategy.rb +28 -0
  27. data/lib/html2rss/request_service.rb +97 -0
  28. data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
  29. data/lib/html2rss/utils.rb +23 -26
  30. data/lib/html2rss/version.rb +1 -1
  31. data/lib/html2rss.rb +7 -6
  32. metadata +35 -11
  33. data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # Defines the interface of every request strategy.
7
+ class Strategy
8
+ ##
9
+ # @param ctx [Context] the context for the request
10
+ def initialize(ctx)
11
+ @ctx = ctx
12
+ end
13
+
14
+ ##
15
+ # Executes the request.
16
+ # @return [Response] the response from the strategy
17
+ # @raise [NotImplementedError] if the method is not implemented by the subclass
18
+ def execute
19
+ raise NotImplementedError, 'Subclass must implement #execute'
20
+ end
21
+
22
+ private
23
+
24
+ # @return [Context] the context for the request
25
+ attr_reader :ctx
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'singleton'
4
+ require 'forwardable'
5
+
6
+ module Html2rss
7
+ ##
8
+ # Requests website URLs to retrieve their HTML for further processing.
9
+ # Provides strategies, i.e. to integrate Browserless.io.
10
+ class RequestService
11
+ include Singleton
12
+
13
+ class UnknownStrategy < Html2rss::Error; end
14
+ class InvalidUrl < Html2rss::Error; end
15
+ class UnsupportedUrlScheme < Html2rss::Error; end
16
+
17
+ class << self
18
+ extend Forwardable
19
+
20
+ %i[default_strategy_name
21
+ default_strategy_name=
22
+ strategy_names
23
+ register_strategy
24
+ unregister_strategy
25
+ strategy_registered?
26
+ execute].each do |method|
27
+ def_delegator :instance, method
28
+ end
29
+ end
30
+
31
+ def initialize
32
+ @strategies = {
33
+ faraday: FaradayStrategy,
34
+ browserless: BrowserlessStrategy
35
+ }
36
+ @default_strategy_name = :faraday
37
+ end
38
+
39
+ # @return [Symbol] the default strategy name
40
+ attr_reader :default_strategy_name
41
+
42
+ ##
43
+ # Sets the default strategy.
44
+ # @param strategy [Symbol] the name of the strategy
45
+ # @raise [UnknownStrategy] if the strategy is not registered
46
+ def default_strategy_name=(strategy)
47
+ raise UnknownStrategy unless strategy_registered?(strategy)
48
+
49
+ @default_strategy_name = strategy.to_sym
50
+ end
51
+
52
+ # @return [Array<String>] the names of the registered strategies
53
+ def strategy_names = @strategies.keys.map(&:to_s)
54
+
55
+ ##
56
+ # Registers a new strategy.
57
+ # @param name [Symbol] the name of the strategy
58
+ # @param strategy_class [Class] the class of the strategy
59
+ def register_strategy(name, strategy_class)
60
+ raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
61
+
62
+ @strategies[name.to_sym] = strategy_class
63
+ end
64
+
65
+ ##
66
+ # Checks if a strategy is registered.
67
+ # @param name [Symbol] the name of the strategy
68
+ # @return [Boolean] true if the strategy is registered, false otherwise
69
+ def strategy_registered?(name)
70
+ @strategies.key?(name.to_sym)
71
+ end
72
+
73
+ ##
74
+ # Unregisters a strategy.
75
+ # @param name [Symbol] the name of the strategy
76
+ # @return [Boolean] true if the strategy was unregistered, false otherwise
77
+ def unregister_strategy(name)
78
+ raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
79
+
80
+ !!@strategies.delete(name.to_sym)
81
+ end
82
+
83
+ ##
84
+ # Executes the request.
85
+ # @param ctx [Context] the context for the request
86
+ # @param strategy [Symbol] the strategy to use
87
+ # @return [Response] the response from the strategy
88
+ # @raise [UnknownStrategy] if the strategy is not known
89
+ def execute(ctx, strategy: default_strategy_name)
90
+ strategy_class = @strategies.fetch(strategy) do
91
+ raise UnknownStrategy,
92
+ "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
93
+ end
94
+ strategy_class.new(ctx).execute
95
+ end
96
+ end
97
+ end
@@ -47,6 +47,13 @@ module Html2rss
47
47
  @media = media
48
48
  end
49
49
  attr_reader :href, :type, :media
50
+
51
+ # @return [String] the XML representation of the stylesheet
52
+ def to_xml
53
+ <<~XML
54
+ <?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>
55
+ XML
56
+ end
50
57
  end
51
58
  end
52
59
  end
@@ -1,8 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'addressable/uri'
4
- require 'faraday'
5
- require 'faraday/follow_redirects'
6
4
  require 'json'
7
5
  require 'regexp_parser'
8
6
  require 'tzinfo'
@@ -15,11 +13,10 @@ module Html2rss
15
13
  module Utils
16
14
  ##
17
15
  # @param url [String, Addressable::URI]
18
- # @param base_url [String]
16
+ # @param base_url [String, Addressable::URI]
19
17
  # @return [Addressable::URI]
20
18
  def self.build_absolute_url_from_relative(url, base_url)
21
- url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
22
-
19
+ url = Addressable::URI.parse(url)
23
20
  return url if url.absolute?
24
21
 
25
22
  base_uri = Addressable::URI.parse(base_url)
@@ -59,31 +56,31 @@ module Html2rss
59
56
  end
60
57
 
61
58
  ##
62
- # Builds a titleized representation of the URL.
63
- # @param url [String, Addressable::URI]
59
+ # Builds a titleized representation of the URL with prefixed host.
60
+ # @param url [Addressable::URI]
64
61
  # @return [String]
65
- def self.titleized_url(url)
66
- uri = Addressable::URI.parse(url)
67
- host = uri.host
62
+ def self.titleized_channel_url(url)
63
+ nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
64
+ host = url.host
68
65
 
69
- nicer_path = uri.path.split('/').reject(&:empty?)
70
66
  nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
71
67
  end
72
68
 
73
69
  ##
74
- # @param url [String, Addressable::URI]
75
- # @param headers [Hash] additional HTTP request headers to use for the request
76
- # @return [Faraday::Response] body of the HTTP response
77
- def self.request_url(url, headers: {})
78
- url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
79
-
80
- raise ArgumentError, 'URL must be absolute' unless url.absolute?
81
- raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
82
-
83
- Faraday.new(url:, headers:) do |faraday|
84
- faraday.use Faraday::FollowRedirects::Middleware
85
- faraday.adapter Faraday.default_adapter
86
- end.get
70
+ # Builds a titleized representation of the URL.
71
+ # @param url [Addressable::URI]
72
+ # @return [String]
73
+ def self.titleized_url(url)
74
+ return '' if url.path.empty?
75
+
76
+ nicer_path = CGI.unescapeURIComponent(url.path)
77
+ .split('/')
78
+ .flat_map do |part|
79
+ part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
80
+ end
81
+
82
+ nicer_path.map!(&:capitalize)
83
+ File.basename nicer_path.join(' '), '.*'
87
84
  end
88
85
 
89
86
  ##
@@ -104,10 +101,10 @@ module Html2rss
104
101
  ##
105
102
  # Guesses the content type based on the file extension of the URL.
106
103
  #
107
- # @param url [String, Addressable::URI]
104
+ # @param url [Addressable::URI]
108
105
  # @return [String] guessed content type, defaults to 'application/octet-stream'
109
106
  def self.guess_content_type_from_url(url)
110
- url = url.to_s.split('?').first
107
+ url = url.path.split('?').first
111
108
 
112
109
  content_type = MIME::Types.type_for(File.extname(url).delete('.'))
113
110
  content_type.first&.to_s || 'application/octet-stream'
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.14.0'
6
+ VERSION = '0.16.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -5,8 +5,9 @@ require 'zeitwerk'
5
5
  loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
- require 'yaml'
8
+ require 'addressable'
9
9
  require 'logger'
10
+ require 'yaml'
10
11
 
11
12
  ##
12
13
  # The Html2rss namespace.
@@ -98,13 +99,13 @@ module Html2rss
98
99
  # No need for a "feed config".
99
100
  #
100
101
  # @param url [String] the URL to automatically source the feed from
102
+ # @param strategy [Symbol] the request strategy to use
101
103
  # @return [RSS::Rss]
102
- def self.auto_source(url)
103
- url = Addressable::URI.parse(url)
104
-
105
- response = Html2rss::Utils.request_url(url)
104
+ def self.auto_source(url, strategy: :faraday)
105
+ ctx = RequestService::Context.new(url:, headers: {})
106
+ response = RequestService.execute(ctx, strategy:)
106
107
 
107
- Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
108
+ Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
108
109
  end
109
110
 
110
111
  private_class_method :find_feed_config
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.0
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-08 00:00:00.000000000 Z
11
+ date: 2024-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -120,6 +120,20 @@ dependencies:
120
120
  - - ">="
121
121
  - !ruby/object:Gem::Version
122
122
  version: '0'
123
+ - !ruby/object:Gem::Dependency
124
+ name: puppeteer-ruby
125
+ requirement: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ type: :runtime
131
+ prerelease: false
132
+ version_requirements: !ruby/object:Gem::Requirement
133
+ requirements:
134
+ - - ">="
135
+ - !ruby/object:Gem::Version
136
+ version: '0'
123
137
  - !ruby/object:Gem::Dependency
124
138
  name: regexp_parser
125
139
  requirement: !ruby/object:Gem::Requirement
@@ -140,14 +154,14 @@ dependencies:
140
154
  requirements:
141
155
  - - "~>"
142
156
  - !ruby/object:Gem::Version
143
- version: '2.0'
157
+ version: '3.0'
144
158
  type: :runtime
145
159
  prerelease: false
146
160
  version_requirements: !ruby/object:Gem::Requirement
147
161
  requirements:
148
162
  - - "~>"
149
163
  - !ruby/object:Gem::Version
150
- version: '2.0'
164
+ version: '3.0'
151
165
  - !ruby/object:Gem::Dependency
152
166
  name: rss
153
167
  requirement: !ruby/object:Gem::Requirement
@@ -208,16 +222,16 @@ dependencies:
208
222
  name: zeitwerk
209
223
  requirement: !ruby/object:Gem::Requirement
210
224
  requirements:
211
- - - ">="
225
+ - - "~>"
212
226
  - !ruby/object:Gem::Version
213
- version: '0'
227
+ version: 2.6.0
214
228
  type: :runtime
215
229
  prerelease: false
216
230
  version_requirements: !ruby/object:Gem::Requirement
217
231
  requirements:
218
- - - ">="
232
+ - - "~>"
219
233
  - !ruby/object:Gem::Version
220
- version: '0'
234
+ version: 2.6.0
221
235
  description: Supports JSON content, custom HTTP headers, and post-processing of extracted
222
236
  content.
223
237
  email:
@@ -251,8 +265,11 @@ files:
251
265
  - lib/html2rss/auto_source/reducer.rb
252
266
  - lib/html2rss/auto_source/rss_builder.rb
253
267
  - lib/html2rss/auto_source/scraper.rb
268
+ - lib/html2rss/auto_source/scraper/html.rb
254
269
  - lib/html2rss/auto_source/scraper/schema.rb
255
- - lib/html2rss/auto_source/scraper/schema/base.rb
270
+ - lib/html2rss/auto_source/scraper/schema/item_list.rb
271
+ - lib/html2rss/auto_source/scraper/schema/list_item.rb
272
+ - lib/html2rss/auto_source/scraper/schema/thing.rb
256
273
  - lib/html2rss/auto_source/scraper/semantic_html.rb
257
274
  - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
258
275
  - lib/html2rss/auto_source/scraper/semantic_html/image.rb
@@ -268,6 +285,13 @@ files:
268
285
  - lib/html2rss/item_extractors/static.rb
269
286
  - lib/html2rss/item_extractors/text.rb
270
287
  - lib/html2rss/object_to_xml_converter.rb
288
+ - lib/html2rss/request_service.rb
289
+ - lib/html2rss/request_service/browserless_strategy.rb
290
+ - lib/html2rss/request_service/context.rb
291
+ - lib/html2rss/request_service/faraday_strategy.rb
292
+ - lib/html2rss/request_service/puppet_commander.rb
293
+ - lib/html2rss/request_service/response.rb
294
+ - lib/html2rss/request_service/strategy.rb
271
295
  - lib/html2rss/rss_builder.rb
272
296
  - lib/html2rss/rss_builder/channel.rb
273
297
  - lib/html2rss/rss_builder/item.rb
@@ -279,7 +303,7 @@ licenses:
279
303
  - MIT
280
304
  metadata:
281
305
  allowed_push_host: https://rubygems.org
282
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
306
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
283
307
  rubygems_mfa_required: 'true'
284
308
  post_install_message:
285
309
  rdoc_options: []
@@ -296,7 +320,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
296
320
  - !ruby/object:Gem::Version
297
321
  version: '0'
298
322
  requirements: []
299
- rubygems_version: 3.5.16
323
+ rubygems_version: 3.5.22
300
324
  signing_key:
301
325
  specification_version: 4
302
326
  summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
@@ -1,61 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'date'
4
-
5
- module Html2rss
6
- class AutoSource
7
- module Scraper
8
- class Schema
9
- ##
10
- # Base class for Schema.org schema_objects.
11
- #
12
- # @see https://schema.org/Article
13
- class Base
14
- DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
15
-
16
- def initialize(schema_object, url:)
17
- @schema_object = schema_object
18
- @url = url
19
- end
20
-
21
- # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
22
- def call
23
- DEFAULT_ATTRIBUTES.to_h do |attribute|
24
- [attribute, public_send(attribute)]
25
- end
26
- end
27
-
28
- def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
29
- def title = schema_object[:title]
30
-
31
- def description
32
- [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
33
- .max_by { |desc| desc.to_s.size }
34
- end
35
-
36
- # @return [Addressable::URI, nil] the URL of the schema object
37
- def url
38
- url = schema_object[:url]
39
- if url.to_s.empty?
40
- Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
41
- return
42
- end
43
-
44
- Utils.build_absolute_url_from_relative(url, @url)
45
- end
46
-
47
- def image = images.first || nil
48
- def published_at = schema_object[:datePublished]
49
-
50
- private
51
-
52
- attr_reader :schema_object
53
-
54
- def images
55
- Array(schema_object[:image]).compact
56
- end
57
- end
58
- end
59
- end
60
- end
61
- end