html2rss 0.14.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +113 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +96 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +51 -38
- data/lib/html2rss/auto_source/scraper.rb +1 -0
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +7 -6
- metadata +35 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class RequestService
|
5
|
+
##
|
6
|
+
# Defines the interface of every request strategy.
|
7
|
+
class Strategy
|
8
|
+
##
|
9
|
+
# @param ctx [Context] the context for the request
|
10
|
+
def initialize(ctx)
|
11
|
+
@ctx = ctx
|
12
|
+
end
|
13
|
+
|
14
|
+
##
|
15
|
+
# Executes the request.
|
16
|
+
# @return [Response] the response from the strategy
|
17
|
+
# @raise [NotImplementedError] if the method is not implemented by the subclass
|
18
|
+
def execute
|
19
|
+
raise NotImplementedError, 'Subclass must implement #execute'
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
23
|
+
|
24
|
+
# @return [Context] the context for the request
|
25
|
+
attr_reader :ctx
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'forwardable'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
##
|
8
|
+
# Requests website URLs to retrieve their HTML for further processing.
|
9
|
+
# Provides strategies, i.e. to integrate Browserless.io.
|
10
|
+
class RequestService
|
11
|
+
include Singleton
|
12
|
+
|
13
|
+
class UnknownStrategy < Html2rss::Error; end
|
14
|
+
class InvalidUrl < Html2rss::Error; end
|
15
|
+
class UnsupportedUrlScheme < Html2rss::Error; end
|
16
|
+
|
17
|
+
class << self
|
18
|
+
extend Forwardable
|
19
|
+
|
20
|
+
%i[default_strategy_name
|
21
|
+
default_strategy_name=
|
22
|
+
strategy_names
|
23
|
+
register_strategy
|
24
|
+
unregister_strategy
|
25
|
+
strategy_registered?
|
26
|
+
execute].each do |method|
|
27
|
+
def_delegator :instance, method
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def initialize
|
32
|
+
@strategies = {
|
33
|
+
faraday: FaradayStrategy,
|
34
|
+
browserless: BrowserlessStrategy
|
35
|
+
}
|
36
|
+
@default_strategy_name = :faraday
|
37
|
+
end
|
38
|
+
|
39
|
+
# @return [Symbol] the default strategy name
|
40
|
+
attr_reader :default_strategy_name
|
41
|
+
|
42
|
+
##
|
43
|
+
# Sets the default strategy.
|
44
|
+
# @param strategy [Symbol] the name of the strategy
|
45
|
+
# @raise [UnknownStrategy] if the strategy is not registered
|
46
|
+
def default_strategy_name=(strategy)
|
47
|
+
raise UnknownStrategy unless strategy_registered?(strategy)
|
48
|
+
|
49
|
+
@default_strategy_name = strategy.to_sym
|
50
|
+
end
|
51
|
+
|
52
|
+
# @return [Array<String>] the names of the registered strategies
|
53
|
+
def strategy_names = @strategies.keys.map(&:to_s)
|
54
|
+
|
55
|
+
##
|
56
|
+
# Registers a new strategy.
|
57
|
+
# @param name [Symbol] the name of the strategy
|
58
|
+
# @param strategy_class [Class] the class of the strategy
|
59
|
+
def register_strategy(name, strategy_class)
|
60
|
+
raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
|
61
|
+
|
62
|
+
@strategies[name.to_sym] = strategy_class
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Checks if a strategy is registered.
|
67
|
+
# @param name [Symbol] the name of the strategy
|
68
|
+
# @return [Boolean] true if the strategy is registered, false otherwise
|
69
|
+
def strategy_registered?(name)
|
70
|
+
@strategies.key?(name.to_sym)
|
71
|
+
end
|
72
|
+
|
73
|
+
##
|
74
|
+
# Unregisters a strategy.
|
75
|
+
# @param name [Symbol] the name of the strategy
|
76
|
+
# @return [Boolean] true if the strategy was unregistered, false otherwise
|
77
|
+
def unregister_strategy(name)
|
78
|
+
raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
|
79
|
+
|
80
|
+
!!@strategies.delete(name.to_sym)
|
81
|
+
end
|
82
|
+
|
83
|
+
##
|
84
|
+
# Executes the request.
|
85
|
+
# @param ctx [Context] the context for the request
|
86
|
+
# @param strategy [Symbol] the strategy to use
|
87
|
+
# @return [Response] the response from the strategy
|
88
|
+
# @raise [UnknownStrategy] if the strategy is not known
|
89
|
+
def execute(ctx, strategy: default_strategy_name)
|
90
|
+
strategy_class = @strategies.fetch(strategy) do
|
91
|
+
raise UnknownStrategy,
|
92
|
+
"The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
|
93
|
+
end
|
94
|
+
strategy_class.new(ctx).execute
|
95
|
+
end
|
96
|
+
end
|
97
|
+
end
|
@@ -47,6 +47,13 @@ module Html2rss
|
|
47
47
|
@media = media
|
48
48
|
end
|
49
49
|
attr_reader :href, :type, :media
|
50
|
+
|
51
|
+
# @return [String] the XML representation of the stylesheet
|
52
|
+
def to_xml
|
53
|
+
<<~XML
|
54
|
+
<?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>
|
55
|
+
XML
|
56
|
+
end
|
50
57
|
end
|
51
58
|
end
|
52
59
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable/uri'
|
4
|
-
require 'faraday'
|
5
|
-
require 'faraday/follow_redirects'
|
6
4
|
require 'json'
|
7
5
|
require 'regexp_parser'
|
8
6
|
require 'tzinfo'
|
@@ -15,11 +13,10 @@ module Html2rss
|
|
15
13
|
module Utils
|
16
14
|
##
|
17
15
|
# @param url [String, Addressable::URI]
|
18
|
-
# @param base_url [String]
|
16
|
+
# @param base_url [String, Addressable::URI]
|
19
17
|
# @return [Addressable::URI]
|
20
18
|
def self.build_absolute_url_from_relative(url, base_url)
|
21
|
-
url = Addressable::URI.parse(url
|
22
|
-
|
19
|
+
url = Addressable::URI.parse(url)
|
23
20
|
return url if url.absolute?
|
24
21
|
|
25
22
|
base_uri = Addressable::URI.parse(base_url)
|
@@ -59,31 +56,31 @@ module Html2rss
|
|
59
56
|
end
|
60
57
|
|
61
58
|
##
|
62
|
-
# Builds a titleized representation of the URL.
|
63
|
-
# @param url [
|
59
|
+
# Builds a titleized representation of the URL with prefixed host.
|
60
|
+
# @param url [Addressable::URI]
|
64
61
|
# @return [String]
|
65
|
-
def self.
|
66
|
-
|
67
|
-
host =
|
62
|
+
def self.titleized_channel_url(url)
|
63
|
+
nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
|
64
|
+
host = url.host
|
68
65
|
|
69
|
-
nicer_path = uri.path.split('/').reject(&:empty?)
|
70
66
|
nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
|
71
67
|
end
|
72
68
|
|
73
69
|
##
|
74
|
-
#
|
75
|
-
# @param
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
70
|
+
# Builds a titleized representation of the URL.
|
71
|
+
# @param url [Addressable::URI]
|
72
|
+
# @return [String]
|
73
|
+
def self.titleized_url(url)
|
74
|
+
return '' if url.path.empty?
|
75
|
+
|
76
|
+
nicer_path = CGI.unescapeURIComponent(url.path)
|
77
|
+
.split('/')
|
78
|
+
.flat_map do |part|
|
79
|
+
part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
|
80
|
+
end
|
81
|
+
|
82
|
+
nicer_path.map!(&:capitalize)
|
83
|
+
File.basename nicer_path.join(' '), '.*'
|
87
84
|
end
|
88
85
|
|
89
86
|
##
|
@@ -104,10 +101,10 @@ module Html2rss
|
|
104
101
|
##
|
105
102
|
# Guesses the content type based on the file extension of the URL.
|
106
103
|
#
|
107
|
-
# @param url [
|
104
|
+
# @param url [Addressable::URI]
|
108
105
|
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
109
106
|
def self.guess_content_type_from_url(url)
|
110
|
-
url = url.
|
107
|
+
url = url.path.split('?').first
|
111
108
|
|
112
109
|
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
113
110
|
content_type.first&.to_s || 'application/octet-stream'
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -5,8 +5,9 @@ require 'zeitwerk'
|
|
5
5
|
loader = Zeitwerk::Loader.for_gem
|
6
6
|
loader.setup
|
7
7
|
|
8
|
-
require '
|
8
|
+
require 'addressable'
|
9
9
|
require 'logger'
|
10
|
+
require 'yaml'
|
10
11
|
|
11
12
|
##
|
12
13
|
# The Html2rss namespace.
|
@@ -98,13 +99,13 @@ module Html2rss
|
|
98
99
|
# No need for a "feed config".
|
99
100
|
#
|
100
101
|
# @param url [String] the URL to automatically source the feed from
|
102
|
+
# @param strategy [Symbol] the request strategy to use
|
101
103
|
# @return [RSS::Rss]
|
102
|
-
def self.auto_source(url)
|
103
|
-
|
104
|
-
|
105
|
-
response = Html2rss::Utils.request_url(url)
|
104
|
+
def self.auto_source(url, strategy: :faraday)
|
105
|
+
ctx = RequestService::Context.new(url:, headers: {})
|
106
|
+
response = RequestService.execute(ctx, strategy:)
|
106
107
|
|
107
|
-
Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
|
108
|
+
Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
|
108
109
|
end
|
109
110
|
|
110
111
|
private_class_method :find_feed_config
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -120,6 +120,20 @@ dependencies:
|
|
120
120
|
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: '0'
|
123
|
+
- !ruby/object:Gem::Dependency
|
124
|
+
name: puppeteer-ruby
|
125
|
+
requirement: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
type: :runtime
|
131
|
+
prerelease: false
|
132
|
+
version_requirements: !ruby/object:Gem::Requirement
|
133
|
+
requirements:
|
134
|
+
- - ">="
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '0'
|
123
137
|
- !ruby/object:Gem::Dependency
|
124
138
|
name: regexp_parser
|
125
139
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,14 +154,14 @@ dependencies:
|
|
140
154
|
requirements:
|
141
155
|
- - "~>"
|
142
156
|
- !ruby/object:Gem::Version
|
143
|
-
version: '
|
157
|
+
version: '3.0'
|
144
158
|
type: :runtime
|
145
159
|
prerelease: false
|
146
160
|
version_requirements: !ruby/object:Gem::Requirement
|
147
161
|
requirements:
|
148
162
|
- - "~>"
|
149
163
|
- !ruby/object:Gem::Version
|
150
|
-
version: '
|
164
|
+
version: '3.0'
|
151
165
|
- !ruby/object:Gem::Dependency
|
152
166
|
name: rss
|
153
167
|
requirement: !ruby/object:Gem::Requirement
|
@@ -208,16 +222,16 @@ dependencies:
|
|
208
222
|
name: zeitwerk
|
209
223
|
requirement: !ruby/object:Gem::Requirement
|
210
224
|
requirements:
|
211
|
-
- - "
|
225
|
+
- - "~>"
|
212
226
|
- !ruby/object:Gem::Version
|
213
|
-
version:
|
227
|
+
version: 2.6.0
|
214
228
|
type: :runtime
|
215
229
|
prerelease: false
|
216
230
|
version_requirements: !ruby/object:Gem::Requirement
|
217
231
|
requirements:
|
218
|
-
- - "
|
232
|
+
- - "~>"
|
219
233
|
- !ruby/object:Gem::Version
|
220
|
-
version:
|
234
|
+
version: 2.6.0
|
221
235
|
description: Supports JSON content, custom HTTP headers, and post-processing of extracted
|
222
236
|
content.
|
223
237
|
email:
|
@@ -251,8 +265,11 @@ files:
|
|
251
265
|
- lib/html2rss/auto_source/reducer.rb
|
252
266
|
- lib/html2rss/auto_source/rss_builder.rb
|
253
267
|
- lib/html2rss/auto_source/scraper.rb
|
268
|
+
- lib/html2rss/auto_source/scraper/html.rb
|
254
269
|
- lib/html2rss/auto_source/scraper/schema.rb
|
255
|
-
- lib/html2rss/auto_source/scraper/schema/
|
270
|
+
- lib/html2rss/auto_source/scraper/schema/item_list.rb
|
271
|
+
- lib/html2rss/auto_source/scraper/schema/list_item.rb
|
272
|
+
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
256
273
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
257
274
|
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
258
275
|
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
@@ -268,6 +285,13 @@ files:
|
|
268
285
|
- lib/html2rss/item_extractors/static.rb
|
269
286
|
- lib/html2rss/item_extractors/text.rb
|
270
287
|
- lib/html2rss/object_to_xml_converter.rb
|
288
|
+
- lib/html2rss/request_service.rb
|
289
|
+
- lib/html2rss/request_service/browserless_strategy.rb
|
290
|
+
- lib/html2rss/request_service/context.rb
|
291
|
+
- lib/html2rss/request_service/faraday_strategy.rb
|
292
|
+
- lib/html2rss/request_service/puppet_commander.rb
|
293
|
+
- lib/html2rss/request_service/response.rb
|
294
|
+
- lib/html2rss/request_service/strategy.rb
|
271
295
|
- lib/html2rss/rss_builder.rb
|
272
296
|
- lib/html2rss/rss_builder/channel.rb
|
273
297
|
- lib/html2rss/rss_builder/item.rb
|
@@ -279,7 +303,7 @@ licenses:
|
|
279
303
|
- MIT
|
280
304
|
metadata:
|
281
305
|
allowed_push_host: https://rubygems.org
|
282
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
306
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
|
283
307
|
rubygems_mfa_required: 'true'
|
284
308
|
post_install_message:
|
285
309
|
rdoc_options: []
|
@@ -296,7 +320,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
296
320
|
- !ruby/object:Gem::Version
|
297
321
|
version: '0'
|
298
322
|
requirements: []
|
299
|
-
rubygems_version: 3.5.
|
323
|
+
rubygems_version: 3.5.22
|
300
324
|
signing_key:
|
301
325
|
specification_version: 4
|
302
326
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
@@ -1,61 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'date'
|
4
|
-
|
5
|
-
module Html2rss
|
6
|
-
class AutoSource
|
7
|
-
module Scraper
|
8
|
-
class Schema
|
9
|
-
##
|
10
|
-
# Base class for Schema.org schema_objects.
|
11
|
-
#
|
12
|
-
# @see https://schema.org/Article
|
13
|
-
class Base
|
14
|
-
DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
|
15
|
-
|
16
|
-
def initialize(schema_object, url:)
|
17
|
-
@schema_object = schema_object
|
18
|
-
@url = url
|
19
|
-
end
|
20
|
-
|
21
|
-
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
22
|
-
def call
|
23
|
-
DEFAULT_ATTRIBUTES.to_h do |attribute|
|
24
|
-
[attribute, public_send(attribute)]
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
|
29
|
-
def title = schema_object[:title]
|
30
|
-
|
31
|
-
def description
|
32
|
-
[schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
|
33
|
-
.max_by { |desc| desc.to_s.size }
|
34
|
-
end
|
35
|
-
|
36
|
-
# @return [Addressable::URI, nil] the URL of the schema object
|
37
|
-
def url
|
38
|
-
url = schema_object[:url]
|
39
|
-
if url.to_s.empty?
|
40
|
-
Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
|
41
|
-
return
|
42
|
-
end
|
43
|
-
|
44
|
-
Utils.build_absolute_url_from_relative(url, @url)
|
45
|
-
end
|
46
|
-
|
47
|
-
def image = images.first || nil
|
48
|
-
def published_at = schema_object[:datePublished]
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
attr_reader :schema_object
|
53
|
-
|
54
|
-
def images
|
55
|
-
Array(schema_object[:image]).compact
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|