html2rss 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +112 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +21 -12
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +5 -5
- metadata +31 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
  class RequestService
    ##
    # Result object that request strategies hand back to their caller.
    # It carries the fetched body together with the response headers.
    class Response
      # @return [String] the body of the response
      attr_reader :body

      # @return [Hash<String, Object>] the headers of the response, keyed by String
      attr_reader :headers

      ##
      # @param body [String] the body of the response
      # @param headers [Hash] the headers of the response; keys are converted with #to_s
      def initialize(body:, headers: {})
        @body = body
        # Stringify the keys on a copy, so the hash passed in by the caller stays untouched.
        @headers = headers.dup.tap { |copy| copy.transform_keys!(&:to_s) }
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
  class RequestService
    ##
    # Abstract base class describing what every request strategy must provide:
    # a constructor taking the request context and an #execute method.
    class Strategy
      ##
      # @param ctx [Context] the context for the request
      def initialize(ctx)
        @ctx = ctx
      end

      ##
      # Executes the request. Concrete strategies override this method.
      # @return [Response] the response from the strategy
      # @raise [NotImplementedError] if the method is not implemented by the subclass
      def execute = raise(NotImplementedError, 'Subclass must implement #execute')

      # @return [Context] the context for the request (only visible to the strategy itself)
      private attr_reader :ctx
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'forwardable'
|
5
|
+
|
6
|
+
module Html2rss
  ##
  # Requests website URLs to retrieve their HTML for further processing.
  # Provides strategies, e.g. to integrate Browserless.io.
  #
  # The service is a Singleton. The class-level methods listed in the
  # `class << self` block simply forward to the single instance.
  class RequestService
    include Singleton

    class UnknownStrategy < Html2rss::Error; end
    class InvalidUrl < Html2rss::Error; end
    class UnsupportedUrlScheme < Html2rss::Error; end

    class << self
      extend Forwardable

      def_delegators :instance,
                     :default_strategy_name,
                     :default_strategy_name=,
                     :strategy_names,
                     :register_strategy,
                     :unregister_strategy,
                     :strategy_registered?,
                     :execute
    end

    def initialize
      # Registry of strategy name (Symbol) => strategy class.
      @strategies = {
        faraday: FaradayStrategy,
        browserless: BrowserlessStrategy
      }
      @default_strategy_name = :faraday
    end

    # @return [Symbol] the default strategy name
    attr_reader :default_strategy_name

    ##
    # Sets the default strategy.
    # @param strategy [Symbol, String] the name of the strategy
    # @raise [UnknownStrategy] if the strategy is not registered
    def default_strategy_name=(strategy)
      raise UnknownStrategy unless strategy_registered?(strategy)

      @default_strategy_name = strategy.to_sym
    end

    # @return [Array<String>] the names of the registered strategies
    def strategy_names = @strategies.keys.map(&:to_s)

    ##
    # Registers a new strategy, replacing any strategy already stored under the same name.
    # @param name [Symbol, String] the name of the strategy
    # @param strategy_class [Class] the class of the strategy
    # @raise [ArgumentError] if strategy_class is not a Class
    def register_strategy(name, strategy_class)
      raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)

      @strategies[name.to_sym] = strategy_class
    end

    ##
    # Checks if a strategy is registered.
    # @param name [Symbol, String] the name of the strategy
    # @return [Boolean] true if the strategy is registered, false otherwise
    def strategy_registered?(name)
      @strategies.key?(name.to_sym)
    end

    ##
    # Unregisters a strategy.
    # @param name [Symbol, String] the name of the strategy
    # @return [Boolean] true if the strategy was unregistered, false otherwise
    # @raise [ArgumentError] if name is the current default strategy
    def unregister_strategy(name)
      raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name

      !!@strategies.delete(name.to_sym)
    end

    ##
    # Executes the request.
    # @param ctx [Context] the context for the request
    # @param strategy [Symbol, String] the strategy to use
    # @return [Response] the response from the strategy
    # @raise [UnknownStrategy] if the strategy is not known
    def execute(ctx, strategy: default_strategy_name)
      # The registry is keyed by Symbol. Normalise the name the same way the
      # other registry methods do, so that a String (as returned by
      # #strategy_names) resolves too. Values without #to_sym (e.g. nil) are
      # passed through unchanged and end up in the UnknownStrategy branch.
      key = strategy.respond_to?(:to_sym) ? strategy.to_sym : strategy

      strategy_class = @strategies.fetch(key) do
        raise UnknownStrategy,
              "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
      end
      strategy_class.new(ctx).execute
    end
  end
end
|
@@ -47,6 +47,13 @@ module Html2rss
|
|
47
47
|
@media = media
|
48
48
|
end
|
49
49
|
attr_reader :href, :type, :media
|
50
|
+
|
51
|
+
# Renders this stylesheet as an xml-stylesheet processing instruction.
# The attribute values are inserted as they are and the line ends with a newline.
#
# @return [String] the XML representation of the stylesheet
def to_xml
  %(<?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>\n)
end
|
50
57
|
end
|
51
58
|
end
|
52
59
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable/uri'
|
4
|
-
require 'faraday'
|
5
|
-
require 'faraday/follow_redirects'
|
6
4
|
require 'json'
|
7
5
|
require 'regexp_parser'
|
8
6
|
require 'tzinfo'
|
@@ -15,11 +13,10 @@ module Html2rss
|
|
15
13
|
module Utils
|
16
14
|
##
|
17
15
|
# @param url [String, Addressable::URI]
|
18
|
-
# @param base_url [String]
|
16
|
+
# @param base_url [String, Addressable::URI]
|
19
17
|
# @return [Addressable::URI]
|
20
18
|
def self.build_absolute_url_from_relative(url, base_url)
|
21
|
-
url = Addressable::URI.parse(url
|
22
|
-
|
19
|
+
url = Addressable::URI.parse(url)
|
23
20
|
return url if url.absolute?
|
24
21
|
|
25
22
|
base_uri = Addressable::URI.parse(base_url)
|
@@ -59,31 +56,31 @@ module Html2rss
|
|
59
56
|
end
|
60
57
|
|
61
58
|
##
|
62
|
-
# Builds a titleized representation of the URL.
|
63
|
-
# @param url [
|
59
|
+
# Builds a titleized representation of the URL with prefixed host.
|
60
|
+
# @param url [Addressable::URI]
|
64
61
|
# @return [String]
|
65
|
-
def self.
|
66
|
-
|
67
|
-
host =
|
62
|
+
def self.titleized_channel_url(url)
|
63
|
+
nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
|
64
|
+
host = url.host
|
68
65
|
|
69
|
-
nicer_path = uri.path.split('/').reject(&:empty?)
|
70
66
|
nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
|
71
67
|
end
|
72
68
|
|
73
69
|
##
|
74
|
-
#
|
75
|
-
# @param
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
70
|
+
# Builds a titleized representation of the URL's path.
#
# The path is URI-unescaped and split on '/'. Every character that is not an
# ASCII letter, digit or dot acts as a word separator, and each resulting
# word is capitalized. The file extension is removed from the last path
# segment only, so dots in earlier segments (e.g. "/v1.2/notes") no longer
# cut off the rest of the title.
#
# @param url [Addressable::URI]
# @return [String] the title, or an empty String when the path has no segments
def self.titleized_url(url)
  segments = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
  return '' if segments.empty?

  # Only the final segment can carry a file extension.
  segments[-1] = File.basename(segments.last, '.*')

  segments.flat_map { |segment| segment.gsub(/[^a-zA-Z0-9.]/, ' ').split }
          .map(&:capitalize)
          .join(' ')
end
|
88
85
|
|
89
86
|
##
|
@@ -104,10 +101,10 @@ module Html2rss
|
|
104
101
|
##
|
105
102
|
# Guesses the content type based on the file extension of the URL.
|
106
103
|
#
|
107
|
-
# @param url [
|
104
|
+
# @param url [Addressable::URI]
|
108
105
|
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
109
106
|
def self.guess_content_type_from_url(url)
|
110
|
-
url = url.
|
107
|
+
url = url.path.split('?').first
|
111
108
|
|
112
109
|
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
113
110
|
content_type.first&.to_s || 'application/octet-stream'
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -99,13 +99,13 @@ module Html2rss
|
|
99
99
|
# No need for a "feed config".
|
100
100
|
#
|
101
101
|
# @param url [String] the URL to automatically source the feed from
|
102
|
+
# @param strategy [Symbol] the request strategy to use
|
102
103
|
# @return [RSS::Rss]
|
103
|
-
def self.auto_source(url)
|
104
|
-
|
104
|
+
def self.auto_source(url, strategy: :faraday)
|
105
|
+
ctx = RequestService::Context.new(url:, headers: {})
|
106
|
+
response = RequestService.execute(ctx, strategy:)
|
105
107
|
|
106
|
-
|
107
|
-
|
108
|
-
Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
|
108
|
+
Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
|
109
109
|
end
|
110
110
|
|
111
111
|
private_class_method :find_feed_config
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.17.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
|
-
autorequire:
|
9
8
|
bindir: exe
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 2025-01-18 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: addressable
|
@@ -120,6 +119,20 @@ dependencies:
|
|
120
119
|
- - ">="
|
121
120
|
- !ruby/object:Gem::Version
|
122
121
|
version: '0'
|
122
|
+
- !ruby/object:Gem::Dependency
|
123
|
+
name: puppeteer-ruby
|
124
|
+
requirement: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - ">="
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
type: :runtime
|
130
|
+
prerelease: false
|
131
|
+
version_requirements: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
123
136
|
- !ruby/object:Gem::Dependency
|
124
137
|
name: regexp_parser
|
125
138
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,14 +153,14 @@ dependencies:
|
|
140
153
|
requirements:
|
141
154
|
- - "~>"
|
142
155
|
- !ruby/object:Gem::Version
|
143
|
-
version: '
|
156
|
+
version: '3.0'
|
144
157
|
type: :runtime
|
145
158
|
prerelease: false
|
146
159
|
version_requirements: !ruby/object:Gem::Requirement
|
147
160
|
requirements:
|
148
161
|
- - "~>"
|
149
162
|
- !ruby/object:Gem::Version
|
150
|
-
version: '
|
163
|
+
version: '3.0'
|
151
164
|
- !ruby/object:Gem::Dependency
|
152
165
|
name: rss
|
153
166
|
requirement: !ruby/object:Gem::Requirement
|
@@ -253,7 +266,9 @@ files:
|
|
253
266
|
- lib/html2rss/auto_source/scraper.rb
|
254
267
|
- lib/html2rss/auto_source/scraper/html.rb
|
255
268
|
- lib/html2rss/auto_source/scraper/schema.rb
|
256
|
-
- lib/html2rss/auto_source/scraper/schema/
|
269
|
+
- lib/html2rss/auto_source/scraper/schema/item_list.rb
|
270
|
+
- lib/html2rss/auto_source/scraper/schema/list_item.rb
|
271
|
+
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
257
272
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
258
273
|
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
259
274
|
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
@@ -269,6 +284,13 @@ files:
|
|
269
284
|
- lib/html2rss/item_extractors/static.rb
|
270
285
|
- lib/html2rss/item_extractors/text.rb
|
271
286
|
- lib/html2rss/object_to_xml_converter.rb
|
287
|
+
- lib/html2rss/request_service.rb
|
288
|
+
- lib/html2rss/request_service/browserless_strategy.rb
|
289
|
+
- lib/html2rss/request_service/context.rb
|
290
|
+
- lib/html2rss/request_service/faraday_strategy.rb
|
291
|
+
- lib/html2rss/request_service/puppet_commander.rb
|
292
|
+
- lib/html2rss/request_service/response.rb
|
293
|
+
- lib/html2rss/request_service/strategy.rb
|
272
294
|
- lib/html2rss/rss_builder.rb
|
273
295
|
- lib/html2rss/rss_builder/channel.rb
|
274
296
|
- lib/html2rss/rss_builder/item.rb
|
@@ -280,9 +302,8 @@ licenses:
|
|
280
302
|
- MIT
|
281
303
|
metadata:
|
282
304
|
allowed_push_host: https://rubygems.org
|
283
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
305
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.17.0
|
284
306
|
rubygems_mfa_required: 'true'
|
285
|
-
post_install_message:
|
286
307
|
rdoc_options: []
|
287
308
|
require_paths:
|
288
309
|
- lib
|
@@ -290,15 +311,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
290
311
|
requirements:
|
291
312
|
- - ">="
|
292
313
|
- !ruby/object:Gem::Version
|
293
|
-
version: '3.
|
314
|
+
version: '3.2'
|
294
315
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
295
316
|
requirements:
|
296
317
|
- - ">="
|
297
318
|
- !ruby/object:Gem::Version
|
298
319
|
version: '0'
|
299
320
|
requirements: []
|
300
|
-
rubygems_version: 3.
|
301
|
-
signing_key:
|
321
|
+
rubygems_version: 3.6.2
|
302
322
|
specification_version: 4
|
303
323
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
304
324
|
to extract item.
|
@@ -1,61 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'date'
|
4
|
-
|
5
|
-
module Html2rss
|
6
|
-
class AutoSource
|
7
|
-
module Scraper
|
8
|
-
class Schema
|
9
|
-
##
|
10
|
-
# Base class for Schema.org schema_objects.
|
11
|
-
#
|
12
|
-
# @see https://schema.org/Article
|
13
|
-
class Base
|
14
|
-
DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
|
15
|
-
|
16
|
-
def initialize(schema_object, url:)
|
17
|
-
@schema_object = schema_object
|
18
|
-
@url = url
|
19
|
-
end
|
20
|
-
|
21
|
-
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
22
|
-
def call
|
23
|
-
DEFAULT_ATTRIBUTES.to_h do |attribute|
|
24
|
-
[attribute, public_send(attribute)]
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
|
29
|
-
def title = schema_object[:title]
|
30
|
-
|
31
|
-
def description
|
32
|
-
[schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
|
33
|
-
.max_by { |desc| desc.to_s.size }
|
34
|
-
end
|
35
|
-
|
36
|
-
# @return [Addressable::URI, nil] the URL of the schema object
|
37
|
-
def url
|
38
|
-
url = schema_object[:url]
|
39
|
-
if url.to_s.empty?
|
40
|
-
Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
|
41
|
-
return
|
42
|
-
end
|
43
|
-
|
44
|
-
Utils.build_absolute_url_from_relative(url, @url)
|
45
|
-
end
|
46
|
-
|
47
|
-
def image = images.first || nil
|
48
|
-
def published_at = schema_object[:datePublished]
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
attr_reader :schema_object
|
53
|
-
|
54
|
-
def images
|
55
|
-
Array(schema_object[:image]).compact
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|