html2rss 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +113 -44
- data/html2rss.gemspec +3 -2
- data/lib/html2rss/auto_source/article.rb +37 -5
- data/lib/html2rss/auto_source/channel.rb +21 -28
- data/lib/html2rss/auto_source/cleanup.rb +0 -16
- data/lib/html2rss/auto_source/rss_builder.rb +1 -1
- data/lib/html2rss/auto_source/scraper/html.rb +18 -9
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +39 -39
- data/lib/html2rss/auto_source.rb +0 -7
- data/lib/html2rss/cli.rb +11 -4
- data/lib/html2rss/config/channel.rb +7 -1
- data/lib/html2rss/config/selectors.rb +2 -1
- data/lib/html2rss/config.rb +1 -0
- data/lib/html2rss/item.rb +7 -2
- data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
- data/lib/html2rss/request_service/context.rb +46 -0
- data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
- data/lib/html2rss/request_service/puppet_commander.rb +61 -0
- data/lib/html2rss/request_service/response.rb +27 -0
- data/lib/html2rss/request_service/strategy.rb +28 -0
- data/lib/html2rss/request_service.rb +97 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
- data/lib/html2rss/utils.rb +23 -26
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +5 -5
- metadata +34 -11
- data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'singleton'
|
4
|
+
require 'forwardable'
|
5
|
+
|
6
|
+
module Html2rss
  ##
  # Requests website URLs to retrieve their HTML for further processing.
  # Provides pluggable request strategies, e.g. to integrate Browserless.io.
  #
  # The class is a Singleton. Every public instance method below is also
  # callable on the class itself (e.g. +RequestService.execute+); those calls
  # are delegated to the single instance.
  class RequestService
    include Singleton

    # Raised when a strategy name is not present in the registry.
    class UnknownStrategy < Html2rss::Error; end
    # Declared here for the request layer; not raised in this file.
    class InvalidUrl < Html2rss::Error; end
    # Declared here for the request layer; not raised in this file.
    class UnsupportedUrlScheme < Html2rss::Error; end

    class << self
      extend Forwardable

      # Class-level convenience API, forwarded to the singleton instance.
      def_delegators :instance,
                     :default_strategy_name,
                     :default_strategy_name=,
                     :strategy_names,
                     :register_strategy,
                     :unregister_strategy,
                     :strategy_registered?,
                     :execute
    end

    # Seeds the registry with the two built-in strategies and makes
    # +:faraday+ the default.
    def initialize
      @strategies = {
        faraday: FaradayStrategy,
        browserless: BrowserlessStrategy
      }
      @default_strategy_name = :faraday
    end

    # @return [Symbol] the default strategy name
    attr_reader :default_strategy_name

    ##
    # Sets the default strategy.
    # @param strategy [Symbol, String] the name of the strategy
    # @raise [UnknownStrategy] if the strategy is not registered
    def default_strategy_name=(strategy)
      raise UnknownStrategy, "The strategy '#{strategy}' is not registered" unless strategy_registered?(strategy)

      @default_strategy_name = strategy.to_sym
    end

    # @return [Array<String>] the names of the registered strategies
    def strategy_names = @strategies.keys.map(&:to_s)

    ##
    # Registers a new strategy, replacing any strategy already stored under the same name.
    # @param name [Symbol, String] the name of the strategy
    # @param strategy_class [Class] the class of the strategy
    # @raise [ArgumentError] if strategy_class is not a Class
    def register_strategy(name, strategy_class)
      raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)

      @strategies[name.to_sym] = strategy_class
    end

    ##
    # Checks if a strategy is registered.
    # @param name [Symbol, String] the name of the strategy
    # @return [Boolean] true if the strategy is registered, false otherwise
    def strategy_registered?(name)
      @strategies.key?(name.to_sym)
    end

    ##
    # Unregisters a strategy.
    # @param name [Symbol, String] the name of the strategy
    # @return [Boolean] true if the strategy was unregistered, false otherwise
    # @raise [ArgumentError] if name is the current default strategy
    def unregister_strategy(name)
      raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name

      !!@strategies.delete(name.to_sym)
    end

    ##
    # Executes the request.
    # @param ctx [Context] the context for the request
    # @param strategy [Symbol, String] the strategy to use
    # @return [Response] the response from the strategy
    # @raise [UnknownStrategy] if the strategy is not known
    def execute(ctx, strategy: default_strategy_name)
      # Registry keys are Symbols, while #strategy_names hands out Strings.
      # Normalise the name so both forms resolve, as the other methods do.
      # `to_s` first keeps a nil strategy on the UnknownStrategy path.
      strategy_class = @strategies.fetch(strategy.to_s.to_sym) do
        raise UnknownStrategy,
              "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
      end
      strategy_class.new(ctx).execute
    end
  end
end
|
@@ -47,6 +47,13 @@ module Html2rss
|
|
47
47
|
@media = media
|
48
48
|
end
|
49
49
|
attr_reader :href, :type, :media
|
50
|
+
|
51
|
+
# @return [String] the XML representation of the stylesheet
|
52
|
+
def to_xml
  # One xml-stylesheet processing instruction, terminated by a newline.
  # href, type and media are interpolated verbatim (no escaping is applied).
  %(<?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>\n)
end
|
50
57
|
end
|
51
58
|
end
|
52
59
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'addressable/uri'
|
4
|
-
require 'faraday'
|
5
|
-
require 'faraday/follow_redirects'
|
6
4
|
require 'json'
|
7
5
|
require 'regexp_parser'
|
8
6
|
require 'tzinfo'
|
@@ -15,11 +13,10 @@ module Html2rss
|
|
15
13
|
module Utils
|
16
14
|
##
|
17
15
|
# @param url [String, Addressable::URI]
|
18
|
-
# @param base_url [String]
|
16
|
+
# @param base_url [String, Addressable::URI]
|
19
17
|
# @return [Addressable::URI]
|
20
18
|
def self.build_absolute_url_from_relative(url, base_url)
|
21
|
-
url = Addressable::URI.parse(url
|
22
|
-
|
19
|
+
url = Addressable::URI.parse(url)
|
23
20
|
return url if url.absolute?
|
24
21
|
|
25
22
|
base_uri = Addressable::URI.parse(base_url)
|
@@ -59,31 +56,31 @@ module Html2rss
|
|
59
56
|
end
|
60
57
|
|
61
58
|
##
|
62
|
-
# Builds a titleized representation of the URL.
|
63
|
-
# @param url [
|
59
|
+
# Builds a titleized representation of the URL with prefixed host.
|
60
|
+
# @param url [Addressable::URI]
|
64
61
|
# @return [String]
|
65
|
-
def self.
|
66
|
-
|
67
|
-
host =
|
62
|
+
def self.titleized_channel_url(url)
  # Decode percent-escapes, then keep only the non-empty path segments.
  segments = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
  host = url.host
  # No path segments: the bare host is the title.
  return host if segments.empty?

  "#{host}: #{segments.map(&:capitalize).join(' ')}"
end
|
72
68
|
|
73
69
|
##
|
74
|
-
#
|
75
|
-
# @param
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
70
|
+
# Builds a titleized representation of the URL.
|
71
|
+
# @param url [Addressable::URI]
|
72
|
+
# @return [String]
|
73
|
+
def self.titleized_url(url)
  return '' if url.path.empty?

  # Per path segment, collect the runs of ASCII letters, digits and dots;
  # every other character acts as a word separator.
  words = CGI.unescapeURIComponent(url.path)
             .split('/')
             .flat_map { |segment| segment.scan(/[a-zA-Z0-9.]+/) }

  # '.*' drops a trailing file extension (e.g. ".html") from the joined title.
  File.basename(words.map(&:capitalize).join(' '), '.*')
end
|
88
85
|
|
89
86
|
##
|
@@ -104,10 +101,10 @@ module Html2rss
|
|
104
101
|
##
|
105
102
|
# Guesses the content type based on the file extension of the URL.
|
106
103
|
#
|
107
|
-
# @param url [
|
104
|
+
# @param url [Addressable::URI]
|
108
105
|
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
109
106
|
def self.guess_content_type_from_url(url)
|
110
|
-
url = url.
|
107
|
+
url = url.path.split('?').first
|
111
108
|
|
112
109
|
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
113
110
|
content_type.first&.to_s || 'application/octet-stream'
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -99,13 +99,13 @@ module Html2rss
|
|
99
99
|
# No need for a "feed config".
|
100
100
|
#
|
101
101
|
# @param url [String] the URL to automatically source the feed from
|
102
|
+
# @param strategy [Symbol] the request strategy to use
|
102
103
|
# @return [RSS::Rss]
|
103
|
-
def self.auto_source(url)
|
104
|
-
|
104
|
+
def self.auto_source(url, strategy: RequestService.default_strategy_name)
  # The default follows RequestService's configured default strategy
  # (:faraday unless it was changed) instead of a hard-coded name.
  ctx = RequestService::Context.new(url:, headers: {})
  response = RequestService.execute(ctx, strategy:)

  # Pass on ctx.url (not the raw argument), i.e. the URL as held by the request context.
  Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
end
|
110
110
|
|
111
111
|
private_class_method :find_feed_config
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -120,6 +120,20 @@ dependencies:
|
|
120
120
|
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
122
|
version: '0'
|
123
|
+
- !ruby/object:Gem::Dependency
|
124
|
+
name: puppeteer-ruby
|
125
|
+
requirement: !ruby/object:Gem::Requirement
|
126
|
+
requirements:
|
127
|
+
- - ">="
|
128
|
+
- !ruby/object:Gem::Version
|
129
|
+
version: '0'
|
130
|
+
type: :runtime
|
131
|
+
prerelease: false
|
132
|
+
version_requirements: !ruby/object:Gem::Requirement
|
133
|
+
requirements:
|
134
|
+
- - ">="
|
135
|
+
- !ruby/object:Gem::Version
|
136
|
+
version: '0'
|
123
137
|
- !ruby/object:Gem::Dependency
|
124
138
|
name: regexp_parser
|
125
139
|
requirement: !ruby/object:Gem::Requirement
|
@@ -140,14 +154,14 @@ dependencies:
|
|
140
154
|
requirements:
|
141
155
|
- - "~>"
|
142
156
|
- !ruby/object:Gem::Version
|
143
|
-
version: '
|
157
|
+
version: '3.0'
|
144
158
|
type: :runtime
|
145
159
|
prerelease: false
|
146
160
|
version_requirements: !ruby/object:Gem::Requirement
|
147
161
|
requirements:
|
148
162
|
- - "~>"
|
149
163
|
- !ruby/object:Gem::Version
|
150
|
-
version: '
|
164
|
+
version: '3.0'
|
151
165
|
- !ruby/object:Gem::Dependency
|
152
166
|
name: rss
|
153
167
|
requirement: !ruby/object:Gem::Requirement
|
@@ -208,16 +222,16 @@ dependencies:
|
|
208
222
|
name: zeitwerk
|
209
223
|
requirement: !ruby/object:Gem::Requirement
|
210
224
|
requirements:
|
211
|
-
- - "
|
225
|
+
- - "~>"
|
212
226
|
- !ruby/object:Gem::Version
|
213
|
-
version:
|
227
|
+
version: 2.6.0
|
214
228
|
type: :runtime
|
215
229
|
prerelease: false
|
216
230
|
version_requirements: !ruby/object:Gem::Requirement
|
217
231
|
requirements:
|
218
|
-
- - "
|
232
|
+
- - "~>"
|
219
233
|
- !ruby/object:Gem::Version
|
220
|
-
version:
|
234
|
+
version: 2.6.0
|
221
235
|
description: Supports JSON content, custom HTTP headers, and post-processing of extracted
|
222
236
|
content.
|
223
237
|
email:
|
@@ -253,7 +267,9 @@ files:
|
|
253
267
|
- lib/html2rss/auto_source/scraper.rb
|
254
268
|
- lib/html2rss/auto_source/scraper/html.rb
|
255
269
|
- lib/html2rss/auto_source/scraper/schema.rb
|
256
|
-
- lib/html2rss/auto_source/scraper/schema/
|
270
|
+
- lib/html2rss/auto_source/scraper/schema/item_list.rb
|
271
|
+
- lib/html2rss/auto_source/scraper/schema/list_item.rb
|
272
|
+
- lib/html2rss/auto_source/scraper/schema/thing.rb
|
257
273
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
258
274
|
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
259
275
|
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
@@ -269,6 +285,13 @@ files:
|
|
269
285
|
- lib/html2rss/item_extractors/static.rb
|
270
286
|
- lib/html2rss/item_extractors/text.rb
|
271
287
|
- lib/html2rss/object_to_xml_converter.rb
|
288
|
+
- lib/html2rss/request_service.rb
|
289
|
+
- lib/html2rss/request_service/browserless_strategy.rb
|
290
|
+
- lib/html2rss/request_service/context.rb
|
291
|
+
- lib/html2rss/request_service/faraday_strategy.rb
|
292
|
+
- lib/html2rss/request_service/puppet_commander.rb
|
293
|
+
- lib/html2rss/request_service/response.rb
|
294
|
+
- lib/html2rss/request_service/strategy.rb
|
272
295
|
- lib/html2rss/rss_builder.rb
|
273
296
|
- lib/html2rss/rss_builder/channel.rb
|
274
297
|
- lib/html2rss/rss_builder/item.rb
|
@@ -280,7 +303,7 @@ licenses:
|
|
280
303
|
- MIT
|
281
304
|
metadata:
|
282
305
|
allowed_push_host: https://rubygems.org
|
283
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
306
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
|
284
307
|
rubygems_mfa_required: 'true'
|
285
308
|
post_install_message:
|
286
309
|
rdoc_options: []
|
@@ -297,7 +320,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
297
320
|
- !ruby/object:Gem::Version
|
298
321
|
version: '0'
|
299
322
|
requirements: []
|
300
|
-
rubygems_version: 3.5.
|
323
|
+
rubygems_version: 3.5.22
|
301
324
|
signing_key:
|
302
325
|
specification_version: 4
|
303
326
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
@@ -1,61 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'date'
|
4
|
-
|
5
|
-
module Html2rss
|
6
|
-
class AutoSource
|
7
|
-
module Scraper
|
8
|
-
class Schema
|
9
|
-
##
|
10
|
-
# Base class for Schema.org schema_objects.
|
11
|
-
#
|
12
|
-
# @see https://schema.org/Article
|
13
|
-
class Base
|
14
|
-
DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
|
15
|
-
|
16
|
-
def initialize(schema_object, url:)
|
17
|
-
@schema_object = schema_object
|
18
|
-
@url = url
|
19
|
-
end
|
20
|
-
|
21
|
-
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
22
|
-
def call
|
23
|
-
DEFAULT_ATTRIBUTES.to_h do |attribute|
|
24
|
-
[attribute, public_send(attribute)]
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
|
29
|
-
def title = schema_object[:title]
|
30
|
-
|
31
|
-
def description
|
32
|
-
[schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
|
33
|
-
.max_by { |desc| desc.to_s.size }
|
34
|
-
end
|
35
|
-
|
36
|
-
# @return [Addressable::URI, nil] the URL of the schema object
|
37
|
-
def url
|
38
|
-
url = schema_object[:url]
|
39
|
-
if url.to_s.empty?
|
40
|
-
Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
|
41
|
-
return
|
42
|
-
end
|
43
|
-
|
44
|
-
Utils.build_absolute_url_from_relative(url, @url)
|
45
|
-
end
|
46
|
-
|
47
|
-
def image = images.first || nil
|
48
|
-
def published_at = schema_object[:datePublished]
|
49
|
-
|
50
|
-
private
|
51
|
-
|
52
|
-
attr_reader :schema_object
|
53
|
-
|
54
|
-
def images
|
55
|
-
Array(schema_object[:image]).compact
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
end
|