html2rss 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +39 -11
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +85 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +70 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +128 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +80 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/config.rb +1 -4
- data/lib/html2rss/item.rb +9 -3
- data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
- data/lib/html2rss/utils.rb +11 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +27 -11
- metadata +30 -4
data/lib/html2rss/item.rb
CHANGED
@@ -19,11 +19,12 @@ module Html2rss
|
|
19
19
|
##
|
20
20
|
# Fetches items from a given URL using configuration settings.
|
21
21
|
#
|
22
|
-
# @param url [
|
22
|
+
# @param url [Addressable::URI] URL to fetch items from.
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
26
|
-
body = Utils.
|
26
|
+
body = Utils.request_url(url, headers: config.headers).body
|
27
|
+
body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
|
27
28
|
|
28
29
|
Nokogiri.HTML(body)
|
29
30
|
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
@@ -47,6 +48,7 @@ module Html2rss
|
|
47
48
|
# @param method_name [Symbol]
|
48
49
|
# @param _include_private [true, false]
|
49
50
|
# @return [true, false]
|
51
|
+
# :reek:BooleanParameter { enabled: false }
|
50
52
|
def respond_to_missing?(method_name, _include_private = false)
|
51
53
|
config.selector?(method_name) || super
|
52
54
|
end
|
@@ -110,7 +112,11 @@ module Html2rss
|
|
110
112
|
#
|
111
113
|
# @return [Array<String>] list of categories.
|
112
114
|
def categories
|
113
|
-
config.category_selector_names
|
115
|
+
config.category_selector_names
|
116
|
+
.filter_map do |method_name|
|
117
|
+
category = public_send(method_name)
|
118
|
+
category.strip unless category.to_s.empty?
|
119
|
+
end.uniq
|
114
120
|
end
|
115
121
|
|
116
122
|
##
|
@@ -3,35 +3,50 @@
|
|
3
3
|
module Html2rss
|
4
4
|
module RssBuilder
|
5
5
|
##
|
6
|
-
#
|
6
|
+
# Represents a stylesheet.
|
7
7
|
class Stylesheet
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class << self
|
9
|
+
##
|
10
|
+
# Adds the stylesheet XML tags to the RSS.
|
11
|
+
#
|
12
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
13
|
+
# @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
|
14
|
+
# @return [Array<Html2rss::Config::Stylesheet>] the given stylesheets (result of #each)
|
15
|
+
def add(maker, stylesheets)
|
16
|
+
stylesheets.each do |stylesheet|
|
17
|
+
add_stylesheet(maker, stylesheet)
|
18
|
+
end
|
17
19
|
end
|
18
|
-
end
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
private
|
22
|
+
|
23
|
+
##
|
24
|
+
# Adds a single Stylesheet to the RSS.
|
25
|
+
#
|
26
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
27
|
+
# @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
|
28
|
+
# @return [nil]
|
29
|
+
def add_stylesheet(maker, stylesheet)
|
30
|
+
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
31
|
+
xss.href = stylesheet.href
|
32
|
+
xss.type = stylesheet.type
|
33
|
+
xss.media = stylesheet.media
|
34
|
+
end
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
34
|
-
|
38
|
+
TYPES = ['text/css', 'text/xsl'].freeze
|
39
|
+
|
40
|
+
def initialize(href:, type:, media: 'all')
|
41
|
+
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
42
|
+
raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
|
43
|
+
raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
|
44
|
+
|
45
|
+
@href = href
|
46
|
+
@type = type
|
47
|
+
@media = media
|
48
|
+
end
|
49
|
+
attr_reader :href, :type, :media
|
35
50
|
end
|
36
51
|
end
|
37
52
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -31,12 +31,12 @@ module Html2rss
|
|
31
31
|
##
|
32
32
|
# Collapses whitespace runs to a single space, strips the ends, then parses and normalizes the given url.
|
33
33
|
# @param url [String]
|
34
|
-
# @return [
|
34
|
+
# @return [Addressable::URI, nil] normalized URL, or nil if input is empty
|
35
35
|
def self.sanitize_url(url)
|
36
36
|
url = url.to_s.gsub(/\s+/, ' ').strip
|
37
37
|
return if url.empty?
|
38
38
|
|
39
|
-
Addressable::URI.parse(url).normalize
|
39
|
+
Addressable::URI.parse(url).normalize
|
40
40
|
end
|
41
41
|
|
42
42
|
##
|
@@ -44,6 +44,7 @@ module Html2rss
|
|
44
44
|
#
|
45
45
|
# @param time_zone [String]
|
46
46
|
# @param default_time_zone [String]
|
47
|
+
# @yield block to execute with the given time zone
|
47
48
|
# @return [Object] whatever the given block returns
|
48
49
|
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
49
50
|
raise ArgumentError, 'a block is required' unless block_given?
|
@@ -71,18 +72,18 @@ module Html2rss
|
|
71
72
|
|
72
73
|
##
|
73
74
|
# @param url [String, Addressable::URI]
|
74
|
-
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
75
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
76
|
+
# @return [Faraday::Response] the HTTP response (call #body or #headers on it)
|
77
|
+
def self.request_url(url, headers: {})
|
78
|
+
url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
|
79
|
+
|
80
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
81
|
+
raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
|
82
|
+
|
83
|
+
Faraday.new(url:, headers:) do |faraday|
|
79
84
|
faraday.use Faraday::FollowRedirects::Middleware
|
80
85
|
faraday.adapter Faraday.default_adapter
|
81
86
|
end.get
|
82
|
-
|
83
|
-
body = response.body
|
84
|
-
|
85
|
-
convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
|
86
87
|
end
|
87
88
|
|
88
89
|
##
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
|
|
6
6
|
loader.setup
|
7
7
|
|
8
8
|
require 'yaml'
|
9
|
+
require 'logger'
|
9
10
|
|
10
11
|
##
|
11
12
|
# The Html2rss namespace.
|
12
13
|
module Html2rss
|
14
|
+
##
|
15
|
+
# The logger instance.
|
16
|
+
Log = Logger.new($stdout)
|
17
|
+
|
18
|
+
Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
|
19
|
+
|
20
|
+
Log.formatter = proc do |severity, datetime, _progname, msg|
|
21
|
+
"#{datetime} [#{severity}] #{msg}\n"
|
22
|
+
end
|
23
|
+
|
13
24
|
##
|
14
25
|
# The Html2rss::Error base class.
|
15
26
|
class Error < StandardError; end
|
@@ -32,7 +43,7 @@ module Html2rss
|
|
32
43
|
# @param params [Hash] Dynamic parameters for the feed configuration.
|
33
44
|
# @return [RSS::Rss] RSS object generated from the configuration.
|
34
45
|
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
|
35
|
-
yaml =
|
46
|
+
yaml = YAML.safe_load_file(file, symbolize_names: true)
|
36
47
|
feeds = yaml[CONFIG_KEY_FEEDS] || {}
|
37
48
|
|
38
49
|
feed_config = find_feed_config(yaml, feeds, name, global_config)
|
@@ -62,15 +73,6 @@ module Html2rss
|
|
62
73
|
RssBuilder.build(config)
|
63
74
|
end
|
64
75
|
|
65
|
-
##
|
66
|
-
# Loads and parses the YAML file.
|
67
|
-
#
|
68
|
-
# @param file [String] Path to the YAML file.
|
69
|
-
# @return [Hash] Parsed YAML content.
|
70
|
-
def self.load_yaml(file)
|
71
|
-
YAML.safe_load_file(file, symbolize_names: true)
|
72
|
-
end
|
73
|
-
|
74
76
|
##
|
75
77
|
# Builds the feed configuration based on the provided parameters.
|
76
78
|
#
|
@@ -91,5 +93,19 @@ module Html2rss
|
|
91
93
|
end
|
92
94
|
end
|
93
95
|
|
94
|
-
|
96
|
+
##
|
97
|
+
# Scrapes the provided URL and returns an RSS object.
|
98
|
+
# No need for a "feed config".
|
99
|
+
#
|
100
|
+
# @param url [String] the URL to automatically source the feed from
|
101
|
+
# @return [RSS::Rss]
|
102
|
+
def self.auto_source(url)
|
103
|
+
url = Addressable::URI.parse(url)
|
104
|
+
|
105
|
+
response = Html2rss::Utils.request_url(url)
|
106
|
+
|
107
|
+
Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
|
108
|
+
end
|
109
|
+
|
110
|
+
private_class_method :find_feed_config
|
95
111
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08
|
11
|
+
date: 2024-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -106,6 +106,20 @@ dependencies:
|
|
106
106
|
- - "<"
|
107
107
|
- !ruby/object:Gem::Version
|
108
108
|
version: '2.0'
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: parallel
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
109
123
|
- !ruby/object:Gem::Dependency
|
110
124
|
name: regexp_parser
|
111
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -230,6 +244,18 @@ files:
|
|
230
244
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
231
245
|
- lib/html2rss/attribute_post_processors/substring.rb
|
232
246
|
- lib/html2rss/attribute_post_processors/template.rb
|
247
|
+
- lib/html2rss/auto_source.rb
|
248
|
+
- lib/html2rss/auto_source/article.rb
|
249
|
+
- lib/html2rss/auto_source/channel.rb
|
250
|
+
- lib/html2rss/auto_source/cleanup.rb
|
251
|
+
- lib/html2rss/auto_source/reducer.rb
|
252
|
+
- lib/html2rss/auto_source/rss_builder.rb
|
253
|
+
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/schema.rb
|
255
|
+
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
|
+
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
257
|
+
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
258
|
+
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
233
259
|
- lib/html2rss/cli.rb
|
234
260
|
- lib/html2rss/config.rb
|
235
261
|
- lib/html2rss/config/channel.rb
|
@@ -253,7 +279,7 @@ licenses:
|
|
253
279
|
- MIT
|
254
280
|
metadata:
|
255
281
|
allowed_push_host: https://rubygems.org
|
256
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
|
257
283
|
rubygems_mfa_required: 'true'
|
258
284
|
post_install_message:
|
259
285
|
rdoc_options: []
|
@@ -270,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
270
296
|
- !ruby/object:Gem::Version
|
271
297
|
version: '0'
|
272
298
|
requirements: []
|
273
|
-
rubygems_version: 3.5.
|
299
|
+
rubygems_version: 3.5.16
|
274
300
|
signing_key:
|
275
301
|
specification_version: 4
|
276
302
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|