html2rss 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +29 -3
data/lib/html2rss/utils.rb
CHANGED
@@ -31,12 +31,12 @@ module Html2rss
|
|
31
31
|
##
|
32
32
|
# Removes any space, parses and normalizes the given url.
|
33
33
|
# @param url [String]
|
34
|
-
# @return [
|
34
|
+
# @return [Addressable::URI, nil] normalized URL, or nil if input is empty
|
35
35
|
def self.sanitize_url(url)
|
36
36
|
url = url.to_s.gsub(/\s+/, ' ').strip
|
37
37
|
return if url.empty?
|
38
38
|
|
39
|
-
Addressable::URI.parse(url).normalize
|
39
|
+
Addressable::URI.parse(url).normalize
|
40
40
|
end
|
41
41
|
|
42
42
|
##
|
@@ -71,18 +71,13 @@ module Html2rss
|
|
71
71
|
|
72
72
|
##
|
73
73
|
# @param url [String, Addressable::URI]
|
74
|
-
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
74
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
75
|
+
# @return [Faraday::Response] body of the HTTP response
|
76
|
+
def self.request_url(url, headers: {})
|
77
|
+
Faraday.new(url:, headers:) do |faraday|
|
79
78
|
faraday.use Faraday::FollowRedirects::Middleware
|
80
79
|
faraday.adapter Faraday.default_adapter
|
81
80
|
end.get
|
82
|
-
|
83
|
-
body = response.body
|
84
|
-
|
85
|
-
convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
|
86
81
|
end
|
87
82
|
|
88
83
|
##
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
|
|
6
6
|
loader.setup
|
7
7
|
|
8
8
|
require 'yaml'
|
9
|
+
require 'logger'
|
9
10
|
|
10
11
|
##
|
11
12
|
# The Html2rss namespace.
|
12
13
|
module Html2rss
|
14
|
+
##
|
15
|
+
# The logger instance.
|
16
|
+
Log = Logger.new($stdout)
|
17
|
+
|
18
|
+
Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
|
19
|
+
|
20
|
+
Log.formatter = proc do |severity, datetime, _progname, msg|
|
21
|
+
"#{datetime} [#{severity}] #{msg}\n"
|
22
|
+
end
|
23
|
+
|
13
24
|
##
|
14
25
|
# The Html2rss::Error base class.
|
15
26
|
class Error < StandardError; end
|
@@ -91,5 +102,15 @@ module Html2rss
|
|
91
102
|
end
|
92
103
|
end
|
93
104
|
|
105
|
+
##
|
106
|
+
# Scrapes the provided URL and returns an RSS object.
|
107
|
+
# No need for a "feed config".
|
108
|
+
#
|
109
|
+
# @param url [String] the URL to automatically source the feed from
|
110
|
+
# @return [RSS::Rss]
|
111
|
+
def self.auto_source(url)
|
112
|
+
Html2rss::AutoSource.new(url).build
|
113
|
+
end
|
114
|
+
|
94
115
|
private_class_method :load_yaml, :find_feed_config
|
95
116
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.0
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -106,6 +106,20 @@ dependencies:
|
|
106
106
|
- - "<"
|
107
107
|
- !ruby/object:Gem::Version
|
108
108
|
version: '2.0'
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: parallel
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
109
123
|
- !ruby/object:Gem::Dependency
|
110
124
|
name: regexp_parser
|
111
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -230,6 +244,18 @@ files:
|
|
230
244
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
231
245
|
- lib/html2rss/attribute_post_processors/substring.rb
|
232
246
|
- lib/html2rss/attribute_post_processors/template.rb
|
247
|
+
- lib/html2rss/auto_source.rb
|
248
|
+
- lib/html2rss/auto_source/article.rb
|
249
|
+
- lib/html2rss/auto_source/channel.rb
|
250
|
+
- lib/html2rss/auto_source/cleanup.rb
|
251
|
+
- lib/html2rss/auto_source/reducer.rb
|
252
|
+
- lib/html2rss/auto_source/rss_builder.rb
|
253
|
+
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/schema.rb
|
255
|
+
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
|
+
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
257
|
+
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
258
|
+
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
233
259
|
- lib/html2rss/cli.rb
|
234
260
|
- lib/html2rss/config.rb
|
235
261
|
- lib/html2rss/config/channel.rb
|
@@ -253,7 +279,7 @@ licenses:
|
|
253
279
|
- MIT
|
254
280
|
metadata:
|
255
281
|
allowed_push_host: https://rubygems.org
|
256
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
|
257
283
|
rubygems_mfa_required: 'true'
|
258
284
|
post_install_message:
|
259
285
|
rdoc_options: []
|