html2rss 0.12.0 → 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +29 -3
data/lib/html2rss/utils.rb
CHANGED
@@ -31,12 +31,12 @@ module Html2rss
|
|
31
31
|
##
|
32
32
|
# Removes any space, parses and normalizes the given url.
|
33
33
|
# @param url [String]
|
34
|
-
# @return [
|
34
|
+
# @return [Addressable::URI, nil] normalized URL, or nil if input is empty
|
35
35
|
def self.sanitize_url(url)
|
36
36
|
url = url.to_s.gsub(/\s+/, ' ').strip
|
37
37
|
return if url.empty?
|
38
38
|
|
39
|
-
Addressable::URI.parse(url).normalize
|
39
|
+
Addressable::URI.parse(url).normalize
|
40
40
|
end
|
41
41
|
|
42
42
|
##
|
@@ -71,18 +71,13 @@ module Html2rss
|
|
71
71
|
|
72
72
|
##
|
73
73
|
# @param url [String, Addressable::URI]
|
74
|
-
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
74
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
75
|
+
# @return [Faraday::Response] the HTTP response
|
76
|
+
def self.request_url(url, headers: {})
|
77
|
+
Faraday.new(url:, headers:) do |faraday|
|
79
78
|
faraday.use Faraday::FollowRedirects::Middleware
|
80
79
|
faraday.adapter Faraday.default_adapter
|
81
80
|
end.get
|
82
|
-
|
83
|
-
body = response.body
|
84
|
-
|
85
|
-
convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
|
86
81
|
end
|
87
82
|
|
88
83
|
##
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
|
|
6
6
|
loader.setup
|
7
7
|
|
8
8
|
require 'yaml'
|
9
|
+
require 'logger'
|
9
10
|
|
10
11
|
##
|
11
12
|
# The Html2rss namespace.
|
12
13
|
module Html2rss
|
14
|
+
##
|
15
|
+
# The logger instance.
|
16
|
+
Log = Logger.new($stdout)
|
17
|
+
|
18
|
+
Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
|
19
|
+
|
20
|
+
Log.formatter = proc do |severity, datetime, _progname, msg|
|
21
|
+
"#{datetime} [#{severity}] #{msg}\n"
|
22
|
+
end
|
23
|
+
|
13
24
|
##
|
14
25
|
# The Html2rss::Error base class.
|
15
26
|
class Error < StandardError; end
|
@@ -91,5 +102,15 @@ module Html2rss
|
|
91
102
|
end
|
92
103
|
end
|
93
104
|
|
105
|
+
##
|
106
|
+
# Scrapes the provided URL and returns an RSS object.
|
107
|
+
# No need for a "feed config".
|
108
|
+
#
|
109
|
+
# @param url [String] the URL to automatically source the feed from
|
110
|
+
# @return [RSS::Rss]
|
111
|
+
def self.auto_source(url)
|
112
|
+
Html2rss::AutoSource.new(url).build
|
113
|
+
end
|
114
|
+
|
94
115
|
private_class_method :load_yaml, :find_feed_config
|
95
116
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.12.0
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -106,6 +106,20 @@ dependencies:
|
|
106
106
|
- - "<"
|
107
107
|
- !ruby/object:Gem::Version
|
108
108
|
version: '2.0'
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: parallel
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
109
123
|
- !ruby/object:Gem::Dependency
|
110
124
|
name: regexp_parser
|
111
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -230,6 +244,18 @@ files:
|
|
230
244
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
231
245
|
- lib/html2rss/attribute_post_processors/substring.rb
|
232
246
|
- lib/html2rss/attribute_post_processors/template.rb
|
247
|
+
- lib/html2rss/auto_source.rb
|
248
|
+
- lib/html2rss/auto_source/article.rb
|
249
|
+
- lib/html2rss/auto_source/channel.rb
|
250
|
+
- lib/html2rss/auto_source/cleanup.rb
|
251
|
+
- lib/html2rss/auto_source/reducer.rb
|
252
|
+
- lib/html2rss/auto_source/rss_builder.rb
|
253
|
+
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/schema.rb
|
255
|
+
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
|
+
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
257
|
+
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
258
|
+
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
233
259
|
- lib/html2rss/cli.rb
|
234
260
|
- lib/html2rss/config.rb
|
235
261
|
- lib/html2rss/config/channel.rb
|
@@ -253,7 +279,7 @@ licenses:
|
|
253
279
|
- MIT
|
254
280
|
metadata:
|
255
281
|
allowed_push_host: https://rubygems.org
|
256
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
|
257
283
|
rubygems_mfa_required: 'true'
|
258
284
|
post_install_message:
|
259
285
|
rdoc_options: []
|