html2rss 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/html2rss/auto_source/channel.rb +12 -6
- data/lib/html2rss/auto_source/rss_builder.rb +2 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +6 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +1 -1
- data/lib/html2rss/auto_source.rb +20 -17
- data/lib/html2rss/config.rb +1 -4
- data/lib/html2rss/item.rb +1 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
- data/lib/html2rss/utils.rb +6 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +7 -12
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
|
4
|
+
data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
|
7
|
+
data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[html2rss on RubyGems](http://rubygems.org/gems/html2rss/) [API documentation](https://www.rubydoc.info/gems/html2rss)
|
4
4
|
|
5
|
-
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
|
5
|
+
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from websites automatically, and as a fallback via _feed config_.
|
6
6
|
|
7
7
|
With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
|
8
8
|
|
@@ -26,9 +26,9 @@ You can also install it as a dependency in your Ruby project:
|
|
26
26
|
|
27
27
|
## Generating a feed on the CLI
|
28
28
|
|
29
|
-
### using automatic
|
29
|
+
### using automatic generation
|
30
30
|
|
31
|
-
html2rss offers an automatic
|
31
|
+
html2rss offers an automatic RSS generation feature. Try it with:
|
32
32
|
|
33
33
|
`html2rss auto https://unmatchedstyle.com/`
|
34
34
|
|
@@ -10,21 +10,27 @@ module Html2rss
|
|
10
10
|
##
|
11
11
|
#
|
12
12
|
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
13
|
-
# @param
|
14
|
-
|
13
|
+
# @param url [Addressable::URI] The URL of the channel.
|
14
|
+
# @param headers [Hash<String, String>] the http headers
|
15
|
+
# @param articles [Array<Html2rss::AutoSource::Article>] The articles.
|
16
|
+
def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
|
15
17
|
@parsed_body = parsed_body
|
16
18
|
@url = url
|
17
|
-
@
|
19
|
+
@headers = headers
|
18
20
|
@articles = articles
|
21
|
+
@stylesheets = stylesheets
|
19
22
|
end
|
20
23
|
|
24
|
+
attr_writer :articles
|
25
|
+
attr_reader :stylesheets
|
26
|
+
|
21
27
|
def url = extract_url
|
22
28
|
def title = extract_title
|
23
29
|
def language = extract_language
|
24
30
|
def description = extract_description
|
25
31
|
def image = extract_image
|
26
32
|
def ttl = extract_ttl
|
27
|
-
def last_build_date =
|
33
|
+
def last_build_date = headers['last-modified']
|
28
34
|
|
29
35
|
def generator
|
30
36
|
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
@@ -32,7 +38,7 @@ module Html2rss
|
|
32
38
|
|
33
39
|
private
|
34
40
|
|
35
|
-
attr_reader :parsed_body, :
|
41
|
+
attr_reader :parsed_body, :headers
|
36
42
|
|
37
43
|
def extract_url
|
38
44
|
@url.normalize.to_s
|
@@ -58,7 +64,7 @@ module Html2rss
|
|
58
64
|
end
|
59
65
|
|
60
66
|
def extract_ttl
|
61
|
-
ttl =
|
67
|
+
ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
|
62
68
|
return unless ttl
|
63
69
|
|
64
70
|
ttl.to_i.fdiv(60).ceil
|
@@ -1,5 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'set'
|
6
|
+
|
3
7
|
module Html2rss
|
4
8
|
class AutoSource
|
5
9
|
module Scraper
|
@@ -99,6 +103,8 @@ module Html2rss
|
|
99
103
|
# @yield [Hash] Each scraped article_hash
|
100
104
|
# @return [Array<Hash>] the scraped article_hashes
|
101
105
|
def each(&)
|
106
|
+
return enum_for(:each) unless block_given?
|
107
|
+
|
102
108
|
schema_objects.filter_map do |schema_object|
|
103
109
|
next unless (klass = self.class.scraper_for_schema_object(schema_object))
|
104
110
|
next unless (article_hash = klass.new(schema_object, url:).call)
|
@@ -74,7 +74,7 @@ module Html2rss
|
|
74
74
|
def find_heading
|
75
75
|
heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
|
76
76
|
smallest_heading = heading_tags.keys.min
|
77
|
-
heading_tags[smallest_heading]&.max_by { |tag| tag
|
77
|
+
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
|
78
78
|
end
|
79
79
|
|
80
80
|
def extract_title
|
data/lib/html2rss/auto_source.rb
CHANGED
@@ -16,16 +16,18 @@ module Html2rss
|
|
16
16
|
|
17
17
|
SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
19
|
+
##
|
20
|
+
# @param url [Addressable::URI] The URL to extract articles from.
|
21
|
+
# @param body [String] The body of the response.
|
22
|
+
# @param headers [Hash] The headers of the response.
|
23
|
+
def initialize(url, body:, headers: {})
|
24
|
+
raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
|
25
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
26
|
+
raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
|
27
|
+
|
28
|
+
@url = url
|
29
|
+
@body = body
|
30
|
+
@headers = headers
|
29
31
|
end
|
30
32
|
|
31
33
|
def build
|
@@ -34,6 +36,8 @@ module Html2rss
|
|
34
36
|
Reducer.call(articles, url:)
|
35
37
|
Cleanup.call(articles, url:, keep_different_domain: true)
|
36
38
|
|
39
|
+
channel.articles = articles
|
40
|
+
|
37
41
|
Html2rss::AutoSource::RssBuilder.new(
|
38
42
|
channel:,
|
39
43
|
articles:
|
@@ -57,21 +61,20 @@ module Html2rss
|
|
57
61
|
end
|
58
62
|
|
59
63
|
def channel
|
60
|
-
Channel.new(parsed_body,
|
64
|
+
@channel ||= Channel.new(parsed_body, headers: @headers, url:)
|
61
65
|
end
|
62
66
|
|
63
67
|
private
|
64
68
|
|
65
69
|
attr_reader :url
|
66
70
|
|
67
|
-
def response
|
68
|
-
@response ||= Html2rss::Utils.request_url(url)
|
69
|
-
end
|
70
|
-
|
71
|
-
# Parses the HTML body of the response using Nokogiri.
|
72
71
|
# @return [Nokogiri::HTML::Document]
|
73
72
|
def parsed_body
|
74
|
-
@parsed_body ||= Nokogiri.HTML(
|
73
|
+
@parsed_body ||= Nokogiri.HTML(@body)
|
74
|
+
.tap do |doc|
|
75
|
+
# Remove comments from the document
|
76
|
+
doc.xpath('//comment()').each(&:remove)
|
77
|
+
end.freeze
|
75
78
|
end
|
76
79
|
end
|
77
80
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -18,9 +18,6 @@ module Html2rss
|
|
18
18
|
# Thrown when the feed config does not contain a value at `:channel`.
|
19
19
|
class ChannelMissing < Html2rss::Error; end
|
20
20
|
|
21
|
-
# Struct to store XML Stylesheet attributes
|
22
|
-
Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
|
23
|
-
|
24
21
|
def_delegator :@channel, :author
|
25
22
|
def_delegator :@channel, :ttl
|
26
23
|
def_delegator :@channel, :title
|
@@ -75,7 +72,7 @@ module Html2rss
|
|
75
72
|
#
|
76
73
|
# @return [Array<Html2rss::RssBuilder::Stylesheet>] Array of Stylesheet objects.
|
77
74
|
def stylesheets
|
78
|
-
@global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
|
75
|
+
@global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
|
79
76
|
end
|
80
77
|
|
81
78
|
# Provides read-only access to the channel object.
|
data/lib/html2rss/item.rb
CHANGED
@@ -19,7 +19,7 @@ module Html2rss
|
|
19
19
|
##
|
20
20
|
# Fetches items from a given URL using configuration settings.
|
21
21
|
#
|
22
|
-
# @param url [
|
22
|
+
# @param url [Addressable::URI] URL to fetch items from.
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
@@ -3,35 +3,50 @@
|
|
3
3
|
module Html2rss
|
4
4
|
module RssBuilder
|
5
5
|
##
|
6
|
-
#
|
6
|
+
# Represents a stylesheet.
|
7
7
|
class Stylesheet
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class << self
|
9
|
+
##
|
10
|
+
# Adds the stylesheet XML tags to the RSS.
|
11
|
+
#
|
12
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
13
|
+
# @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
|
14
|
+
# @return [nil]
|
15
|
+
def add(maker, stylesheets)
|
16
|
+
stylesheets.each do |stylesheet|
|
17
|
+
add_stylesheet(maker, stylesheet)
|
18
|
+
end
|
17
19
|
end
|
18
|
-
end
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
private
|
22
|
+
|
23
|
+
##
|
24
|
+
# Adds a single Stylesheet to the RSS.
|
25
|
+
#
|
26
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
27
|
+
# @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
|
28
|
+
# @return [nil]
|
29
|
+
def add_stylesheet(maker, stylesheet)
|
30
|
+
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
31
|
+
xss.href = stylesheet.href
|
32
|
+
xss.type = stylesheet.type
|
33
|
+
xss.media = stylesheet.media
|
34
|
+
end
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
34
|
-
|
38
|
+
TYPES = ['text/css', 'text/xsl'].freeze
|
39
|
+
|
40
|
+
def initialize(href:, type:, media: 'all')
|
41
|
+
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
42
|
+
raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
|
43
|
+
raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
|
44
|
+
|
45
|
+
@href = href
|
46
|
+
@type = type
|
47
|
+
@media = media
|
48
|
+
end
|
49
|
+
attr_reader :href, :type, :media
|
35
50
|
end
|
36
51
|
end
|
37
52
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -44,6 +44,7 @@ module Html2rss
|
|
44
44
|
#
|
45
45
|
# @param time_zone [String]
|
46
46
|
# @param default_time_zone [String]
|
47
|
+
# @yield block to execute with the given time zone
|
47
48
|
# @return [Object] whatever the given block returns
|
48
49
|
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
49
50
|
raise ArgumentError, 'a block is required' unless block_given?
|
@@ -74,6 +75,11 @@ module Html2rss
|
|
74
75
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
75
76
|
# @return [Faraday::Response] the HTTP response (exposes the body and headers)
|
76
77
|
def self.request_url(url, headers: {})
|
78
|
+
url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
|
79
|
+
|
80
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
81
|
+
raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
|
82
|
+
|
77
83
|
Faraday.new(url:, headers:) do |faraday|
|
78
84
|
faraday.use Faraday::FollowRedirects::Middleware
|
79
85
|
faraday.adapter Faraday.default_adapter
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -43,7 +43,7 @@ module Html2rss
|
|
43
43
|
# @param params [Hash] Dynamic parameters for the feed configuration.
|
44
44
|
# @return [RSS::Rss] RSS object generated from the configuration.
|
45
45
|
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
|
46
|
-
yaml =
|
46
|
+
yaml = YAML.safe_load_file(file, symbolize_names: true)
|
47
47
|
feeds = yaml[CONFIG_KEY_FEEDS] || {}
|
48
48
|
|
49
49
|
feed_config = find_feed_config(yaml, feeds, name, global_config)
|
@@ -73,15 +73,6 @@ module Html2rss
|
|
73
73
|
RssBuilder.build(config)
|
74
74
|
end
|
75
75
|
|
76
|
-
##
|
77
|
-
# Loads and parses the YAML file.
|
78
|
-
#
|
79
|
-
# @param file [String] Path to the YAML file.
|
80
|
-
# @return [Hash] Parsed YAML content.
|
81
|
-
def self.load_yaml(file)
|
82
|
-
YAML.safe_load_file(file, symbolize_names: true)
|
83
|
-
end
|
84
|
-
|
85
76
|
##
|
86
77
|
# Builds the feed configuration based on the provided parameters.
|
87
78
|
#
|
@@ -109,8 +100,12 @@ module Html2rss
|
|
109
100
|
# @param url [String] the URL to automatically source the feed from
|
110
101
|
# @return [RSS::Rss]
|
111
102
|
def self.auto_source(url)
|
112
|
-
|
103
|
+
url = Addressable::URI.parse(url)
|
104
|
+
|
105
|
+
response = Html2rss::Utils.request_url(url)
|
106
|
+
|
107
|
+
Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
|
113
108
|
end
|
114
109
|
|
115
|
-
private_class_method :
|
110
|
+
private_class_method :find_feed_config
|
116
111
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08
|
11
|
+
date: 2024-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -279,7 +279,7 @@ licenses:
|
|
279
279
|
- MIT
|
280
280
|
metadata:
|
281
281
|
allowed_push_host: https://rubygems.org
|
282
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
|
283
283
|
rubygems_mfa_required: 'true'
|
284
284
|
post_install_message:
|
285
285
|
rdoc_options: []
|
@@ -296,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
296
296
|
- !ruby/object:Gem::Version
|
297
297
|
version: '0'
|
298
298
|
requirements: []
|
299
|
-
rubygems_version: 3.5.
|
299
|
+
rubygems_version: 3.5.16
|
300
300
|
signing_key:
|
301
301
|
specification_version: 4
|
302
302
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|