html2rss 0.13.0 → 0.14.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +3 -3
- data/lib/html2rss/auto_source/channel.rb +12 -6
- data/lib/html2rss/auto_source/rss_builder.rb +2 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +6 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +1 -1
- data/lib/html2rss/auto_source.rb +20 -17
- data/lib/html2rss/config.rb +1 -4
- data/lib/html2rss/item.rb +1 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
- data/lib/html2rss/utils.rb +6 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +7 -12
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
|
4
|
+
data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
|
7
|
+
data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
[![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/) [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss) ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
|
4
4
|
|
5
|
-
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
|
5
|
+
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from websites automatically, and as a fallback via _feed config_.
|
6
6
|
|
7
7
|
With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
|
8
8
|
|
@@ -26,9 +26,9 @@ You can also install it as a dependency in your Ruby project:
|
|
26
26
|
|
27
27
|
## Generating a feed on the CLI
|
28
28
|
|
29
|
-
### using automatic
|
29
|
+
### using automatic generation
|
30
30
|
|
31
|
-
html2rss offers an automatic
|
31
|
+
html2rss offers an automatic RSS generation feature. Try it with:
|
32
32
|
|
33
33
|
`html2rss auto https://unmatchedstyle.com/`
|
34
34
|
|
@@ -10,21 +10,27 @@ module Html2rss
|
|
10
10
|
##
|
11
11
|
#
|
12
12
|
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
13
|
-
# @param
|
14
|
-
|
13
|
+
# @param url [Addressable::URI] The URL of the channel.
|
14
|
+
# @param headers [Hash<String, String>] The HTTP response headers.
|
15
|
+
# @param articles [Array<Html2rss::AutoSource::Article>] The articles.
|
16
|
+
def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
|
15
17
|
@parsed_body = parsed_body
|
16
18
|
@url = url
|
17
|
-
@
|
19
|
+
@headers = headers
|
18
20
|
@articles = articles
|
21
|
+
@stylesheets = stylesheets
|
19
22
|
end
|
20
23
|
|
24
|
+
attr_writer :articles
|
25
|
+
attr_reader :stylesheets
|
26
|
+
|
21
27
|
def url = extract_url
|
22
28
|
def title = extract_title
|
23
29
|
def language = extract_language
|
24
30
|
def description = extract_description
|
25
31
|
def image = extract_image
|
26
32
|
def ttl = extract_ttl
|
27
|
-
def last_build_date =
|
33
|
+
def last_build_date = headers['last-modified']
|
28
34
|
|
29
35
|
def generator
|
30
36
|
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
@@ -32,7 +38,7 @@ module Html2rss
|
|
32
38
|
|
33
39
|
private
|
34
40
|
|
35
|
-
attr_reader :parsed_body, :
|
41
|
+
attr_reader :parsed_body, :headers
|
36
42
|
|
37
43
|
def extract_url
|
38
44
|
@url.normalize.to_s
|
@@ -58,7 +64,7 @@ module Html2rss
|
|
58
64
|
end
|
59
65
|
|
60
66
|
def extract_ttl
|
61
|
-
ttl =
|
67
|
+
ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
|
62
68
|
return unless ttl
|
63
69
|
|
64
70
|
ttl.to_i.fdiv(60).ceil
|
@@ -1,5 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'set'
|
6
|
+
|
3
7
|
module Html2rss
|
4
8
|
class AutoSource
|
5
9
|
module Scraper
|
@@ -99,6 +103,8 @@ module Html2rss
|
|
99
103
|
# @yield [Hash] Each scraped article_hash
|
100
104
|
# @return [Array<Hash>] the scraped article_hashes
|
101
105
|
def each(&)
|
106
|
+
return enum_for(:each) unless block_given?
|
107
|
+
|
102
108
|
schema_objects.filter_map do |schema_object|
|
103
109
|
next unless (klass = self.class.scraper_for_schema_object(schema_object))
|
104
110
|
next unless (article_hash = klass.new(schema_object, url:).call)
|
@@ -74,7 +74,7 @@ module Html2rss
|
|
74
74
|
def find_heading
|
75
75
|
heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
|
76
76
|
smallest_heading = heading_tags.keys.min
|
77
|
-
heading_tags[smallest_heading]&.max_by { |tag| tag
|
77
|
+
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
|
78
78
|
end
|
79
79
|
|
80
80
|
def extract_title
|
data/lib/html2rss/auto_source.rb
CHANGED
@@ -16,16 +16,18 @@ module Html2rss
|
|
16
16
|
|
17
17
|
SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
|
18
18
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
19
|
+
##
|
20
|
+
# @param url [Addressable::URI] The URL to extract articles from.
|
21
|
+
# @param body [String] The body of the response.
|
22
|
+
# @param headers [Hash] The headers of the response.
|
23
|
+
def initialize(url, body:, headers: {})
|
24
|
+
raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
|
25
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
26
|
+
raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
|
27
|
+
|
28
|
+
@url = url
|
29
|
+
@body = body
|
30
|
+
@headers = headers
|
29
31
|
end
|
30
32
|
|
31
33
|
def build
|
@@ -34,6 +36,8 @@ module Html2rss
|
|
34
36
|
Reducer.call(articles, url:)
|
35
37
|
Cleanup.call(articles, url:, keep_different_domain: true)
|
36
38
|
|
39
|
+
channel.articles = articles
|
40
|
+
|
37
41
|
Html2rss::AutoSource::RssBuilder.new(
|
38
42
|
channel:,
|
39
43
|
articles:
|
@@ -57,21 +61,20 @@ module Html2rss
|
|
57
61
|
end
|
58
62
|
|
59
63
|
def channel
|
60
|
-
Channel.new(parsed_body,
|
64
|
+
@channel ||= Channel.new(parsed_body, headers: @headers, url:)
|
61
65
|
end
|
62
66
|
|
63
67
|
private
|
64
68
|
|
65
69
|
attr_reader :url
|
66
70
|
|
67
|
-
def response
|
68
|
-
@response ||= Html2rss::Utils.request_url(url)
|
69
|
-
end
|
70
|
-
|
71
|
-
# Parses the HTML body of the response using Nokogiri.
|
72
71
|
# @return [Nokogiri::HTML::Document]
|
73
72
|
def parsed_body
|
74
|
-
@parsed_body ||= Nokogiri.HTML(
|
73
|
+
@parsed_body ||= Nokogiri.HTML(@body)
|
74
|
+
.tap do |doc|
|
75
|
+
# Remove comments from the document
|
76
|
+
doc.xpath('//comment()').each(&:remove)
|
77
|
+
end.freeze
|
75
78
|
end
|
76
79
|
end
|
77
80
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -18,9 +18,6 @@ module Html2rss
|
|
18
18
|
# Thrown when the feed config does not contain a value at `:channel`.
|
19
19
|
class ChannelMissing < Html2rss::Error; end
|
20
20
|
|
21
|
-
# Struct to store XML Stylesheet attributes
|
22
|
-
Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
|
23
|
-
|
24
21
|
def_delegator :@channel, :author
|
25
22
|
def_delegator :@channel, :ttl
|
26
23
|
def_delegator :@channel, :title
|
@@ -75,7 +72,7 @@ module Html2rss
|
|
75
72
|
#
|
76
73
|
# @return [Array<Html2rss::RssBuilder::Stylesheet>] Array of Stylesheet objects.
|
77
74
|
def stylesheets
|
78
|
-
@global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
|
75
|
+
@global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
|
79
76
|
end
|
80
77
|
|
81
78
|
# Provides read-only access to the channel object.
|
data/lib/html2rss/item.rb
CHANGED
@@ -19,7 +19,7 @@ module Html2rss
|
|
19
19
|
##
|
20
20
|
# Fetches items from a given URL using configuration settings.
|
21
21
|
#
|
22
|
-
# @param url [
|
22
|
+
# @param url [Addressable::URI] URL to fetch items from.
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
@@ -3,35 +3,50 @@
|
|
3
3
|
module Html2rss
|
4
4
|
module RssBuilder
|
5
5
|
##
|
6
|
-
#
|
6
|
+
# Represents a stylesheet.
|
7
7
|
class Stylesheet
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
8
|
+
class << self
|
9
|
+
##
|
10
|
+
# Adds the stylesheet XML tags to the RSS.
|
11
|
+
#
|
12
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
13
|
+
# @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
|
14
|
+
# @return [nil]
|
15
|
+
def add(maker, stylesheets)
|
16
|
+
stylesheets.each do |stylesheet|
|
17
|
+
add_stylesheet(maker, stylesheet)
|
18
|
+
end
|
17
19
|
end
|
18
|
-
end
|
19
20
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
21
|
+
private
|
22
|
+
|
23
|
+
##
|
24
|
+
# Adds a single Stylesheet to the RSS.
|
25
|
+
#
|
26
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
27
|
+
# @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
|
28
|
+
# @return [nil]
|
29
|
+
def add_stylesheet(maker, stylesheet)
|
30
|
+
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
31
|
+
xss.href = stylesheet.href
|
32
|
+
xss.type = stylesheet.type
|
33
|
+
xss.media = stylesheet.media
|
34
|
+
end
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
34
|
-
|
38
|
+
TYPES = ['text/css', 'text/xsl'].freeze
|
39
|
+
|
40
|
+
def initialize(href:, type:, media: 'all')
|
41
|
+
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
42
|
+
raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
|
43
|
+
raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
|
44
|
+
|
45
|
+
@href = href
|
46
|
+
@type = type
|
47
|
+
@media = media
|
48
|
+
end
|
49
|
+
attr_reader :href, :type, :media
|
35
50
|
end
|
36
51
|
end
|
37
52
|
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -44,6 +44,7 @@ module Html2rss
|
|
44
44
|
#
|
45
45
|
# @param time_zone [String]
|
46
46
|
# @param default_time_zone [String]
|
47
|
+
# @yield block to execute with the given time zone
|
47
48
|
# @return [Object] whatever the given block returns
|
48
49
|
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
49
50
|
raise ArgumentError, 'a block is required' unless block_given?
|
@@ -74,6 +75,11 @@ module Html2rss
|
|
74
75
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
75
76
|
# @return [Faraday::Response] the HTTP response
|
76
77
|
def self.request_url(url, headers: {})
|
78
|
+
url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
|
79
|
+
|
80
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
81
|
+
raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
|
82
|
+
|
77
83
|
Faraday.new(url:, headers:) do |faraday|
|
78
84
|
faraday.use Faraday::FollowRedirects::Middleware
|
79
85
|
faraday.adapter Faraday.default_adapter
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -43,7 +43,7 @@ module Html2rss
|
|
43
43
|
# @param params [Hash] Dynamic parameters for the feed configuration.
|
44
44
|
# @return [RSS::Rss] RSS object generated from the configuration.
|
45
45
|
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
|
46
|
-
yaml =
|
46
|
+
yaml = YAML.safe_load_file(file, symbolize_names: true)
|
47
47
|
feeds = yaml[CONFIG_KEY_FEEDS] || {}
|
48
48
|
|
49
49
|
feed_config = find_feed_config(yaml, feeds, name, global_config)
|
@@ -73,15 +73,6 @@ module Html2rss
|
|
73
73
|
RssBuilder.build(config)
|
74
74
|
end
|
75
75
|
|
76
|
-
##
|
77
|
-
# Loads and parses the YAML file.
|
78
|
-
#
|
79
|
-
# @param file [String] Path to the YAML file.
|
80
|
-
# @return [Hash] Parsed YAML content.
|
81
|
-
def self.load_yaml(file)
|
82
|
-
YAML.safe_load_file(file, symbolize_names: true)
|
83
|
-
end
|
84
|
-
|
85
76
|
##
|
86
77
|
# Builds the feed configuration based on the provided parameters.
|
87
78
|
#
|
@@ -109,8 +100,12 @@ module Html2rss
|
|
109
100
|
# @param url [String] the URL to automatically source the feed from
|
110
101
|
# @return [RSS::Rss]
|
111
102
|
def self.auto_source(url)
|
112
|
-
|
103
|
+
url = Addressable::URI.parse(url)
|
104
|
+
|
105
|
+
response = Html2rss::Utils.request_url(url)
|
106
|
+
|
107
|
+
Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
|
113
108
|
end
|
114
109
|
|
115
|
-
private_class_method :
|
110
|
+
private_class_method :find_feed_config
|
116
111
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.14.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08
|
11
|
+
date: 2024-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -279,7 +279,7 @@ licenses:
|
|
279
279
|
- MIT
|
280
280
|
metadata:
|
281
281
|
allowed_push_host: https://rubygems.org
|
282
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
|
283
283
|
rubygems_mfa_required: 'true'
|
284
284
|
post_install_message:
|
285
285
|
rdoc_options: []
|
@@ -296,7 +296,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
296
296
|
- !ruby/object:Gem::Version
|
297
297
|
version: '0'
|
298
298
|
requirements: []
|
299
|
-
rubygems_version: 3.5.
|
299
|
+
rubygems_version: 3.5.16
|
300
300
|
signing_key:
|
301
301
|
specification_version: 4
|
302
302
|
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|