html2rss 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +74 -0
- data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
- data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
- data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
- data/lib/html2rss/attribute_post_processors/template.rb +19 -11
- data/lib/html2rss/attribute_post_processors.rb +8 -0
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +13 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +30 -3
@@ -0,0 +1,118 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable'
|
4
|
+
require 'parallel'
|
5
|
+
|
6
|
+
module Html2rss
  class AutoSource
    module Scraper
      ##
      # Scrapes articles by looking for common markup tags (article, section, li, tr)
      # containing an <a href> tag.
      #
      # See:
      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
      class SemanticHtml
        include Enumerable

        ##
        # Map of parent element names to CSS selectors for finding <a href> tags.
        ANCHOR_TAG_SELECTORS = {
          'section' => ['section :not(section) a[href]'],
          'tr' => ['table tr :not(tr) a[href]'],
          'article' => [
            'article :not(article) a[href]',
            'article a[href]'
          ],
          'li' => [
            'ul > li :not(li) a[href]',
            'ol > li :not(li) a[href]'
          ]
        }.freeze

        ##
        # Checks whether at least one of the ANCHOR_TAG_SELECTORS matches in the parsed body.
        #
        # @param parsed_body [Nokogiri::HTML::Document, nil] The parsed HTML document
        # @return [Boolean] True if any selector matches, false otherwise (also for a nil body).
        def self.articles?(parsed_body)
          return false unless parsed_body

          ANCHOR_TAG_SELECTORS.each_value.any? do |selectors|
            selectors.any? { |selector| parsed_body.at_css(selector) }
          end
        end

        ##
        # Walks up the ancestors of current_tag until a node named tag_name or stop_tag is reached.
        #
        # @param current_tag [Nokogiri::XML::Node] The tag to start searching from
        # @param tag_name [String] The tag name to search for
        # @param stop_tag [String] The tag name to stop searching at
        # @return [Nokogiri::XML::Node] The node the walk stopped at: current_tag itself if it is
        #   already named tag_name, otherwise the first ancestor named tag_name or stop_tag. If
        #   neither is met, the walk ends at the first object that has no `parent` method.
        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
          return current_tag if current_tag.name == tag_name

          # Plain comparisons instead of a Set: nothing in this file requires 'set'.
          while current_tag.respond_to?(:parent)
            name = current_tag.name
            break if name == tag_name || name == stop_tag

            current_tag = current_tag.parent
          end

          current_tag
        end

        ##
        # Finds the closest node matching the selector, looking inside current_tag first
        # and then inside each of its ancestors in turn.
        #
        # @param current_tag [Nokogiri::XML::Node] The tag to start searching from
        # @param selector [String] The CSS selector to search for
        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
          # The upwards search already queries current_tag first, so delegating avoids
          # running the same at_css query twice on the starting node.
          find_closest_selector_upwards(current_tag, selector: selector)
        end

        ##
        # Queries current_tag and then each ancestor with at_css until one yields a match.
        #
        # @param current_tag [Nokogiri::XML::Node, nil] The tag to start searching from
        # @param selector [String] The CSS selector to search for
        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
        def self.find_closest_selector_upwards(current_tag, selector:)
          while current_tag
            found = current_tag.at_css(selector)
            return found if found

            # Objects without a `parent` method mark the top of the tree.
            return nil unless current_tag.respond_to?(:parent)

            current_tag = current_tag.parent
          end

          nil
        end

        ##
        # Flattens ANCHOR_TAG_SELECTORS into [tag_name, selector] pairs.
        #
        # @return [Array<[String, String]>] Array of tag name and selector pairs
        def self.anchor_tag_selector_pairs
          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
            selectors.map { |selector| [tag_name, selector] }
          end
        end

        ##
        # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document to scrape
        # @param url [String, Addressable::URI] Passed on to the Extractor of every article tag
        def initialize(parsed_body, url:)
          @parsed_body = parsed_body
          @url = url
        end

        attr_reader :parsed_body

        ##
        # Runs every selector pair against the parsed body, resolves each matched anchor to its
        # enclosing tag via find_tag_in_ancestors and hands that tag to the Extractor.
        # Results for which the Extractor returns nil/false are not yielded.
        #
        # @yieldparam [Hash] The scraped article hash
        # @return [Enumerator] Enumerator for the scraped articles (when no block is given)
        def each
          return enum_for(:each) unless block_given?

          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
            parsed_body.css(selector).each do |selected_tag|
              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
              article_hash = Extractor.new(article_tag, url: @url).call

              yield article_hash if article_hash
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
  class AutoSource
    ##
    # The Scraper module contains all scrapers that can be used to extract articles.
    # Each scraper class should implement an `articles?(parsed_body)` method that returns true
    # if the scraper can potentially be used to extract articles from the given HTML, and
    # should be constructible with `new(parsed_body, url:)`.
    # Scraper instances should implement an `each` method that yields one article hash per
    # found article.
    #
    module Scraper
      SCRAPERS = [
        Schema,
        SemanticHtml
      ].freeze

      ##
      # Error raised when no suitable scraper is found.
      class NoScraperFound < Html2rss::Error; end

      ##
      # Returns an array of scrapers that claim to find articles in the parsed body.
      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
      # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
      # @raise [NoScraperFound] if none of the SCRAPERS claims to find articles.
      def self.from(parsed_body)
        applicable = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
        return applicable unless applicable.empty?

        raise NoScraperFound, 'No suitable scraper found for URL.'
      end
    end
  end
end
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'parallel'
|
5
|
+
require 'addressable'
|
6
|
+
|
7
|
+
module Html2rss
  ##
  # Builds an RSS feed for a URL without needing a feed config.
  # The page behind the URL is requested and parsed, every applicable Scraper
  # collects articles from it, and the RssBuilder is called with channel and articles.
  class AutoSource
    class UnsupportedUrlScheme < Html2rss::Error; end
    class NoArticlesFound < Html2rss::Error; end

    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze

    ##
    # @param url [String, Addressable::URI] absolute http(s) URL of the page to scrape
    # @raise [ArgumentError] if url has an unexpected type or is not absolute
    # @raise [UnsupportedUrlScheme] if the URL scheme is not in SUPPORTED_URL_SCHEMES
    def initialize(url)
      accepted_type = url.is_a?(String) || url.is_a?(Addressable::URI)
      raise ArgumentError, 'URL must be a String or Addressable::URI' unless accepted_type

      @url = Addressable::URI.parse(url)

      raise ArgumentError, 'URL must be absolute' unless @url.absolute?
      return if SUPPORTED_URL_SCHEMES.include?(@url.scheme)

      raise UnsupportedUrlScheme, "#{@url.scheme} not supported"
    end

    ##
    # Runs Reducer and Cleanup over the scraped articles and calls the RssBuilder.
    #
    # @return the result of RssBuilder#call
    # @raise [NoArticlesFound] if the scrapers returned no articles
    def build
      raise NoArticlesFound if articles.empty?

      # NOTE(review): the return values of both calls are unused — presumably they
      # adjust `articles` in place; confirm against Reducer and Cleanup.
      Reducer.call(articles, url:)
      Cleanup.call(articles, url:, keep_different_domain: true)

      builder = Html2rss::AutoSource::RssBuilder.new(
        channel:,
        articles:
      )
      builder.call
    end

    ##
    # Collects the articles of every scraper that claims to find some in the page.
    # The result is memoized.
    #
    # @return [Array<Article>]
    def articles
      @articles ||= Scraper.from(parsed_body).flat_map { |scraper| scrape_with(scraper) }
    end

    ##
    # @return [Channel] a new Channel built from the parsed page, response, URL and articles
    def channel
      Channel.new(parsed_body, response:, url:, articles:)
    end

    private

    attr_reader :url

    # Instantiates the scraper, wraps each scraped hash in an Article via Parallel.map
    # and passes the resulting list through the Reducer before returning it.
    def scrape_with(scraper)
      scraped_articles = Parallel.map(scraper.new(parsed_body, url:).each) do |article_hash|
        Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"

        Article.new(**article_hash, scraper:)
      end

      Reducer.call(scraped_articles, url:)

      scraped_articles
    end

    # Requests the URL once and memoizes the response.
    def response
      @response ||= Html2rss::Utils.request_url(url)
    end

    # Parses the HTML body of the response using Nokogiri; the frozen document is memoized.
    # @return [Nokogiri::HTML::Document]
    def parsed_body
      @parsed_body ||= Nokogiri.HTML(response.body).freeze
    end
  end
end
|
data/lib/html2rss/cli.rb
CHANGED
@@ -2,8 +2,13 @@
|
|
2
2
|
|
3
3
|
require_relative '../html2rss'
|
4
4
|
require 'thor'
|
5
|
+
require 'addressable'
|
5
6
|
|
7
|
+
##
|
8
|
+
# The Html2rss namespace / command line interface.
|
6
9
|
module Html2rss
|
10
|
+
Log = Logger.new($stderr)
|
11
|
+
|
7
12
|
##
|
8
13
|
# The Html2rss command line interface.
|
9
14
|
class CLI < Thor
|
@@ -25,5 +30,10 @@ module Html2rss
|
|
25
30
|
params = options.to_h { |opt| opt.split('=', 2) }
|
26
31
|
puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
|
27
32
|
end
|
33
|
+
|
34
|
+
desc 'auto URL', 'automatically sources an RSS feed from the URL'
|
35
|
+
def auto(url)
  # Build the feed first, then print it to stdout.
  feed = Html2rss.auto_source(url)
  puts feed
end
|
28
38
|
end
|
29
39
|
end
|
@@ -16,7 +16,7 @@ module Html2rss
|
|
16
16
|
# @return [Set<String>] the required parameter names
|
17
17
|
def self.required_params_for_config(config)
  config.each_with_object(Set.new) do |(_, value), required_params|
    # Only String values can carry format references; other values are skipped.
    next unless value.is_a?(String)

    # Matches named references like %<name>s or %<name>d. The character class is
    # [sd] on purpose: a `|` inside the brackets would match a literal pipe.
    required_params.merge(value.scan(/%<(\w+)>[sd]/).flatten)
  end
end
|
22
22
|
|
@@ -25,7 +25,9 @@ module Html2rss
|
|
25
25
|
# @param params [Hash]
|
26
26
|
def initialize(channel, params: {})
  raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)

  # The :url entry must be present as a String or an Addressable::URI.
  url = channel[:url]
  url_given = url.is_a?(String) || url.is_a?(Addressable::URI)
  raise ArgumentError, 'missing key :url' unless url_given

  # Params may arrive with String keys; they are symbolized before processing.
  symbolized_params = params.transform_keys(&:to_sym)
  @config = process_params(channel, symbolized_params)
end
|
@@ -10,6 +10,9 @@ module Html2rss
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
11
|
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
|
12
12
|
|
13
|
+
# raised when an invalid selector name is used
|
14
|
+
class InvalidSelectorName < Html2rss::Error; end
|
15
|
+
|
13
16
|
##
|
14
17
|
# @param config [Hash<Symbol, Object>]
|
15
18
|
def initialize(config)
|
@@ -28,9 +31,15 @@ module Html2rss
|
|
28
31
|
# @param name [Symbol]
|
29
32
|
# @return [Selector]
|
30
33
|
def selector(name)
  raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)

  selector_config = config[name]

  # Compare against the unfiltered config: after slicing, no unknown key is left
  # and the warning could never be triggered.
  additional_keys = selector_config.keys - available_keys
  if additional_keys.any?
    Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
  end

  # Only keys known to the Selector struct are passed on.
  Selector.new(selector_config.slice(*available_keys))
end
|
35
44
|
|
36
45
|
##
|
@@ -86,6 +95,8 @@ module Html2rss
|
|
86
95
|
array.map!(&:to_sym)
|
87
96
|
end.to_set
|
88
97
|
end
|
98
|
+
|
99
|
+
# Memoized member names of the Selector struct.
def available_keys
  @available_keys ||= Selector.members
end
|
89
100
|
end
|
90
101
|
end
|
91
102
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -23,7 +23,8 @@ module Html2rss
|
|
23
23
|
# @param config [Html2rss::Config] Configuration object.
|
24
24
|
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
25
|
def self.from_url(url, config)
|
26
|
-
body = Utils.
|
26
|
+
body = Utils.request_url(url, headers: config.headers).body
|
27
|
+
body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
|
27
28
|
|
28
29
|
Nokogiri.HTML(body)
|
29
30
|
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
@@ -47,6 +48,7 @@ module Html2rss
|
|
47
48
|
# @param method_name [Symbol]
|
48
49
|
# @param _include_private [true, false]
|
49
50
|
# @return [true, false]
|
51
|
+
# :reek:BooleanParameter { enabled: false }
|
50
52
|
def respond_to_missing?(method_name, _include_private = false)
  # Names the config knows as selectors are reported first; anything else is left to super.
  known_selector = config.selector?(method_name)
  known_selector || super
end
|
@@ -110,7 +112,11 @@ module Html2rss
|
|
110
112
|
#
|
111
113
|
# @return [Array<String>] list of categories.
|
112
114
|
def categories
  # Strip before checking for emptiness so that nil, empty and whitespace-only
  # values are all dropped instead of ending up as "" in the result.
  stripped = config.category_selector_names.filter_map do |method_name|
    category = public_send(method_name).to_s.strip
    category unless category.empty?
  end

  stripped.uniq
end
|
115
121
|
|
116
122
|
##
|
data/lib/html2rss/utils.rb
CHANGED
@@ -31,12 +31,12 @@ module Html2rss
|
|
31
31
|
##
|
32
32
|
# Collapses whitespace runs into a single space, strips the result, then parses and normalizes the given url.
|
33
33
|
# @param url [String]
|
34
|
-
# @return [
|
34
|
+
# @return [Addressable::URI, nil] normalized URL, or nil if input is empty
|
35
35
|
def self.sanitize_url(url)
  # Whitespace runs become a single space; leading and trailing whitespace is removed.
  squished = url.to_s.gsub(/\s+/, ' ').strip
  return if squished.empty?

  parsed = Addressable::URI.parse(squished)
  parsed.normalize
end
|
41
41
|
|
42
42
|
##
|
@@ -71,18 +71,13 @@ module Html2rss
|
|
71
71
|
|
72
72
|
##
|
73
73
|
# @param url [String, Addressable::URI]
|
74
|
-
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
74
|
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
-
# @return [
|
77
|
-
def self.
|
78
|
-
|
75
|
+
# @return [Faraday::Response] the HTTP response (use `#body` to get its body)
|
76
|
+
def self.request_url(url, headers: {})
  # Connection with the FollowRedirects middleware on Faraday's default adapter.
  connection = Faraday.new(url:, headers:) do |faraday|
    faraday.use Faraday::FollowRedirects::Middleware
    faraday.adapter Faraday.default_adapter
  end

  connection.get
end
|
87
82
|
|
88
83
|
##
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
|
|
6
6
|
loader.setup
|
7
7
|
|
8
8
|
require 'yaml'
|
9
|
+
require 'logger'
|
9
10
|
|
10
11
|
##
|
11
12
|
# The Html2rss namespace.
|
12
13
|
module Html2rss
|
14
|
+
##
|
15
|
+
# The logger instance.
|
16
|
+
Log = Logger.new($stdout)
|
17
|
+
|
18
|
+
Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym
|
19
|
+
|
20
|
+
Log.formatter = proc do |severity, datetime, _progname, msg|
|
21
|
+
"#{datetime} [#{severity}] #{msg}\n"
|
22
|
+
end
|
23
|
+
|
13
24
|
##
|
14
25
|
# The Html2rss::Error base class.
|
15
26
|
class Error < StandardError; end
|
@@ -91,5 +102,15 @@ module Html2rss
|
|
91
102
|
end
|
92
103
|
end
|
93
104
|
|
105
|
+
##
|
106
|
+
# Scrapes the provided URL and returns an RSS object.
|
107
|
+
# No need for a "feed config".
|
108
|
+
#
|
109
|
+
# @param url [String] the URL to automatically source the feed from
|
110
|
+
# @return [RSS::Rss]
|
111
|
+
def self.auto_source(url)
  # AutoSource validates the URL on construction; #build returns the feed.
  source = Html2rss::AutoSource.new(url)
  source.build
end
|
114
|
+
|
94
115
|
private_class_method :load_yaml, :find_feed_config
|
95
116
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.13.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -106,6 +106,20 @@ dependencies:
|
|
106
106
|
- - "<"
|
107
107
|
- !ruby/object:Gem::Version
|
108
108
|
version: '2.0'
|
109
|
+
- !ruby/object:Gem::Dependency
|
110
|
+
name: parallel
|
111
|
+
requirement: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
type: :runtime
|
117
|
+
prerelease: false
|
118
|
+
version_requirements: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
109
123
|
- !ruby/object:Gem::Dependency
|
110
124
|
name: regexp_parser
|
111
125
|
requirement: !ruby/object:Gem::Requirement
|
@@ -219,6 +233,7 @@ files:
|
|
219
233
|
- html2rss.gemspec
|
220
234
|
- lib/html2rss.rb
|
221
235
|
- lib/html2rss/attribute_post_processors.rb
|
236
|
+
- lib/html2rss/attribute_post_processors/base.rb
|
222
237
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
223
238
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
224
239
|
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
@@ -229,6 +244,18 @@ files:
|
|
229
244
|
- lib/html2rss/attribute_post_processors/sanitize_html.rb
|
230
245
|
- lib/html2rss/attribute_post_processors/substring.rb
|
231
246
|
- lib/html2rss/attribute_post_processors/template.rb
|
247
|
+
- lib/html2rss/auto_source.rb
|
248
|
+
- lib/html2rss/auto_source/article.rb
|
249
|
+
- lib/html2rss/auto_source/channel.rb
|
250
|
+
- lib/html2rss/auto_source/cleanup.rb
|
251
|
+
- lib/html2rss/auto_source/reducer.rb
|
252
|
+
- lib/html2rss/auto_source/rss_builder.rb
|
253
|
+
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/schema.rb
|
255
|
+
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
|
+
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
257
|
+
- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
|
258
|
+
- lib/html2rss/auto_source/scraper/semantic_html/image.rb
|
232
259
|
- lib/html2rss/cli.rb
|
233
260
|
- lib/html2rss/config.rb
|
234
261
|
- lib/html2rss/config/channel.rb
|
@@ -252,7 +279,7 @@ licenses:
|
|
252
279
|
- MIT
|
253
280
|
metadata:
|
254
281
|
allowed_push_host: https://rubygems.org
|
255
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
282
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
|
256
283
|
rubygems_mfa_required: 'true'
|
257
284
|
post_install_message:
|
258
285
|
rdoc_options: []
|