html2rss 0.12.0 → 0.14.0
This diff shows the changes between package versions as they were published to their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +39 -11
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +85 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +70 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +128 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +80 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/config.rb +1 -4
- data/lib/html2rss/item.rb +9 -3
- data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
- data/lib/html2rss/utils.rb +11 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +27 -11
- metadata +30 -4
data/lib/html2rss/auto_source/scraper/schema/base.rb
ADDED
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+require 'date'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class Schema
+        ##
+        # Base class for Schema.org schema_objects.
+        #
+        # @see https://schema.org/Article
+        class Base
+          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+          def initialize(schema_object, url:)
+            @schema_object = schema_object
+            @url = url
+          end
+
+          # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+          def call
+            DEFAULT_ATTRIBUTES.to_h do |attribute|
+              [attribute, public_send(attribute)]
+            end
+          end
+
+          def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+          def title = schema_object[:title]
+
+          def description
+            [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+              .max_by { |desc| desc.to_s.size }
+          end
+
+          # @return [Addressable::URI, nil] the URL of the schema object
+          def url
+            url = schema_object[:url]
+            if url.to_s.empty?
+              Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+              return
+            end
+
+            Utils.build_absolute_url_from_relative(url, @url)
+          end
+
+          def image = images.first || nil
+          def published_at = schema_object[:datePublished]
+
+          private
+
+          attr_reader :schema_object
+
+          def images
+            Array(schema_object[:image]).compact
+          end
+        end
+      end
+    end
+  end
+end
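A minimal usage sketch (not part of the diff; the class path is taken from the file above, the schema object and URL are made up) of how `Schema::Base` maps an already-parsed JSON-LD object onto the article hash:

```ruby
require 'html2rss'
require 'addressable'

# Hypothetical JSON-LD object, parsed with symbolized keys.
schema_object = {
  :@type => 'NewsArticle',
  title: 'Example headline',
  description: 'Example description',
  url: '/articles/example',
  datePublished: '2024-07-01'
}

base = Html2rss::AutoSource::Scraper::Schema::Base.new(
  schema_object,
  url: Addressable::URI.parse('https://example.com')
)

base.call
# => { id: "/articles/example", title: "Example headline",
#      description: "Example description",
#      url: #<Addressable::URI https://example.com/articles/example>,
#      image: nil, published_at: "2024-07-01" }
```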
data/lib/html2rss/auto_source/scraper/schema.rb
ADDED
@@ -0,0 +1,128 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'nokogiri'
+require 'set'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scraps articles from Schema.org objects, by looking for the objects in:
+
+      # 1. <script type="application/ld+json"> "schema" tag.
+      # 2. tbd
+      #
+      # See:
+      # 1. https://schema.org/NewsArticle
+      # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+      class Schema
+        include Enumerable
+
+        TAG_SELECTOR = 'script[type="application/ld+json"]'
+        SCHEMA_OBJECT_TYPES = %w[
+          AdvertiserContentArticle
+          AnalysisNewsArticle
+          APIReference
+          Article
+          AskPublicNewsArticle
+          BackgroundNewsArticle
+          BlogPosting
+          DiscussionForumPosting
+          LiveBlogPosting
+          NewsArticle
+          OpinionNewsArticle
+          Report
+          ReportageNewsArticle
+          ReviewNewsArticle
+          SatiricalArticle
+          ScholarlyArticle
+          SocialMediaPosting
+          TechArticle
+        ].to_set.freeze
+
+        class << self
+          def articles?(parsed_body)
+            parsed_body.css(TAG_SELECTOR).any? do |script|
+              SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+            end
+          end
+
+          ##
+          # Returns a flat array
+          # of all supported schema objects
+          # by recursively traversing the `from` object.
+          #
+          # @param object [Hash, Array]
+          # @return [Array<Hash>] the schema_objects, or an empty array
+          # :reek:DuplicateMethodCall
+          def from(object)
+            case object
+            when Nokogiri::XML::Element
+              from(parse_script_tag(object))
+            when Hash
+              supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+            when Array
+              object.flat_map { |item| from(item) }
+            else
+              []
+            end
+          end
+
+          def supported_schema_object?(object)
+            scraper_for_schema_object(object) ? true : false
+          end
+
+          ##
+          # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+          def scraper_for_schema_object(schema_object)
+            if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+              Base
+            else
+              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+              nil
+            end
+          end
+
+          private
+
+          def parse_script_tag(script_tag)
+            JSON.parse(script_tag.text, symbolize_names: true)
+          rescue JSON::ParserError => error
+            Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+            []
+          end
+        end
+
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+        end
+
+        ##
+        # @yield [Hash] Each scraped article_hash
+        # @return [Array<Hash>] the scraped article_hashes
+        def each(&)
+          return enum_for(:each) unless block_given?
+
+          schema_objects.filter_map do |schema_object|
+            next unless (klass = self.class.scraper_for_schema_object(schema_object))
+            next unless (article_hash = klass.new(schema_object, url:).call)
+
+            yield article_hash
+          end
+        end
+
+        private
+
+        def schema_objects
+          @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+            Schema.from(tag)
+          end
+        end
+
+        attr_reader :parsed_body, :url
+      end
+    end
+  end
+end
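A sketch (data made up) of how `Schema.from` recursively collects supported objects from nested JSON-LD; unsupported `@type` values are logged as warnings and traversed into:

```ruby
nested = {
  :@context => 'https://schema.org',
  :@graph => [
    { :@type => 'WebPage' },
    { :@type => 'NewsArticle', title: 'Hello' }
  ]
}

Html2rss::AutoSource::Scraper::Schema.from(nested)
# => [{ :@type => "NewsArticle", title: "Hello" }]
```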
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
ADDED
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+
+require 'set'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # ArticleExtractor is responsible for extracting the details of an article.
+        # It focuses on finding a headline first, and from it traverse as much as possible,
+        # to find the DOM upwards to find the other details.
+        class Extractor
+          INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+          HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+          NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                   INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+          def self.visible_text_from_tag(tag, separator: ' ')
+            text = if (children = tag.children).empty?
+                     tag.text.strip
+                   else
+                     children.filter_map do |child|
+                       next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                       visible_text_from_tag(child)
+                     end.join(separator)
+                   end
+
+            return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+            sanitized_text
+          end
+
+          def initialize(article_tag, url:)
+            @article_tag = article_tag
+            @url = url
+            @heading = find_heading
+            @extract_url = find_url
+          end
+
+          # @return [Hash, nil] The scraped article or nil.
+          def call
+            return unless heading
+
+            {
+              title: extract_title,
+              url: extract_url,
+              image: extract_image,
+              description: extract_description,
+              id: generate_id,
+              published_at: extract_published_at
+            }
+          end
+
+          private
+
+          attr_reader :article_tag, :url, :heading, :extract_url
+
+          def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+          # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+          def extract_published_at
+            times = article_tag.css('time[datetime]')
+                               .filter_map do |tag|
+              DateTime.parse(tag['datetime'])
+            rescue ArgumentError, TypeError
+              nil
+            end
+
+            times.min
+          end
+
+          def find_heading
+            heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+            smallest_heading = heading_tags.keys.min
+            heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
+          end
+
+          def extract_title
+            @extract_title ||= if heading.children.empty? && heading.text
+                                 visible_text_from_tag(heading)
+                               else
+                                 visible_text_from_tag(
+                                   article_tag.css(HEADING_TAGS.join(','))
+                                              .max_by { |tag| tag.text.size }
+                                 )
+                               end
+          end
+
+          def extract_description
+            text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+            return text if text
+
+            description = visible_text_from_tag(article_tag)
+            return nil unless description
+
+            title_text = extract_title
+            description.gsub!(title_text, '') if title_text
+            description.strip!
+            description.empty? ? nil : description
+          end
+
+          def find_url
+            closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                selector: 'a[href]:not([href=""])')
+            href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+            Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+          end
+
+          def extract_image
+            Image.call(article_tag, url:)
+          end
+
+          def generate_id
+            [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+             extract_url&.path].compact.reject(&:empty?).first
+          end
+        end
+      end
+    end
+  end
+end
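The text extraction can be exercised directly; a sketch (markup made up) showing that invisible tags are skipped and whitespace is collapsed:

```ruby
require 'nokogiri'

html = '<article><h2>Title</h2><script>ignored()</script><p>Body  text</p></article>'
tag = Nokogiri::HTML(html).at_css('article')

Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.visible_text_from_tag(tag)
# => "Title Body text"
```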
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb
ADDED
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # Image is responsible for extracting image URLs the article_tag.
+        class Image
+          def self.call(article_tag, url:)
+            img_src = from_source(article_tag) ||
+                      from_img(article_tag) ||
+                      from_style(article_tag)
+
+            Utils.build_absolute_url_from_relative(img_src, url) if img_src
+          end
+
+          def self.from_img(article_tag)
+            article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+          end
+
+          ##
+          # Extracts the largest image source from the srcset attribute
+          # of an img tag or a source tag inside a picture tag.
+          #
+          # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+          # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+          # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+          def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+            hash = article_tag.css('img[srcset], picture > source[srcset]')
+                              .flat_map { |source| source['srcset'].to_s.split(',') }
+                              .filter_map do |line|
+              width, url = line.split.reverse
+              next if url.nil? || url.start_with?('data:')
+
+              width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+              [width_value, url.strip]
+            end.to_h
+
+            hash[hash.keys.max]
+          end
+
+          def self.from_style(article_tag)
+            article_tag.css('[style*="url"]')
+                       .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                       .reject { |src| !src || src.start_with?('data:') }
+                       .max_by(&:size)
+          end
+        end
+      end
+    end
+  end
+end
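A sketch of the srcset handling (markup made up): `from_source` keeps the candidate with the largest declared width:

```ruby
html = <<~HTML
  <article><img src="fallback.jpg" srcset="small.jpg 480w, large.jpg 1024w"></article>
HTML
article_tag = Nokogiri::HTML(html).at_css('article')

Html2rss::AutoSource::Scraper::SemanticHtml::Image.from_source(article_tag)
# => "large.jpg"
```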
data/lib/html2rss/auto_source/scraper/semantic_html.rb
ADDED
@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+
+require 'addressable'
+require 'parallel'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles by looking for common markup tags (article, section, li)
+      # containing an <a href> tag.
+      #
+      # See:
+      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+      class SemanticHtml
+        include Enumerable
+
+        ##
+        # Map of parent element names to CSS selectors for finding <a href> tags.
+        ANCHOR_TAG_SELECTORS = {
+          'section' => ['section :not(section) a[href]'],
+          'tr' => ['table tr :not(tr) a[href]'],
+          'article' => [
+            'article :not(article) a[href]',
+            'article a[href]'
+          ],
+          'li' => [
+            'ul > li :not(li) a[href]',
+            'ol > li :not(li) a[href]'
+          ]
+        }.freeze
+
+        # Check if the parsed_body contains articles
+        # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
+        # @return [Boolean] True if articles are found, otherwise false.
+        def self.articles?(parsed_body)
+          return false unless parsed_body
+
+          ANCHOR_TAG_SELECTORS.each_value do |selectors|
+            return true if selectors.any? { |selector| parsed_body.at_css(selector) }
+          end
+          false
+        end
+
+        # Finds the closest ancestor tag matching the specified tag name
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param tag_name [String] The tag name to search for
+        # @param stop_tag [String] The tag name to stop searching at
+        # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
+        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
+          return current_tag if current_tag.name == tag_name
+
+          stop_tags = Set[tag_name, stop_tag]
+
+          while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
+            current_tag = current_tag.parent
+          end
+
+          current_tag
+        end
+
+        # Finds the closest matching selector upwards in the DOM tree
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
+          current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+        end
+
+        # Helper method to find a matching selector upwards
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector_upwards(current_tag, selector:)
+          while current_tag
+            found = current_tag.at_css(selector)
+            return found if found
+
+            return nil unless current_tag.respond_to?(:parent)
+
+            current_tag = current_tag.parent
+          end
+        end
+
+        # Returns an array of [tag_name, selector] pairs
+        # @return [Array<[String, String]>] Array of tag name and selector pairs
+        def self.anchor_tag_selector_pairs
+          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
+            selectors.map { |selector| [tag_name, selector] }
+          end
+        end
+
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+        end
+
+        attr_reader :parsed_body
+
+        ##
+        # @yieldparam [Hash] The scraped article hash
+        # @return [Enumerator] Enumerator for the scraped articles
+        def each
+          return enum_for(:each) unless block_given?
+
+          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
+            parsed_body.css(selector).each do |selected_tag|
+              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+              article_hash = Extractor.new(article_tag, url: @url).call
+
+              yield article_hash if article_hash
+            end
+          end
+        end
+      end
+    end
+  end
+end
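The upwards traversal can be seen in isolation (sketch, markup made up): the heading itself contains no anchor, so the search walks up the ancestors until the enclosing `<li>` yields one:

```ruby
html = '<ul><li><a href="/post/1"><h3>Post 1</h3></a></li></ul>'
heading = Nokogiri::HTML(html).at_css('h3')

anchor = Html2rss::AutoSource::Scraper::SemanticHtml.find_closest_selector(heading)
anchor['href']
# => "/post/1"
```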
data/lib/html2rss/auto_source/scraper.rb
ADDED
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class AutoSource
+    ##
+    # The Scraper module contains all scrapers that can be used to extract articles.
+    # Each scraper should implement a `call` method that returns an array of article hashes.
+    # Each scraper should also implement an `articles?` method that returns true if the scraper
+    # can potentially be used to extract articles from the given HTML.
+    #
+    module Scraper
+      SCRAPERS = [
+        Schema,
+        SemanticHtml
+      ].freeze
+
+      ##
+      # Error raised when no suitable scraper is found.
+      class NoScraperFound < Html2rss::Error; end
+
+      ##
+      # Returns an array of scrapers that claim to find articles in the parsed body.
+      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+      # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
+      def self.from(parsed_body)
+        scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
+        raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
+
+        scrapers
+      end
+    end
+  end
+end
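`Scraper.from` simply asks each scraper whether it sees articles in the document (sketch, document made up):

```ruby
doc = Nokogiri::HTML('<article><a href="/a"><h2>A</h2></a></article>')

Html2rss::AutoSource::Scraper.from(doc)
# => [Html2rss::AutoSource::Scraper::SemanticHtml]
```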
data/lib/html2rss/auto_source.rb
ADDED
@@ -0,0 +1,80 @@
+# frozen_string_literal: true
+
+require 'nokogiri'
+require 'parallel'
+require 'addressable'
+
+module Html2rss
+  ##
+  # The AutoSource class is responsible for extracting channel and articles
+  # from a given URL.
+  # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
+  # marking articles, e.g. schema, microdata, open graph, etc.
+  class AutoSource
+    class UnsupportedUrlScheme < Html2rss::Error; end
+    class NoArticlesFound < Html2rss::Error; end
+
+    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+
+    ##
+    # @param url [Addressable::URI] The URL to extract articles from.
+    # @param body [String] The body of the response.
+    # @param headers [Hash] The headers of the response.
+    def initialize(url, body:, headers: {})
+      raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
+      raise ArgumentError, 'URL must be absolute' unless url.absolute?
+      raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
+
+      @url = url
+      @body = body
+      @headers = headers
+    end
+
+    def build
+      raise NoArticlesFound if articles.empty?
+
+      Reducer.call(articles, url:)
+      Cleanup.call(articles, url:, keep_different_domain: true)
+
+      channel.articles = articles
+
+      Html2rss::AutoSource::RssBuilder.new(
+        channel:,
+        articles:
+      ).call
+    end
+
+    def articles
+      @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
+        instance = scraper.new(parsed_body, url:)
+
+        articles_in_thread = Parallel.map(instance.each) do |article_hash|
+          Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+
+          Article.new(**article_hash, scraper:)
+        end
+
+        Reducer.call(articles_in_thread, url:)
+
+        articles_in_thread
+      end
+    end
+
+    def channel
+      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
+    end
+
+    private
+
+    attr_reader :url
+
+    # @return [Nokogiri::HTML::Document]
+    def parsed_body
+      @parsed_body ||= Nokogiri.HTML(@body)
+                               .tap do |doc|
+        # Remove comments from the document
+        doc.xpath('//comment()').each(&:remove)
+      end.freeze
+    end
+  end
+end
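Putting it together, a sketch of building a feed from a page you fetched yourself (`fetch_the_page_somehow` is a hypothetical helper; bring your own HTTP client, and note `NoArticlesFound` is raised when nothing is scraped):

```ruby
require 'html2rss'
require 'addressable'

url  = Addressable::URI.parse('https://example.com/news')
body = fetch_the_page_somehow(url) # hypothetical; returns the HTML response body

puts Html2rss::AutoSource.new(url, body: body).build
```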
data/lib/html2rss/cli.rb
CHANGED
@@ -2,8 +2,13 @@
 
 require_relative '../html2rss'
 require 'thor'
+require 'addressable'
 
+##
+# The Html2rss namespace / command line interface.
 module Html2rss
+  Log = Logger.new($stderr)
+
   ##
   # The Html2rss command line interface.
   class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
       params = options.to_h { |opt| opt.split('=', 2) }
      puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
    end
+
+    desc 'auto URL', 'automatically sources an RSS feed from the URL'
+    def auto(url)
+      puts Html2rss.auto_source(url)
+    end
   end
 end
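The new `auto` subcommand wraps `Html2rss.auto_source`, so the Ruby equivalent of running `html2rss auto URL` is:

```ruby
require 'html2rss'

puts Html2rss.auto_source('https://example.com/news')
```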
data/lib/html2rss/config/channel.rb
CHANGED
@@ -16,7 +16,7 @@ module Html2rss
   # @return [Set<String>] the required parameter names
   def self.required_params_for_config(config)
     config.each_with_object(Set.new) do |(_, value), required_params|
-      required_params.merge(value.scan(/%<([
+      required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
     end
   end
 
@@ -25,7 +25,9 @@ module Html2rss
   # @param params [Hash]
   def initialize(channel, params: {})
     raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
-
+
+    url = channel[:url]
+    raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI)
 
     @config = process_params(channel, params.transform_keys(&:to_sym))
   end
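A sketch of what the tightened scan collects (hypothetical config; the `Html2rss::Config::Channel` constant path is assumed from the file listing):

```ruby
config = { url: 'https://example.com/%<id>s', title: 'Example' }

Html2rss::Config::Channel.required_params_for_config(config)
# => #<Set: {"id"}>
```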
data/lib/html2rss/config/selectors.rb
CHANGED
@@ -35,8 +35,8 @@ module Html2rss
 
       keywords = config[name].slice(*available_keys)
 
-      if (additional_keys =
-        warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
+      if (additional_keys = keywords.keys - available_keys).any?
+        Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
       end
 
       Selector.new(keywords)
data/lib/html2rss/config.rb
CHANGED
@@ -18,9 +18,6 @@ module Html2rss
     # Thrown when the feed config does not contain a value at `:channel`.
     class ChannelMissing < Html2rss::Error; end
 
-    # Struct to store XML Stylesheet attributes
-    Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
-
     def_delegator :@channel, :author
     def_delegator :@channel, :ttl
     def_delegator :@channel, :title
@@ -75,7 +72,7 @@ module Html2rss
     #
     # @return [Array<Stylesheet>] Array of Stylesheet structs.
     def stylesheets
-      @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
+      @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
    end
 
    # Provides read-only access to the channel object.