html2rss 0.12.0 → 0.13.0
- checksums.yaml +4 -4
- data/README.md +38 -10
- data/html2rss.gemspec +1 -0
- data/lib/html2rss/attribute_post_processors/base.rb +9 -6
- data/lib/html2rss/attribute_post_processors/gsub.rb +2 -2
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +2 -2
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +2 -2
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +2 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +13 -2
- data/lib/html2rss/attribute_post_processors/substring.rb +3 -3
- data/lib/html2rss/attribute_post_processors/template.rb +4 -4
- data/lib/html2rss/auto_source/article.rb +95 -0
- data/lib/html2rss/auto_source/channel.rb +79 -0
- data/lib/html2rss/auto_source/cleanup.rb +76 -0
- data/lib/html2rss/auto_source/reducer.rb +48 -0
- data/lib/html2rss/auto_source/rss_builder.rb +68 -0
- data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
- data/lib/html2rss/auto_source/scraper.rb +33 -0
- data/lib/html2rss/auto_source.rb +77 -0
- data/lib/html2rss/cli.rb +10 -0
- data/lib/html2rss/config/channel.rb +4 -2
- data/lib/html2rss/config/selectors.rb +2 -2
- data/lib/html2rss/item.rb +8 -2
- data/lib/html2rss/utils.rb +5 -10
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +21 -0
- metadata +29 -3
data/lib/html2rss/auto_source/scraper/schema/base.rb
ADDED
@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+
+require 'date'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class Schema
+        ##
+        # Base class for Schema.org schema_objects.
+        #
+        # @see https://schema.org/Article
+        class Base
+          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+
+          def initialize(schema_object, url:)
+            @schema_object = schema_object
+            @url = url
+          end
+
+          # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+          def call
+            DEFAULT_ATTRIBUTES.to_h do |attribute|
+              [attribute, public_send(attribute)]
+            end
+          end
+
+          def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
+          def title = schema_object[:title]
+
+          def description
+            [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
+              .max_by { |desc| desc.to_s.size }
+          end
+
+          # @return [Addressable::URI, nil] the URL of the schema object
+          def url
+            url = schema_object[:url]
+            if url.to_s.empty?
+              Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
+              return
+            end
+
+            Utils.build_absolute_url_from_relative(url, @url)
+          end
+
+          def image = images.first || nil
+          def published_at = schema_object[:datePublished]
+
+          private
+
+          attr_reader :schema_object
+
+          def images
+            Array(schema_object[:image]).compact
+          end
+        end
+      end
+    end
+  end
+end
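A minimal sketch of how `Schema::Base` maps an already-parsed JSON-LD object onto an article hash; the input object and URL are invented for illustration:

```ruby
require 'html2rss'
require 'addressable'

# Hypothetical JSON-LD object, parsed with symbolized keys (as Schema does).
schema_object = {
  '@type': 'Article',
  '@id': '/articles/42',
  title: 'Example headline',
  description: 'A short teaser.',
  url: '/articles/42',
  datePublished: '2024-07-01T12:00:00Z'
}

base = Html2rss::AutoSource::Scraper::Schema::Base.new(
  schema_object, url: Addressable::URI.parse('https://example.com')
)
base.call
# => hash with the keys :id, :title, :description, :url, :image, :published_at;
#    :url is absolutized to https://example.com/articles/42, :image is nil here
```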
data/lib/html2rss/auto_source/scraper/schema.rb
ADDED
@@ -0,0 +1,122 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles from Schema.org objects, by looking for the objects in:
+      #
+      # 1. <script type="application/ld+json"> "schema" tag.
+      # 2. tbd
+      #
+      # See:
+      # 1. https://schema.org/NewsArticle
+      # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+      class Schema
+        include Enumerable
+
+        TAG_SELECTOR = 'script[type="application/ld+json"]'
+        SCHEMA_OBJECT_TYPES = %w[
+          AdvertiserContentArticle
+          AnalysisNewsArticle
+          APIReference
+          Article
+          AskPublicNewsArticle
+          BackgroundNewsArticle
+          BlogPosting
+          DiscussionForumPosting
+          LiveBlogPosting
+          NewsArticle
+          OpinionNewsArticle
+          Report
+          ReportageNewsArticle
+          ReviewNewsArticle
+          SatiricalArticle
+          ScholarlyArticle
+          SocialMediaPosting
+          TechArticle
+        ].to_set.freeze
+
+        class << self
+          def articles?(parsed_body)
+            parsed_body.css(TAG_SELECTOR).any? do |script|
+              SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+            end
+          end
+
+          ##
+          # Returns a flat array
+          # of all supported schema objects
+          # by recursively traversing the `from` object.
+          #
+          # @param object [Hash, Array]
+          # @return [Array<Hash>] the schema_objects, or an empty array
+          # :reek:DuplicateMethodCall
+          def from(object)
+            case object
+            when Nokogiri::XML::Element
+              from(parse_script_tag(object))
+            when Hash
+              supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) }
+            when Array
+              object.flat_map { |item| from(item) }
+            else
+              []
+            end
+          end
+
+          def supported_schema_object?(object)
+            scraper_for_schema_object(object) ? true : false
+          end
+
+          ##
+          # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+          def scraper_for_schema_object(schema_object)
+            if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
+              Base
+            else
+              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+              nil
+            end
+          end
+
+          private
+
+          def parse_script_tag(script_tag)
+            JSON.parse(script_tag.text, symbolize_names: true)
+          rescue JSON::ParserError => error
+            Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+            []
+          end
+        end
+
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+        end
+
+        ##
+        # @yield [Hash] Each scraped article_hash
+        # @return [Array<Hash>] the scraped article_hashes
+        def each(&)
+          schema_objects.filter_map do |schema_object|
+            next unless (klass = self.class.scraper_for_schema_object(schema_object))
+            next unless (article_hash = klass.new(schema_object, url:).call)
+
+            yield article_hash
+          end
+        end
+
+        private
+
+        def schema_objects
+          @parsed_body.css(TAG_SELECTOR).flat_map do |tag|
+            Schema.from(tag)
+          end
+        end
+
+        attr_reader :parsed_body, :url
+      end
+    end
+  end
+end
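To see what the recursive `from` traversal does, a hedged sketch with an invented payload (assuming the gem is loaded; unsupported `@type`s are logged as warnings while their values are still traversed):

```ruby
require 'html2rss'

# Hypothetical nested JSON-LD payload.
payload = {
  '@context': 'https://schema.org',
  '@graph': [
    { '@type': 'WebPage', name: 'not an article type' },
    { '@type': 'NewsArticle', title: 'Kept' },
    { items: [{ '@type': 'BlogPosting', title: 'Also kept' }] }
  ]
}

Html2rss::AutoSource::Scraper::Schema.from(payload)
# => [{ '@type': 'NewsArticle', title: 'Kept' },
#     { '@type': 'BlogPosting', title: 'Also kept' }]
```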
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
ADDED
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+
+require 'set'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # ArticleExtractor is responsible for extracting the details of an article.
+        # It focuses on finding a headline first, and from it traverses the DOM
+        # upwards as far as possible to find the other details.
+        class Extractor
+          INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
+          HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
+          NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
+                                   INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
+
+          def self.visible_text_from_tag(tag, separator: ' ')
+            text = if (children = tag.children).empty?
+                     tag.text.strip
+                   else
+                     children.filter_map do |child|
+                       next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
+
+                       visible_text_from_tag(child)
+                     end.join(separator)
+                   end
+
+            return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
+
+            sanitized_text
+          end
+
+          def initialize(article_tag, url:)
+            @article_tag = article_tag
+            @url = url
+            @heading = find_heading
+            @extract_url = find_url
+          end
+
+          # @return [Hash, nil] The scraped article or nil.
+          def call
+            return unless heading
+
+            {
+              title: extract_title,
+              url: extract_url,
+              image: extract_image,
+              description: extract_description,
+              id: generate_id,
+              published_at: extract_published_at
+            }
+          end
+
+          private
+
+          attr_reader :article_tag, :url, :heading, :extract_url
+
+          def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+
+          # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+          def extract_published_at
+            times = article_tag.css('time[datetime]')
+                               .filter_map do |tag|
+              DateTime.parse(tag['datetime'])
+            rescue ArgumentError, TypeError
+              nil
+            end
+
+            times.min
+          end
+
+          def find_heading
+            heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+            smallest_heading = heading_tags.keys.min
+            heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
+          end
+
+          def extract_title
+            @extract_title ||= if heading.children.empty? && heading.text
+                                 visible_text_from_tag(heading)
+                               else
+                                 visible_text_from_tag(
+                                   article_tag.css(HEADING_TAGS.join(','))
+                                              .max_by { |tag| tag.text.size }
+                                 )
+                               end
+          end
+
+          def extract_description
+            text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
+            return text if text
+
+            description = visible_text_from_tag(article_tag)
+            return nil unless description
+
+            title_text = extract_title
+            description.gsub!(title_text, '') if title_text
+            description.strip!
+            description.empty? ? nil : description
+          end
+
+          def find_url
+            closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
+                                                                selector: 'a[href]:not([href=""])')
+            href = closest_anchor&.[]('href')&.split('#')&.first&.strip
+            Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
+          end
+
+          def extract_image
+            Image.call(article_tag, url:)
+          end
+
+          def generate_id
+            [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
+             extract_url&.path].compact.reject(&:empty?).first
+          end
+        end
+      end
+    end
+  end
+end
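The behavior of `visible_text_from_tag` is easiest to see on a small fragment: tags from INVISIBLE_CONTENT_TAG_SELECTORS are skipped and whitespace is collapsed (markup invented for illustration):

```ruby
require 'html2rss'
require 'nokogiri'

html = <<~HTML
  <article>
    <h2>Example headline</h2>
    <script>trackPageView();</script>
    <p>Visible   teaser text.</p>
  </article>
HTML

article = Nokogiri::HTML(html).at_css('article')
Html2rss::AutoSource::Scraper::SemanticHtml::Extractor.visible_text_from_tag(article)
# => "Example headline Visible teaser text."
```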
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb
ADDED
@@ -0,0 +1,54 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # Image is responsible for extracting image URLs from the article_tag.
+        class Image
+          def self.call(article_tag, url:)
+            img_src = from_source(article_tag) ||
+                      from_img(article_tag) ||
+                      from_style(article_tag)
+
+            Utils.build_absolute_url_from_relative(img_src, url) if img_src
+          end
+
+          def self.from_img(article_tag)
+            article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
+          end
+
+          ##
+          # Extracts the largest image source from the srcset attribute
+          # of an img tag or a source tag inside a picture tag.
+          #
+          # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
+          # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
+          # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
+          def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
+            hash = article_tag.css('img[srcset], picture > source[srcset]')
+                              .flat_map { |source| source['srcset'].to_s.split(',') }
+                              .filter_map do |line|
+              width, url = line.split.reverse
+              next if url.nil? || url.start_with?('data:')
+
+              width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+
+              [width_value, url.strip]
+            end.to_h
+
+            hash[hash.keys.max]
+          end
+
+          def self.from_style(article_tag)
+            article_tag.css('[style*="url"]')
+                       .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
+                       .reject { |src| !src || src.start_with?('data:') }
+                       .max_by(&:size)
+          end
+        end
+      end
+    end
+  end
+end
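Given a `srcset` with several candidates, `Image.call` picks the widest one before absolutizing it; a sketch with invented markup and URL:

```ruby
require 'html2rss'
require 'nokogiri'
require 'addressable'

html = <<~HTML
  <article>
    <picture>
      <source srcset="/img/small.jpg 480w, /img/large.jpg 1024w">
      <img src="/img/fallback.jpg">
    </picture>
  </article>
HTML

article = Nokogiri::HTML(html).at_css('article')
Html2rss::AutoSource::Scraper::SemanticHtml::Image.call(
  article, url: Addressable::URI.parse('https://example.com')
)
# => the absolute URL for /img/large.jpg (the 1024w candidate wins over 480w)
```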
data/lib/html2rss/auto_source/scraper/semantic_html.rb
ADDED
@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+
+require 'addressable'
+require 'parallel'
+
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles by looking for common markup tags (article, section, li)
+      # containing an <a href> tag.
+      #
+      # See:
+      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+      class SemanticHtml
+        include Enumerable
+
+        ##
+        # Map of parent element names to CSS selectors for finding <a href> tags.
+        ANCHOR_TAG_SELECTORS = {
+          'section' => ['section :not(section) a[href]'],
+          'tr' => ['table tr :not(tr) a[href]'],
+          'article' => [
+            'article :not(article) a[href]',
+            'article a[href]'
+          ],
+          'li' => [
+            'ul > li :not(li) a[href]',
+            'ol > li :not(li) a[href]'
+          ]
+        }.freeze
+
+        # Check if the parsed_body contains articles
+        # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
+        # @return [Boolean] True if articles are found, otherwise false.
+        def self.articles?(parsed_body)
+          return false unless parsed_body
+
+          ANCHOR_TAG_SELECTORS.each_value do |selectors|
+            return true if selectors.any? { |selector| parsed_body.at_css(selector) }
+          end
+          false
+        end
+
+        # Finds the closest ancestor tag matching the specified tag name
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param tag_name [String] The tag name to search for
+        # @param stop_tag [String] The tag name to stop searching at
+        # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
+        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
+          return current_tag if current_tag.name == tag_name
+
+          stop_tags = Set[tag_name, stop_tag]
+
+          while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
+            current_tag = current_tag.parent
+          end
+
+          current_tag
+        end
+
+        # Finds the closest matching selector upwards in the DOM tree
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
+          current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+        end
+
+        # Helper method to find a matching selector upwards
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector_upwards(current_tag, selector:)
+          while current_tag
+            found = current_tag.at_css(selector)
+            return found if found
+
+            return nil unless current_tag.respond_to?(:parent)
+
+            current_tag = current_tag.parent
+          end
+        end
+
+        # Returns an array of [tag_name, selector] pairs
+        # @return [Array<[String, String]>] Array of tag name and selector pairs
+        def self.anchor_tag_selector_pairs
+          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
+            selectors.map { |selector| [tag_name, selector] }
+          end
+        end
+
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+        end
+
+        attr_reader :parsed_body
+
+        ##
+        # @yieldparam [Hash] The scraped article hash
+        # @return [Enumerator] Enumerator for the scraped articles
+        def each
+          return enum_for(:each) unless block_given?
+
+          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
+            parsed_body.css(selector).each do |selected_tag|
+              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+              article_hash = Extractor.new(article_tag, url: @url).call
+
+              yield article_hash if article_hash
+            end
+          end
+        end
+      end
+    end
+  end
+end
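The class-level helpers can be exercised on their own; a sketch with invented markup (assuming the gem is loaded):

```ruby
require 'html2rss'
require 'nokogiri'

doc = Nokogiri::HTML(<<~HTML)
  <article>
    <h2><span>Teaser</span></h2>
    <p><a href="/posts/1">Read more</a></p>
  </article>
HTML

semantic_html = Html2rss::AutoSource::Scraper::SemanticHtml

# At least one ANCHOR_TAG_SELECTORS selector matches ('article :not(article) a[href]'):
semantic_html.articles?(doc) # => true

# Walks up from <span> until it reaches an <article> (or stops at <html>):
semantic_html.find_tag_in_ancestors(doc.at_css('span'), 'article').name # => "article"

# No anchor inside the <h2>, so the search continues upwards and finds the one in <p>:
semantic_html.find_closest_selector(doc.at_css('h2'))['href'] # => "/posts/1"
```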
data/lib/html2rss/auto_source/scraper.rb
ADDED
@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class AutoSource
+    ##
+    # The Scraper module contains all scrapers that can be used to extract articles.
+    # Each scraper should implement a `call` method that returns an array of article hashes.
+    # Each scraper should also implement an `articles?` method that returns true if the scraper
+    # can potentially be used to extract articles from the given HTML.
+    #
+    module Scraper
+      SCRAPERS = [
+        Schema,
+        SemanticHtml
+      ].freeze
+
+      ##
+      # Error raised when no suitable scraper is found.
+      class NoScraperFound < Html2rss::Error; end
+
+      ##
+      # Returns an array of scrapers that claim to find articles in the parsed body.
+      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+      # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
+      def self.from(parsed_body)
+        scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
+        raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
+
+        scrapers
+      end
+    end
+  end
+end
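A quick sketch of the selection logic with invented markup: `Scraper.from` returns every scraper whose `articles?` probe succeeds, or raises:

```ruby
require 'html2rss'
require 'nokogiri'

page = Nokogiri::HTML('<article><h1><a href="/a">A</a></h1></article>')
Html2rss::AutoSource::Scraper.from(page)
# => [Html2rss::AutoSource::Scraper::SemanticHtml]
#    (no ld+json script tag present, so Schema is not selected)

Html2rss::AutoSource::Scraper.from(Nokogiri::HTML('<p>plain page</p>'))
# raises Html2rss::AutoSource::Scraper::NoScraperFound
```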
data/lib/html2rss/auto_source.rb
ADDED
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+require 'nokogiri'
+require 'parallel'
+require 'addressable'
+
+module Html2rss
+  ##
+  # The AutoSource class is responsible for extracting channel and articles
+  # from a given URL.
+  # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
+  # marking articles, e.g. schema, microdata, open graph, etc.
+  class AutoSource
+    class UnsupportedUrlScheme < Html2rss::Error; end
+    class NoArticlesFound < Html2rss::Error; end
+
+    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+
+    def initialize(url)
+      unless url.is_a?(String) || url.is_a?(Addressable::URI)
+        raise ArgumentError,
+              'URL must be a String or Addressable::URI'
+      end
+
+      @url = Addressable::URI.parse(url)
+
+      raise ArgumentError, 'URL must be absolute' unless @url.absolute?
+      raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
+    end
+
+    def build
+      raise NoArticlesFound if articles.empty?
+
+      Reducer.call(articles, url:)
+      Cleanup.call(articles, url:, keep_different_domain: true)
+
+      Html2rss::AutoSource::RssBuilder.new(
+        channel:,
+        articles:
+      ).call
+    end
+
+    def articles
+      @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
+        instance = scraper.new(parsed_body, url:)
+
+        articles_in_thread = Parallel.map(instance.each) do |article_hash|
+          Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+
+          Article.new(**article_hash, scraper:)
+        end
+
+        Reducer.call(articles_in_thread, url:)
+
+        articles_in_thread
+      end
+    end
+
+    def channel
+      Channel.new(parsed_body, response:, url:, articles:)
+    end
+
+    private
+
+    attr_reader :url
+
+    def response
+      @response ||= Html2rss::Utils.request_url(url)
+    end
+
+    # Parses the HTML body of the response using Nokogiri.
+    # @return [Nokogiri::HTML::Document]
+    def parsed_body
+      @parsed_body ||= Nokogiri.HTML(response.body).freeze
+    end
+  end
+end
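Tying the pieces together, the new public entry point can be exercised as below. The URL is a placeholder, the call performs a real HTTP request, and `build` is assumed to return whatever `RssBuilder#call` produces (an RSS object whose string form is the feed XML):

```ruby
require 'html2rss'

auto_source = Html2rss::AutoSource.new('https://example.com/blog')
rss = auto_source.build # raises NoArticlesFound when nothing could be scraped
puts rss                # printing the feed emits its XML

Html2rss::AutoSource.new('ftp://example.com') # raises UnsupportedUrlScheme
```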
data/lib/html2rss/cli.rb
CHANGED
@@ -2,8 +2,13 @@
 
 require_relative '../html2rss'
 require 'thor'
+require 'addressable'
 
+##
+# The Html2rss namespace / command line interface.
 module Html2rss
+  Log = Logger.new($stderr)
+
   ##
   # The Html2rss command line interface.
   class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
       params = options.to_h { |opt| opt.split('=', 2) }
       puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
     end
+
+    desc 'auto URL', 'automatically sources an RSS feed from the URL'
+    def auto(url)
+      puts Html2rss.auto_source(url)
+    end
   end
 end
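With this Thor command registered, the same functionality becomes available from the shell as `html2rss auto URL`, which simply prints the result of `Html2rss.auto_source(url)` to stdout and can be redirected into a file.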
data/lib/html2rss/config/channel.rb
CHANGED
@@ -16,7 +16,7 @@ module Html2rss
       # @return [Set<String>] the required parameter names
       def self.required_params_for_config(config)
         config.each_with_object(Set.new) do |(_, value), required_params|
-          required_params.merge(value.scan(/%<([
+          required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
         end
       end
 
@@ -25,7 +25,9 @@ module Html2rss
       # @param params [Hash]
       def initialize(channel, params: {})
         raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
-
+
+        url = channel[:url]
+        raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI)
 
         @config = process_params(channel, params.transform_keys(&:to_sym))
       end
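The tightened regex in `required_params_for_config` above collects `%<name>s` and `%<name>d` placeholders from string config values. A small self-contained check (the config hash is invented, and the constant path `Html2rss::Config::Channel` is assumed from the file layout):

```ruby
require 'html2rss'

config = {
  url: 'https://example.com/search?q=%<query>s',
  title: 'Results for %<query>s, page %<page>d'
}

Html2rss::Config::Channel.required_params_for_config(config)
# => #<Set: {"query", "page"}>
```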
data/lib/html2rss/config/selectors.rb
CHANGED
@@ -35,8 +35,8 @@ module Html2rss
 
       keywords = config[name].slice(*available_keys)
 
-      if (additional_keys =
-        warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
+      if (additional_keys = keywords.keys - available_keys).any?
+        Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
       end
 
       Selector.new(keywords)
data/lib/html2rss/item.rb
CHANGED
@@ -23,7 +23,8 @@ module Html2rss
     # @param config [Html2rss::Config] Configuration object.
     # @return [Array<Html2rss::Item>] list of items fetched.
     def self.from_url(url, config)
-      body = Utils.
+      body = Utils.request_url(url, headers: config.headers).body
+      body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
 
       Nokogiri.HTML(body)
         .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
@@ -47,6 +48,7 @@ module Html2rss
     # @param method_name [Symbol]
     # @param _include_private [true, false]
     # @return [true, false]
+    # :reek:BooleanParameter { enabled: false }
     def respond_to_missing?(method_name, _include_private = false)
       config.selector?(method_name) || super
     end
@@ -110,7 +112,11 @@ module Html2rss
     #
     # @return [Array<String>] list of categories.
     def categories
-      config.category_selector_names
+      config.category_selector_names
+        .filter_map do |method_name|
+          category = public_send(method_name)
+          category.strip unless category.to_s.empty?
+        end.uniq
     end
 
     ##
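The rewritten `categories` now strips whitespace, drops blank values, and deduplicates. The core transformation in isolation, with hypothetical selector results:

```ruby
# Values a feed's category selectors might return (hypothetical):
values = ['Ruby', ' Ruby ', '', nil]

values.filter_map { |category| category.strip unless category.to_s.empty? }.uniq
# => ["Ruby"]
```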