html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class WordpressApi
|
|
7
|
+
##
|
|
8
|
+
# Resolves the WordPress posts endpoint for a given page and scope.
|
|
9
|
+
class PostsEndpoint
  # Posts collection route, relative to the WordPress REST API root.
  POSTS_PATH = 'wp/v2/posts'

  ##
  # Convenience entry point: builds a resolver and runs it.
  #
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @param page_url [Html2rss::Url] canonical page URL
  # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
  # @param posts_query [Hash<String, String>] query params for the posts request
  # @param logger [Logger] logger used for operational warnings
  # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
  def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
    new(parsed_body:, page_url:, page_scope:, posts_query:, logger:).call
  end

  ##
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @param page_url [Html2rss::Url] canonical page URL
  # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
  # @param posts_query [Hash<String, String>] query params for the posts request
  # @param logger [Logger] logger used for operational warnings
  def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
    @parsed_body = parsed_body
    @page_url = Html2rss::Url.from_absolute(page_url)
    @page_scope = page_scope
    @posts_query = posts_query
    @logger = logger
  end

  ##
  # Resolves the posts endpoint. Returns nil when the page exposes no usable
  # API root or when the page scope is not fetchable.
  #
  # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
  def call
    api_root = api_root_url
    return unless api_root
    return unless fetchable_page_scope?

    query_style_api_root?(api_root) ? query_root_posts_url(api_root) : posts_collection_url(api_root)
  end

  private

  attr_reader :parsed_body, :page_url, :page_scope, :posts_query, :logger

  # Reads the API root from the page's API discovery link and resolves it
  # against the page URL. Logs and returns nil when the href is blank or invalid.
  #
  # @return [Html2rss::Url, nil]
  def api_root_url
    href = parsed_body.at_css(WordpressApi::API_LINK_SELECTOR)&.[]('href').to_s.strip
    return log_missing_api_root if href.empty?

    Html2rss::Url.from_relative(href, page_url)
  rescue Addressable::URI::InvalidURIError, ArgumentError => error
    logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
    nil
  end

  # Reports whether the page scope allows fetching. A :non_archive reason is
  # logged at debug level; any other non-fetchable scope is logged as a warning.
  #
  # @return [Boolean]
  def fetchable_page_scope?
    return true if page_scope.fetchable?

    if page_scope.reason == :non_archive
      logger.debug(
        "#{WordpressApi}: page advertised WordPress API support " \
        'without a safe WordPress archive scope'
      )
      return false
    end

    logger.warn("#{WordpressApi}: unable to derive safe WordPress archive scope for #{page_url}")
    false
  end

  # Logs that no usable API root was found.
  #
  # @return [nil]
  def log_missing_api_root
    logger.debug("#{WordpressApi}: page advertised WordPress API support without a usable API root")
    nil
  end

  # @param api_root [Html2rss::Url]
  # @return [Boolean] whether the API root carries a `rest_route` query parameter
  def query_style_api_root?(api_root)
    api_root.query_values.key?('rest_route')
  end

  # Builds the posts URL for a query-style API root by extending its
  # `rest_route` parameter and merging in the posts query.
  #
  # @param api_root [Html2rss::Url]
  # @return [Html2rss::Url]
  def query_root_posts_url(api_root)
    query = api_root.query_values
    route = normalized_rest_route(query.fetch('rest_route', '/'))
    api_root.with_query_values(
      query.merge(
        'rest_route' => append_posts_route(route),
        **posts_query
      )
    )
  end

  # Builds the posts URL for a path-style API root by resolving POSTS_PATH
  # against the normalized root and merging in the posts query.
  #
  # @param api_root [Html2rss::Url]
  # @return [Html2rss::Url]
  def posts_collection_url(api_root)
    Html2rss::Url.from_relative(POSTS_PATH, normalized_api_root(api_root))
                 .with_query_values(api_root.query_values.merge(posts_query))
  end

  # @param api_root [Html2rss::Url]
  # @return [Html2rss::Url] API root whose path ends in exactly one slash
  def normalized_api_root(api_root)
    api_root.with_path(normalized_api_path(api_root.path))
  end

  # Collapses empty path segments and guarantees a leading and trailing slash,
  # so POSTS_PATH resolves below the root instead of replacing its last segment.
  #
  # @param path [String, nil]
  # @return [String] e.g. "//wp-json" => "/wp-json/", "" => "/"
  def normalized_api_path(path)
    segments = path.to_s.split('/').reject(&:empty?)
    segments.empty? ? '/' : "/#{segments.join('/')}/"
  end

  # Normalizes a `rest_route` value: blank becomes "/", a leading slash is
  # enforced, and trailing slashes are removed.
  #
  # @param route [String, nil]
  # @return [String]
  def normalized_rest_route(route)
    value = route.to_s
    value = '/' if value.empty?
    value = "/#{value}" unless value.start_with?('/')
    trim_trailing_slashes(value)
  end

  # Removes trailing slashes without a regex, always keeping the first character.
  #
  # The comparison is character-based on purpose: `value.length` counts
  # characters, so indexing bytes with it would inspect the wrong position
  # whenever the route contains multibyte characters.
  #
  # @param value [String]
  # @return [String]
  def trim_trailing_slashes(value)
    end_index = value.length
    end_index -= 1 while end_index > 1 && value[end_index - 1] == '/'
    value[0, end_index]
  end

  # @param route [String] normalized rest route
  # @return [String] the route with the posts collection path appended
  def append_posts_route(route)
    return "/#{POSTS_PATH}" if route == '/'

    "#{route}/#{POSTS_PATH}"
  end
end
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'date'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
class AutoSource
|
|
8
|
+
module Scraper
|
|
9
|
+
# Scrapes WordPress sites through their REST API instead of parsing article HTML.
|
|
10
|
+
class WordpressApi # rubocop:disable Metrics/ClassLength
  include Enumerable

  API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
  CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
  POSTS_FIELDS = %w[id title excerpt content link date categories].freeze

  # @return [Symbol] key used to look up this scraper's options
  def self.options_key
    :wordpress_api
  end

  ##
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Boolean] true when the document contains the WordPress API discovery link
  def self.articles?(parsed_body)
    parsed_body ? !parsed_body.at_css(API_LINK_SELECTOR).nil? : false
  end

  ##
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
  # @param url [String, Html2rss::Url] canonical page URL
  # @param request_session [Html2rss::RequestSession, nil] session used for the follow-up request
  # @param _opts [Hash] ignored scraper-specific options
  # @return [void]
  def initialize(parsed_body, url:, request_session: nil, **_opts)
    @parsed_body = parsed_body
    @request_session = request_session
    @url = Html2rss::Url.from_absolute(url)
    @page_scope = PageScope.from(parsed_body:, url: @url)
  end

  ##
  # Iterates over the articles built from the posts API response.
  # Yields nothing when the posts could not be fetched.
  #
  # @yieldparam article [Hash<Symbol, Object>] normalized article hash
  # @return [Enumerator, void] enumerator when called without a block
  def each
    return enum_for(:each) unless block_given?

    posts = fetch_posts
    return unless posts

    articles = posts.filter_map { |post| article_from(post) }
    articles.each { |article| yield article }
  end

  private

  attr_reader :parsed_body, :url, :request_session, :page_scope

  # Fetches the posts and wraps the parsed body in an Array.
  # Unsupported content types and JSON parse errors are logged and yield nil.
  def fetch_posts
    response = posts_response
    response ? Array(response.parsed_body) : nil
  rescue RequestService::UnsupportedResponseContentType => error
    Log.warn("#{self.class}: unsupported WordPress API posts content type (#{error.message})")
    nil
  rescue JSON::ParserError => error
    Log.warn("#{self.class}: failed to parse WordPress API posts JSON (#{error.message})")
    nil
  end

  # Issues the follow-up request for the posts endpoint. Returns nil without
  # a request session or endpoint; Html2rss errors are logged and yield nil.
  def posts_response
    endpoint = request_session && posts_endpoint_url
    return unless endpoint

    request_session.follow_up(url: endpoint, relation: :auto_source, origin_url: url)
  rescue Html2rss::Error => error
    Log.warn("#{self.class}: failed to fetch WordPress API posts (#{error.class}: #{error.message})")
    nil
  end

  # Turns one post entry into an article hash, dropping nil attributes.
  # Non-Hash entries and posts without a usable link yield nil.
  def article_from(post)
    return unless post.is_a?(Hash)

    link = article_url(post)
    link && article_attributes(post, link).compact
  end

  def article_url(post)
    absolute_link(post[:link])
  end

  # Prefers the "/?query" form for root-path URLs, then the path, then the full URL.
  def article_id(_post, article_url)
    root_path_query_id(article_url) || string(article_url.path) || article_url.to_s
  end

  def article_title(post)
    rendered_text(post.dig(:title, :rendered))
  end

  # Rendered content wins; the rendered excerpt is the fallback.
  def article_description(post)
    rendered_html(post.dig(:content, :rendered)) || rendered_html(post.dig(:excerpt, :rendered))
  end

  def article_published_at(post)
    string(post[:date])
  end

  def article_categories(post)
    Array(post[:categories]).filter_map { string(_1) }
  end

  def article_attributes(post, article_url)
    {
      id: article_id(post, article_url),
      title: article_title(post),
      description: article_description(post),
      url: article_url,
      published_at: article_published_at(post),
      categories: article_categories(post)
    }
  end

  # Resolves a link against the page URL; blank or unparsable links yield nil.
  def absolute_link(link)
    candidate = string(link)
    candidate && Html2rss::Url.from_relative(candidate, url)
  rescue ArgumentError
    nil
  end

  # Plain-text version of a rendered HTML value, or nil when blank.
  def rendered_text(value)
    html = rendered_html(value)
    return unless html

    Nokogiri::HTML.fragment(html).text.strip
  end

  def rendered_html(value)
    string(value)
  end

  # Stripped string form of the value, or nil when nothing remains.
  def string(value)
    stripped = value.to_s.strip
    stripped.empty? ? nil : stripped
  end

  # Builds a "/?query" identifier for URLs that have a query and a root (or empty) path.
  def root_path_query_id(article_url)
    query = string(article_url.query)
    return unless query
    return unless ['', '/'].include?(article_url.path.to_s)

    "/?#{query}"
  end

  # Base query for the posts request, overridden by the page scope's query.
  def posts_query
    base_query = { '_fields' => POSTS_FIELDS.join(','), 'per_page' => '100' }
    base_query.merge(page_scope.query)
  end

  def posts_endpoint_url
    PostsEndpoint.resolve(parsed_body:, page_url: url, page_scope:, posts_query:, logger: Log)
  end
end
|
|
177
|
+
end
|
|
178
|
+
end
|
|
179
|
+
end
|
|
@@ -4,31 +4,165 @@ module Html2rss
|
|
|
4
4
|
class AutoSource
|
|
5
5
|
##
|
|
6
6
|
# The Scraper module contains all scrapers that can be used to extract articles.
|
|
7
|
-
# Each scraper should implement
|
|
7
|
+
# Each scraper should implement an `each` method that yields article hashes.
|
|
8
8
|
# Each scraper should also implement an `articles?` method that returns true if the scraper
|
|
9
9
|
# can potentially be used to extract articles from the given HTML.
|
|
10
10
|
#
|
|
11
|
+
# Detection is intentionally shallow for most scrapers, but instance-based
|
|
12
|
+
# matching is available for scrapers that need to carry expensive selection
|
|
13
|
+
# state forward into extraction.
|
|
14
|
+
# Scrapers run in parallel threads, so implementations must avoid shared
|
|
15
|
+
# mutable state and degrade by returning no articles when a follow-up would
|
|
16
|
+
# be unsafe or unsupported.
|
|
17
|
+
#
|
|
11
18
|
module Scraper
|
|
19
|
+
APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
|
|
20
|
+
APP_SHELL_MAX_ANCHORS = 2
|
|
21
|
+
APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
|
|
22
|
+
|
|
12
23
|
SCRAPERS = [
|
|
13
|
-
|
|
24
|
+
WordpressApi,
|
|
14
25
|
Schema,
|
|
15
|
-
|
|
26
|
+
Microdata,
|
|
27
|
+
JsonState,
|
|
28
|
+
SemanticHtml,
|
|
29
|
+
Html
|
|
16
30
|
].freeze
|
|
17
31
|
|
|
18
32
|
##
|
|
19
33
|
# Error raised when no suitable scraper is found.
|
|
20
|
-
class NoScraperFound < Html2rss::Error
|
|
34
|
+
class NoScraperFound < Html2rss::Error
  # Default message for each supported failure category.
  CATEGORY_MESSAGES = {
    blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
                     'Retry with --strategy browserless, try a more specific public listing URL, ' \
                     'or run from an environment that can complete anti-bot checks.',
    app_shell: 'No scrapers found: app-shell surface detected (client-rendered page with little or no ' \
               'server-rendered article HTML). Retry with --strategy browserless, or target a direct ' \
               'listing/update URL instead of a homepage or shell entrypoint.',
    unsupported_surface: 'No scrapers found: unsupported extraction surface for auto mode. ' \
                         'Try a direct listing/changelog/category URL, ' \
                         'or use explicit selectors in a feed config.'
  }.freeze

  # @return [Symbol] failure category, one of the CATEGORY_MESSAGES keys
  attr_reader :category

  # @param message [String, nil] explicit message; the category's default is used when nil
  # @param category [Symbol] failure category
  # @raise [ArgumentError] when the category is not a CATEGORY_MESSAGES key
  def initialize(message = nil, category: :unsupported_surface)
    validate_category!(category)
    @category = category
    super(message || CATEGORY_MESSAGES.fetch(category))
  end

  private

  # Raises unless the category has a default message.
  def validate_category!(category)
    unless CATEGORY_MESSAGES.key?(category)
      valid_categories = CATEGORY_MESSAGES.keys.join(', ')
      raise ArgumentError, "Unknown category: #{category.inspect}. Valid categories are: #{valid_categories}"
    end
  end
end
|
|
21
64
|
|
|
22
65
|
##
|
|
23
|
-
# Returns an array of
|
|
66
|
+
# Returns an array of scraper classes that claim to find articles in the parsed body.
|
|
24
67
|
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
|
|
68
|
+
# @param opts [Hash] The options hash.
|
|
25
69
|
# @return [Array<Class>] An array of scraper classes that can handle the parsed body.
|
|
26
|
-
def self.from(parsed_body)
|
|
27
|
-
scrapers = SCRAPERS.select { |scraper| scraper.
|
|
28
|
-
|
|
70
|
+
# Selects the enabled scraper classes whose `articles?` check accepts the parsed body.
# Raises the error built by `no_scraper_found_for` when none qualifies.
def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
  matching = SCRAPERS.select do |scraper|
    opts.dig(scraper.options_key, :enabled) && scraper.articles?(parsed_body)
  end
  raise no_scraper_found_for(parsed_body) if matching.empty?

  matching
end
|
|
78
|
+
|
|
79
|
+
# Returns scraper instances ready for extraction.
|
|
80
|
+
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
|
|
81
|
+
# @param url [String, Html2rss::Url] The page url.
|
|
82
|
+
# @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
|
|
83
|
+
# @param opts [Hash] The options hash.
|
|
84
|
+
# @return [Array<Object>] An array of scraper instances that can handle the parsed body.
|
|
85
|
+
#
|
|
86
|
+
# `instances_for` is the main entrypoint for extraction. It lets a scraper
|
|
87
|
+
# decide whether it matches using the same instance that will later yield
|
|
88
|
+
# article hashes, which keeps precomputed state close to the scraper that
|
|
89
|
+
# owns it.
|
|
90
|
+
# Instantiates every enabled scraper with its options and keeps the instances
# that pass `extractable_instance?`. Raises the error built by
# `no_scraper_found_for` when no instance remains.
def self.instances_for(parsed_body, url:, request_session: nil,
                       opts: Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
  matched = SCRAPERS.each_with_object([]) do |scraper, accepted|
    key = scraper.options_key
    next unless opts.dig(key, :enabled)

    candidate = scraper.new(parsed_body, url:, request_session:, **opts.fetch(key, {}))
    accepted << candidate if extractable_instance?(candidate, parsed_body)
  end
  raise no_scraper_found_for(parsed_body) if matched.empty?

  matched
end
|
|
105
|
+
|
|
106
|
+
# Uses the instance's own `extractable?` when it defines one; otherwise
# falls back to the class-level `articles?` check.
def self.extractable_instance?(instance, parsed_body)
  if instance.respond_to?(:extractable?)
    instance.extractable?
  else
    instance.class.articles?(parsed_body)
  end
end
|
|
111
|
+
private_class_method :extractable_instance?
|
|
112
|
+
|
|
113
|
+
# Builds (does not raise) a NoScraperFound error categorized for the parsed body.
def self.no_scraper_found_for(parsed_body)
  surface = classify_no_scraper_surface(parsed_body)
  NoScraperFound.new(category: surface)
end
|
|
116
|
+
private_class_method :no_scraper_found_for
|
|
117
|
+
|
|
118
|
+
# Picks the failure category: the blocked-surface check runs first, then the
# app-shell check; anything else is an unsupported surface.
def self.classify_no_scraper_surface(parsed_body)
  if blocked_surface?(parsed_body)
    :blocked_surface
  elsif app_shell_surface?(parsed_body)
    :app_shell
  else
    :unsupported_surface
  end
end
|
|
124
|
+
private_class_method :classify_no_scraper_surface
|
|
125
|
+
|
|
126
|
+
# Serializes the document to HTML and asks BlockedSurface whether it is an interstitial.
def self.blocked_surface?(parsed_body)
  html = parsed_body.to_html
  Html2rss::BlockedSurface.interstitial?(html)
end
|
|
129
|
+
private_class_method :blocked_surface?
|
|
130
|
+
|
|
131
|
+
# A page counts as an app shell when it has an app-root marker element and,
# in addition, few anchors, no article markers and little visible text.
def self.app_shell_surface?(parsed_body)
  return false unless parsed_body.at_css(APP_SHELL_ROOT_SELECTORS)
  return false unless sparse_anchor_surface?(parsed_body)
  return false unless no_article_markers?(parsed_body)

  short_visible_text?(parsed_body)
end
|
|
139
|
+
private_class_method :app_shell_surface?
|
|
140
|
+
|
|
141
|
+
# True when the body holds no more than APP_SHELL_MAX_ANCHORS links with an href.
def self.sparse_anchor_surface?(parsed_body)
  anchors = parsed_body.css('body a[href]')
  anchors.size <= APP_SHELL_MAX_ANCHORS
end
|
|
144
|
+
private_class_method :sparse_anchor_surface?
|
|
145
|
+
|
|
146
|
+
# True when the document contains none of the article marker elements.
def self.no_article_markers?(parsed_body)
  marker_selector = 'article, main article, [itemtype*="Article"], [itemprop="articleBody"]'
  parsed_body.css(marker_selector).empty?
end
|
|
151
|
+
private_class_method :no_article_markers?
|
|
152
|
+
|
|
153
|
+
# True when the visible text length does not exceed APP_SHELL_MAX_VISIBLE_TEXT_LENGTH.
def self.short_visible_text?(parsed_body)
  text_length = visible_text_length(parsed_body)
  text_length <= APP_SHELL_MAX_VISIBLE_TEXT_LENGTH
end
|
|
156
|
+
private_class_method :short_visible_text?
|
|
157
|
+
|
|
158
|
+
# Length of the body text outside script, style and noscript elements, with
# whitespace runs collapsed to single spaces. Returns 0 when there is no body.
def self.visible_text_length(parsed_body)
  body = parsed_body.at_css('body')
  return 0 unless body

  visible_nodes = body.xpath('.//text()[not(ancestor::script or ancestor::style or ancestor::noscript)]')
  joined_text = visible_nodes.map(&:text).join(' ')
  joined_text.gsub(/\s+/, ' ').strip.length
end
|
|
165
|
+
private_class_method :visible_text_length
|
|
32
166
|
end
|
|
33
167
|
end
|
|
34
168
|
end
|
data/lib/html2rss/auto_source.rb
CHANGED
|
@@ -1,73 +1,145 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'nokogiri'
|
|
4
3
|
require 'parallel'
|
|
5
|
-
require '
|
|
4
|
+
require 'dry-validation'
|
|
6
5
|
|
|
7
6
|
module Html2rss
|
|
8
7
|
##
|
|
9
|
-
# The AutoSource class
|
|
10
|
-
#
|
|
11
|
-
#
|
|
12
|
-
#
|
|
8
|
+
# The AutoSource class automatically extracts articles from a given URL by
|
|
9
|
+
# utilizing a collection of Scrapers. These scrapers analyze and
|
|
10
|
+
# parse popular structured data formats—such as schema, microdata, and
|
|
11
|
+
# open graph—to identify and compile article elements into unified articles.
|
|
12
|
+
#
|
|
13
|
+
# Scrapers supporting plain HTML are also available for sites without structured data,
|
|
14
|
+
# though results may vary based on page markup.
|
|
15
|
+
#
|
|
16
|
+
# @see Html2rss::AutoSource::Scraper::Schema
|
|
17
|
+
# @see Html2rss::AutoSource::Scraper::SemanticHtml
|
|
18
|
+
# @see Html2rss::AutoSource::Scraper::Html
|
|
13
19
|
class AutoSource
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
20
|
+
DEFAULT_CONFIG = {
|
|
21
|
+
scraper: {
|
|
22
|
+
wordpress_api: {
|
|
23
|
+
enabled: true
|
|
24
|
+
},
|
|
25
|
+
schema: {
|
|
26
|
+
enabled: true
|
|
27
|
+
},
|
|
28
|
+
microdata: {
|
|
29
|
+
enabled: true
|
|
30
|
+
},
|
|
31
|
+
json_state: {
|
|
32
|
+
enabled: true
|
|
33
|
+
},
|
|
34
|
+
semantic_html: {
|
|
35
|
+
enabled: true
|
|
36
|
+
},
|
|
37
|
+
html: {
|
|
38
|
+
enabled: true,
|
|
39
|
+
minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
|
|
40
|
+
use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
|
|
41
|
+
}
|
|
42
|
+
},
|
|
43
|
+
cleanup: Cleanup::DEFAULT_CONFIG
|
|
44
|
+
}.freeze
|
|
45
|
+
|
|
46
|
+
SCRAPER_CONFIG = proc do
|
|
47
|
+
optional(:wordpress_api).hash do
|
|
48
|
+
optional(:enabled).filled(:bool)
|
|
49
|
+
end
|
|
50
|
+
optional(:schema).hash do
|
|
51
|
+
optional(:enabled).filled(:bool)
|
|
52
|
+
end
|
|
53
|
+
optional(:microdata).hash do
|
|
54
|
+
optional(:enabled).filled(:bool)
|
|
55
|
+
end
|
|
56
|
+
optional(:json_state).hash do
|
|
57
|
+
optional(:enabled).filled(:bool)
|
|
58
|
+
end
|
|
59
|
+
optional(:semantic_html).hash do
|
|
60
|
+
optional(:enabled).filled(:bool)
|
|
61
|
+
end
|
|
62
|
+
optional(:html).hash do
|
|
63
|
+
optional(:enabled).filled(:bool)
|
|
64
|
+
optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
|
|
65
|
+
optional(:use_top_selectors).filled(:integer, gt?: 0)
|
|
66
|
+
end
|
|
67
|
+
end.freeze
|
|
68
|
+
private_constant :SCRAPER_CONFIG
|
|
28
69
|
|
|
29
|
-
|
|
30
|
-
|
|
70
|
+
Config = Dry::Schema.Params do
|
|
71
|
+
optional(:scraper).hash(&SCRAPER_CONFIG)
|
|
31
72
|
|
|
32
|
-
|
|
73
|
+
optional(:cleanup).hash do
|
|
74
|
+
optional(:keep_different_domain).filled(:bool)
|
|
75
|
+
optional(:min_words_title).filled(:integer, gt?: 0)
|
|
76
|
+
end
|
|
77
|
+
end
|
|
33
78
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
79
|
+
##
|
|
80
|
+
# @param response [Html2rss::RequestService::Response] initial page response
|
|
81
|
+
# @param opts [Hash] validated auto-source options
|
|
82
|
+
# @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
|
|
83
|
+
# @return [void]
|
|
84
|
+
# Stores the response's parsed body and URL together with the options and
# the optional request session.
def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
  @parsed_body, @url = response.parsed_body, response.url
  @opts, @request_session = opts, request_session
end
|
|
39
90
|
|
|
91
|
+
##
|
|
92
|
+
# Extracts article candidates by selecting every scraper that can explain the
|
|
93
|
+
# page shape, running those scrapers, and normalizing the resulting hashes
|
|
94
|
+
# into `RssBuilder::Article` objects.
|
|
95
|
+
#
|
|
96
|
+
# The contributor-facing flow is:
|
|
97
|
+
# 1. choose scraper instances that match the page
|
|
98
|
+
# 2. let each scraper collect its own candidates
|
|
99
|
+
# 3. clean and deduplicate the merged article list
|
|
100
|
+
#
|
|
101
|
+
# Scrapers with expensive precomputation, such as `SemanticHtml`, keep that
|
|
102
|
+
# state on the instance so detection and extraction can reuse the same work.
|
|
103
|
+
#
|
|
104
|
+
# @return [Array<Html2rss::RssBuilder::Article>] extracted articles
|
|
40
105
|
# Returns the memoized articles, extracting them on first use. When no
# scraper matches, a warning is logged and an empty array is returned;
# that empty result is not memoized.
def articles
  return @articles if @articles

  @articles = extract_articles
rescue Html2rss::AutoSource::Scraper::NoScraperFound => no_scraper
  Log.warn "#{self.class}: no scraper matched #{url} (#{no_scraper.message})"
  []
end
|
|
43
111
|
|
|
44
|
-
|
|
45
|
-
Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
|
|
112
|
+
private
|
|
46
113
|
|
|
47
|
-
|
|
48
|
-
end
|
|
114
|
+
attr_reader :url, :parsed_body, :request_session
|
|
49
115
|
|
|
50
|
-
|
|
116
|
+
# Builds the matching scraper instances, runs them through Parallel in
# threads, and passes the merged articles to Cleanup.
def extract_articles
  instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
  return [] if instances.empty?

  # Scrapers run in parallel threads: implementations must avoid shared
  # mutable state, treat request_session calls as concurrency-safe from the
  # scraper side, and return no articles when a follow-up would be unsafe
  # or unsupported.
  threads = thread_count_for(instances)
  collected = Parallel.flat_map(instances, in_threads: threads) { |instance| run_scraper(instance) }

  Cleanup.call(collected, url:, **cleanup_options)
end
|
|
55
129
|
|
|
56
|
-
def
|
|
57
|
-
|
|
130
|
+
# Wraps every article hash yielded by the scraper in an RssBuilder::Article,
# tagging it with the scraper's class.
def run_scraper(instance)
  scraper_class = instance.class
  instance.each.map { |attributes| RssBuilder::Article.new(**attributes, scraper: scraper_class) }
end
|
|
59
135
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
136
|
+
# Cleanup options from the configuration; an empty hash when none are set.
def cleanup_options
  @opts.fetch(:cleanup) { {} }
end
|
|
63
139
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
.tap do |doc|
|
|
68
|
-
# Remove comments from the document
|
|
69
|
-
doc.xpath('//comment()').each(&:remove)
|
|
70
|
-
end.freeze
|
|
140
|
+
# Thread count for running the scrapers: the smaller of the scraper count
# and Parallel.processor_count, but never zero.
def thread_count_for(scrapers)
  smallest = [scrapers.size, Parallel.processor_count].min
  return 1 if smallest.zero?

  smallest
end
|
|
72
144
|
end
|
|
73
145
|
end
|