html2rss 0.17.0 → 0.19.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/auto_source/scraper.rb
CHANGED

@@ -4,31 +4,183 @@ module Html2rss
   class AutoSource
     ##
     # The Scraper module contains all scrapers that can be used to extract articles.
-    # Each scraper should implement
+    # Each scraper should implement an `each` method that yields article hashes.
     # Each scraper should also implement an `articles?` method that returns true if the scraper
     # can potentially be used to extract articles from the given HTML.
     #
+    # Detection is intentionally shallow for most scrapers, but instance-based
+    # matching is available for scrapers that need to carry expensive selection
+    # state forward into extraction.
+    # Scrapers run in parallel threads, so implementations must avoid shared
+    # mutable state and degrade by returning no articles when a follow-up would
+    # be unsafe or unsupported.
     module Scraper
+      # Root markers indicating likely app-shell/client-rendered surfaces.
+      APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
+      # Maximum anchors tolerated before app-shell detection is considered unlikely.
+      APP_SHELL_MAX_ANCHORS = 2
+      # Maximum visible text length tolerated for app-shell classification.
+      APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
+
+      # Ordered scraper classes considered during auto-source extraction.
       SCRAPERS = [
-
+        WordpressApi,
         Schema,
-
+        Microdata,
+        JsonState,
+        SemanticHtml,
+        Html
       ].freeze

       ##
       # Error raised when no suitable scraper is found.
-      class NoScraperFound < Html2rss::Error
+      class NoScraperFound < Html2rss::Error
+        # User-facing messages grouped by no-scraper surface category.
+        CATEGORY_MESSAGES = {
+          blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
+                           'Retry with --strategy browserless, try a more specific public listing URL, ' \
+                           'or run from an environment that can complete anti-bot checks.',
+          app_shell: 'No scrapers found: app-shell surface detected (client-rendered page with little or no ' \
+                     'server-rendered article HTML). Retry with --strategy browserless, or target a direct ' \
+                     'listing/update URL instead of a homepage or shell entrypoint.',
+          unsupported_surface: 'No scrapers found: unsupported extraction surface for auto mode. ' \
+                               'Try a direct listing/changelog/category URL, ' \
+                               'or use explicit selectors in a feed config.'
+        }.freeze
+
+        # @param message [String, nil] custom error message override
+        # @param category [Symbol] no-scraper classification
+        def initialize(message = nil, category: :unsupported_surface)
+          validate_category!(category)
+          @category = category
+          super(message || CATEGORY_MESSAGES.fetch(@category))
+        end
+
+        attr_reader :category
+
+        private
+
+        def validate_category!(category)
+          return if CATEGORY_MESSAGES.key?(category)
+
+          valid_categories = CATEGORY_MESSAGES.keys.join(', ')
+          raise ArgumentError, "Unknown category: #{category.inspect}. Valid categories are: #{valid_categories}"
+        end
+      end

       ##
-      # Returns an array of
+      # Returns an array of scraper classes that claim to find articles in the parsed body.
       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+      # @param opts [Hash] The options hash.
+      # @option opts [Hash] :wordpress_api scraper toggle and configuration
+      # @option opts [Hash] :schema scraper toggle and configuration
+      # @option opts [Hash] :microdata scraper toggle and configuration
+      # @option opts [Hash] :json_state scraper toggle and configuration
+      # @option opts [Hash] :semantic_html scraper toggle and configuration
+      # @option opts [Hash] :html scraper toggle and configuration
       # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
-      def self.from(parsed_body)
-        scrapers = SCRAPERS.select { |scraper| scraper.
-
+      def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
+        scrapers = SCRAPERS.select { |scraper| opts.dig(scraper.options_key, :enabled) }
+        scrapers.select! { |scraper| scraper.articles?(parsed_body) }
+
+        raise no_scraper_found_for(parsed_body) if scrapers.empty?

         scrapers
       end
+
+      # Returns scraper instances ready for extraction.
+      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+      # @param url [String, Html2rss::Url] The page url.
+      # @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
+      # @param opts [Hash] The options hash.
+      # @option opts [Hash] :wordpress_api scraper toggle and configuration
+      # @option opts [Hash] :schema scraper toggle and configuration
+      # @option opts [Hash] :microdata scraper toggle and configuration
+      # @option opts [Hash] :json_state scraper toggle and configuration
+      # @option opts [Hash] :semantic_html scraper toggle and configuration
+      # @option opts [Hash] :html scraper toggle and configuration
+      # @return [Array<Object>] An array of scraper instances that can handle the parsed body.
+      #
+      # `instances_for` is the main entrypoint for extraction. It lets a scraper
+      # decide whether it matches using the same instance that will later yield
+      # article hashes, which keeps precomputed state close to the scraper that
+      # owns it.
+      def self.instances_for(parsed_body, url:, request_session: nil,
+                             opts: Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
+        instances = SCRAPERS.filter_map do |scraper|
+          next unless opts.dig(scraper.options_key, :enabled)
+
+          instance = scraper.new(parsed_body, url:, request_session:, **opts.fetch(scraper.options_key, {}))
+          next unless extractable_instance?(instance, parsed_body)
+
+          instance
+        end
+
+        raise no_scraper_found_for(parsed_body) if instances.empty?
+
+        instances
+      end
+
+      def self.extractable_instance?(instance, parsed_body)
+        return instance.extractable? if instance.respond_to?(:extractable?)
+
+        instance.class.articles?(parsed_body)
+      end
+      private_class_method :extractable_instance?
+
+      def self.no_scraper_found_for(parsed_body)
+        NoScraperFound.new(category: classify_no_scraper_surface(parsed_body))
+      end
+      private_class_method :no_scraper_found_for
+
+      def self.classify_no_scraper_surface(parsed_body)
+        return :blocked_surface if blocked_surface?(parsed_body)
+        return :app_shell if app_shell_surface?(parsed_body)
+
+        :unsupported_surface
+      end
+      private_class_method :classify_no_scraper_surface
+
+      def self.blocked_surface?(parsed_body)
+        Html2rss::BlockedSurface.interstitial?(parsed_body.to_html)
+      end
+      private_class_method :blocked_surface?
+
+      def self.app_shell_surface?(parsed_body)
+        root_marker = parsed_body.at_css(APP_SHELL_ROOT_SELECTORS)
+        return false unless root_marker
+
+        sparse_anchor_surface?(parsed_body) &&
+          no_article_markers?(parsed_body) &&
+          short_visible_text?(parsed_body)
+      end
+      private_class_method :app_shell_surface?
+
+      def self.sparse_anchor_surface?(parsed_body)
+        parsed_body.css('body a[href]').size <= APP_SHELL_MAX_ANCHORS
+      end
+      private_class_method :sparse_anchor_surface?
+
+      def self.no_article_markers?(parsed_body)
+        parsed_body.css(
+          'article, main article, [itemtype*="Article"], [itemprop="articleBody"]'
+        ).empty?
+      end
+      private_class_method :no_article_markers?
+
+      def self.short_visible_text?(parsed_body)
+        visible_text_length(parsed_body) <= APP_SHELL_MAX_VISIBLE_TEXT_LENGTH
+      end
+      private_class_method :short_visible_text?
+
+      def self.visible_text_length(parsed_body)
+        body = parsed_body.at_css('body')
+        return 0 unless body
+
+        text_nodes = body.xpath('.//text()[not(ancestor::script or ancestor::style or ancestor::noscript)]')
+        text_nodes.map(&:text).join(' ').gsub(/\s+/, ' ').strip.length
+      end
+      private_class_method :visible_text_length
     end
   end
 end
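The categorized `NoScraperFound` error and the instance-based `instances_for` entrypoint change what a caller can do when a page yields nothing. Below is a minimal sketch, not taken from the diff, of driving this API directly and branching on the error category; the URL and HTML are hypothetical, and html2rss plus nokogiri are assumed to be installed:

    require 'nokogiri'
    require 'html2rss'

    # A hypothetical app-shell page: a root marker, no anchors, no <article> tags, little text.
    parsed_body = Nokogiri::HTML('<html><body><div id="__next">Loading</div></body></html>')

    begin
      instances = Html2rss::AutoSource::Scraper.instances_for(
        parsed_body,
        url: 'https://example.com',
        opts: Html2rss::AutoSource::DEFAULT_CONFIG[:scraper]
      )
      instances.each { |scraper| scraper.each { |article_hash| p article_hash } }
    rescue Html2rss::AutoSource::Scraper::NoScraperFound => error
      # error.category is :blocked_surface, :app_shell, or :unsupported_surface.
      warn error.message
      warn 'A browser-based strategy may help.' if %i[blocked_surface app_shell].include?(error.category)
    end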
data/lib/html2rss/auto_source.rb
CHANGED

@@ -1,73 +1,149 @@
 # frozen_string_literal: true

-require 'nokogiri'
 require 'parallel'
-require '
+require 'dry-validation'

 module Html2rss
   ##
-  # The AutoSource class
-  #
-  #
-  #
+  # The AutoSource class automatically extracts articles from a given URL by
+  # utilizing a collection of Scrapers. These scrapers analyze and
+  # parse popular structured data formats—such as schema, microdata, and
+  # open graph—to identify and compile article elements into unified articles.
+  #
+  # Scrapers supporting plain HTML are also available for sites without structured data,
+  # though results may vary based on page markup.
+  #
+  # @see Html2rss::AutoSource::Scraper::Schema
+  # @see Html2rss::AutoSource::Scraper::SemanticHtml
+  # @see Html2rss::AutoSource::Scraper::Html
   class AutoSource
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Default auto-source configuration shipped for scraper and cleanup behavior.
+    DEFAULT_CONFIG = {
+      scraper: {
+        wordpress_api: {
+          enabled: true
+        },
+        schema: {
+          enabled: true
+        },
+        microdata: {
+          enabled: true
+        },
+        json_state: {
+          enabled: true
+        },
+        semantic_html: {
+          enabled: true
+        },
+        html: {
+          enabled: true,
+          minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
+          use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
+        }
+      },
+      cleanup: Cleanup::DEFAULT_CONFIG
+    }.freeze
+
+    SCRAPER_CONFIG = proc do
+      optional(:wordpress_api).hash do
+        optional(:enabled).filled(:bool)
+      end
+      optional(:schema).hash do
+        optional(:enabled).filled(:bool)
+      end
+      optional(:microdata).hash do
+        optional(:enabled).filled(:bool)
+      end
+      optional(:json_state).hash do
+        optional(:enabled).filled(:bool)
+      end
+      optional(:semantic_html).hash do
+        optional(:enabled).filled(:bool)
+      end
+      optional(:html).hash do
+        optional(:enabled).filled(:bool)
+        optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
+        optional(:use_top_selectors).filled(:integer, gt?: 0)
+      end
+    end.freeze
+    private_constant :SCRAPER_CONFIG

-
-
+    # Runtime schema used to validate auto-source config values.
+    Config = Dry::Schema.Params do
+      optional(:scraper).hash(&SCRAPER_CONFIG)

-
+      optional(:cleanup).hash do
+        optional(:keep_different_domain).filled(:bool)
+        optional(:min_words_title).filled(:integer, gt?: 0)
+      end
+    end

-
-
-
-
+    ##
+    # @param response [Html2rss::RequestService::Response] initial page response
+    # @param opts [Hash] validated auto-source options
+    # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
+    # @option opts [Hash] :scraper scraper configuration map
+    # @option opts [Hash] :cleanup cleanup configuration map
+    # @return [void]
+    def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
+      @parsed_body = response.parsed_body
+      @url = response.url
+      @opts = opts
+      @request_session = request_session
     end

+    ##
+    # Extracts article candidates by selecting every scraper that can explain the
+    # page shape, running those scrapers, and normalizing the resulting hashes
+    # into `RssBuilder::Article` objects.
+    #
+    # The contributor-facing flow is:
+    # 1. choose scraper instances that match the page
+    # 2. let each scraper collect its own candidates
+    # 3. clean and deduplicate the merged article list
+    #
+    # Scrapers with expensive precomputation, such as `SemanticHtml`, keep that
+    # state on the instance so detection and extraction can reuse the same work.
+    #
+    # @return [Array<Html2rss::RssBuilder::Article>] extracted articles
     def articles
-      @articles ||=
-
+      @articles ||= extract_articles
+    rescue Html2rss::AutoSource::Scraper::NoScraperFound => error
+      Log.warn "#{self.class}: no scraper matched #{url} (#{error.message})"
+      []
+    end

-
-        Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+    private

-
-    end
+    attr_reader :url, :parsed_body, :request_session

-
+    def extract_articles
+      scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
+      return [] if scraper_instances.empty?

-
+      # Scrapers are instantiated and run in parallel threads. Implementations
+      # must avoid shared mutable state, treat request_session calls as
+      # concurrency-safe from the scraper side, and return no articles when a
+      # follow-up would be unsafe or unsupported.
+      articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
+        run_scraper(instance)
       end
+      Cleanup.call(articles, url:, **cleanup_options)
     end

-    def
-
+    def run_scraper(instance)
+      instance.each.map do |article_hash|
+        RssBuilder::Article.new(**article_hash, scraper: instance.class)
+      end
     end

-
-
-
+    def cleanup_options
+      @opts.fetch(:cleanup, {})
+    end

-
-
-
-        .tap do |doc|
-          # Remove comments from the document
-          doc.xpath('//comment()').each(&:remove)
-        end.freeze
+    def thread_count_for(scrapers)
+      count = [scrapers.size, Parallel.processor_count].min
+      count.zero? ? 1 : count
     end
   end
 end
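Because `AutoSource::Config` is a `Dry::Schema.Params` schema, callers can validate user-supplied options before constructing `AutoSource`. A minimal sketch using standard dry-schema result methods (`call`, `success?`, `errors`, `to_h`); the option values here are hypothetical:

    require 'html2rss'

    user_opts = {
      scraper: { html: { enabled: true, minimum_selector_frequency: 3 } },
      cleanup: { keep_different_domain: false, min_words_title: 2 }
    }

    result = Html2rss::AutoSource::Config.call(user_opts)
    raise ArgumentError, result.errors.to_h.inspect unless result.success?

    # `response` is assumed to be an Html2rss::RequestService::Response:
    # articles = Html2rss::AutoSource.new(response, result.to_h).articles

    # Values that violate the schema (e.g. gt?: 0) are reported per key:
    bad = Html2rss::AutoSource::Config.call({ scraper: { html: { minimum_selector_frequency: 0 } } })
    bad.success? # => false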
data/lib/html2rss/blocked_surface.rb
ADDED

@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+
+module Html2rss
+  ##
+  # Shared anti-bot/interstitial signatures used by request and auto-source flows.
+  #
+  # This module centralizes signature matching so request-time guards and
+  # auto-source surface classification stay consistent.
+  module BlockedSurface
+    # Known interstitial fingerprints used to detect blocked or anti-bot surfaces.
+    INTERSTITIAL_SIGNATURES = [
+      {
+        key: :cloudflare_interstitial,
+        min_matches: 2,
+        patterns: [
+          %r{<title>\s*just a moment\.\.\.\s*</title>}i,
+          /checking your browser before accessing/i,
+          /please (?:enable|turn on) javascript and cookies/i,
+          %r{cdn-cgi/challenge-platform}i,
+          /cloudflare ray id/i
+        ],
+        message: 'Blocked surface detected: Cloudflare anti-bot interstitial page. ' \
+                 'Retry with --strategy browserless, try a more specific public listing URL, ' \
+                 'or run from an environment that can complete anti-bot checks.'
+      }
+    ].freeze
+
+    ##
+    # Returns the first matching interstitial signature for the provided body.
+    #
+    # @param body [String, nil] response body candidate
+    # @return [Hash, nil] signature hash when matched, otherwise nil
+    def self.interstitial_signature_for(body)
+      normalized_body = normalize_body(body)
+      INTERSTITIAL_SIGNATURES.find { |signature| interstitial_signature_match?(normalized_body, signature) }
+    end
+
+    ##
+    # @param body [String, nil] response body candidate
+    # @return [Boolean] true when body matches a known interstitial signature
+    def self.interstitial?(body)
+      !interstitial_signature_for(body).nil?
+    end
+
+    def self.interstitial_signature_match?(body, signature)
+      min_matches = signature.fetch(:min_matches, 1)
+      matches = 0
+
+      signature.fetch(:patterns).each do |pattern|
+        matches += 1 if pattern.match?(body)
+        return true if matches >= min_matches
+      end
+
+      false
+    end
+    private_class_method :interstitial_signature_match?
+
+    def self.normalize_body(body)
+      body.to_s.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
+    rescue Encoding::CompatibilityError, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
+      body.to_s.force_encoding(Encoding::UTF_8).scrub
+    end
+    private_class_method :normalize_body
+  end
+end
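The `min_matches` threshold is the core of this contract: a single incidental hit (say, a page that merely mentions a Cloudflare Ray ID) is not enough to classify it as blocked. A small sketch with hypothetical response bodies:

    require 'html2rss'

    footer_only = '<html><body><p>Cloudflare Ray ID: 8a1b2c3d</p><p>Actual article text.</p></body></html>'
    interstitial = <<~HTML
      <html><head><title>Just a moment...</title></head>
      <body>Checking your browser before accessing example.com.</body></html>
    HTML

    Html2rss::BlockedSurface.interstitial?(footer_only)  # => false (one pattern, below min_matches: 2)
    Html2rss::BlockedSurface.interstitial?(interstitial) # => true (title and browser-check patterns both hit)

    signature = Html2rss::BlockedSurface.interstitial_signature_for(interstitial)
    warn signature[:message] if signature # the actionable Cloudflare guidance above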
data/lib/html2rss/category_extractor.rb
ADDED

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+module Html2rss
+  ##
+  # CategoryExtractor is responsible for extracting categories from HTML elements
+  # by looking for CSS class names containing common category-related terms.
+  class CategoryExtractor
+    # Common category-related terms to look for in class names
+    CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze
+
+    # CSS selectors to find elements with category-related class names
+    CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze
+
+    # Regex pattern for matching category-related attribute names
+    CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i
+
+    ##
+    # Extracts categories from the given article tag by looking for elements
+    # with class names containing common category-related terms.
+    #
+    # @param article_tag [Nokogiri::XML::Element] The article element to extract categories from
+    # @return [Array<String>] Array of category strings, empty if none found
+    def self.call(article_tag)
+      return [] unless article_tag
+
+      # Single optimized traversal that extracts all category types
+      extract_all_categories(article_tag)
+        .map(&:strip)
+        .reject(&:empty?)
+    end
+
+    ##
+    # Optimized single DOM traversal that extracts all category types.
+    #
+    # @param article_tag [Nokogiri::XML::Element] The article element
+    # @return [Set<String>] Set of category strings
+    def self.extract_all_categories(article_tag)
+      Set.new.tap do |categories|
+        article_tag.css('*').each do |element|
+          # Extract text categories from elements with category-related class names
+          categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
+
+          # Extract data categories from all elements
+          categories.merge(extract_element_data_categories(element))
+        end
+      end
+    end
+
+    ##
+    # Extracts categories from data attributes of a single element.
+    #
+    # @param element [Nokogiri::XML::Element] metadata element that may contain category links
+    # @return [Set<String>] Set of category strings
+    def self.extract_element_data_categories(element)
+      Set.new.tap do |categories|
+        element.attributes.each_value do |attr|
+          next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
+
+          value = attr.value&.strip
+          categories.add(value) if value && !value.empty?
+        end
+      end
+    end
+
+    ##
+    # Extracts text-based categories from elements, splitting content into discrete values.
+    #
+    # @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
+    # @return [Set<String>] Set of category strings
+    def self.extract_text_categories(element)
+      anchor_values = element.css('a').filter_map do |node|
+        HtmlExtractor.extract_visible_text(node)
+      end
+      return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?
+
+      text = HtmlExtractor.extract_visible_text(element)
+      return Set.new unless text
+
+      Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
+    end
+  end
+end
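In that traversal, elements whose class names match a category term contribute their anchor text, while any attribute whose name matches (such as a `data-topic`) contributes its value. A small sketch with hypothetical markup; the exact result assumes `HtmlExtractor.extract_visible_text` returns the anchors' visible text:

    require 'nokogiri'
    require 'html2rss'

    html = <<~HTML
      <article>
        <span class="post-category"><a href="/ruby">Ruby</a> <a href="/oss">OSS</a></span>
        <h2 data-topic="releases">html2rss 0.19.0</h2>
      </article>
    HTML

    article_tag = Nokogiri::HTML(html).at_css('article')
    Html2rss::CategoryExtractor.call(article_tag)
    # => ["Ruby", "OSS", "releases"]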