html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -4,31 +4,183 @@ module Html2rss
4
4
  class AutoSource
5
5
  ##
6
6
  # The Scraper module contains all scrapers that can be used to extract articles.
7
- # Each scraper should implement a `call` method that returns an array of article hashes.
7
+ # Each scraper should implement an `each` method that yields article hashes.
8
8
  # Each scraper should also implement an `articles?` method that returns true if the scraper
9
9
  # can potentially be used to extract articles from the given HTML.
10
10
  #
11
+ # Detection is intentionally shallow for most scrapers, but instance-based
12
+ # matching is available for scrapers that need to carry expensive selection
13
+ # state forward into extraction.
14
+ # Scrapers run in parallel threads, so implementations must avoid shared
15
+ # mutable state and degrade by returning no articles when a follow-up would
16
+ # be unsafe or unsupported.
11
17
  module Scraper
18
+ # Root markers indicating likely app-shell/client-rendered surfaces.
19
+ APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
20
+ # Maximum anchors tolerated before app-shell detection is considered unlikely.
21
+ APP_SHELL_MAX_ANCHORS = 2
22
+ # Maximum visible text length tolerated for app-shell classification.
23
+ APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
24
+
25
+ # Ordered scraper classes considered during auto-source extraction.
12
26
  SCRAPERS = [
13
- Html,
27
+ WordpressApi,
14
28
  Schema,
15
- SemanticHtml
29
+ Microdata,
30
+ JsonState,
31
+ SemanticHtml,
32
+ Html
16
33
  ].freeze
17
34
 
18
35
  ##
19
36
  # Error raised when no suitable scraper is found.
20
- class NoScraperFound < Html2rss::Error; end
37
+ class NoScraperFound < Html2rss::Error
38
+ # User-facing messages grouped by no-scraper surface category.
39
+ CATEGORY_MESSAGES = {
40
+ blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
41
+ 'Retry with --strategy browserless, try a more specific public listing URL, ' \
42
+ 'or run from an environment that can complete anti-bot checks.',
43
+ app_shell: 'No scrapers found: app-shell surface detected (client-rendered page with little or no ' \
44
+ 'server-rendered article HTML). Retry with --strategy browserless, or target a direct ' \
45
+ 'listing/update URL instead of a homepage or shell entrypoint.',
46
+ unsupported_surface: 'No scrapers found: unsupported extraction surface for auto mode. ' \
47
+ 'Try a direct listing/changelog/category URL, ' \
48
+ 'or use explicit selectors in a feed config.'
49
+ }.freeze
50
+
51
+ # @param message [String, nil] custom error message override
52
+ # @param category [Symbol] no-scraper classification
53
+ def initialize(message = nil, category: :unsupported_surface)
54
+ validate_category!(category)
55
+ @category = category
56
+ super(message || CATEGORY_MESSAGES.fetch(@category))
57
+ end
58
+
59
+ attr_reader :category
60
+
61
+ private
62
+
63
+ def validate_category!(category)
64
+ return if CATEGORY_MESSAGES.key?(category)
65
+
66
+ valid_categories = CATEGORY_MESSAGES.keys.join(', ')
67
+ raise ArgumentError, "Unknown category: #{category.inspect}. Valid categories are: #{valid_categories}"
68
+ end
69
+ end
21
70
 
22
71
  ##
23
- # Returns an array of scrapers that claim to find articles in the parsed body.
72
+ # Returns an array of scraper classes that claim to find articles in the parsed body.
24
73
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
74
+ # @param opts [Hash] The options hash.
75
+ # @option opts [Hash] :wordpress_api scraper toggle and configuration
76
+ # @option opts [Hash] :schema scraper toggle and configuration
77
+ # @option opts [Hash] :microdata scraper toggle and configuration
78
+ # @option opts [Hash] :json_state scraper toggle and configuration
79
+ # @option opts [Hash] :semantic_html scraper toggle and configuration
80
+ # @option opts [Hash] :html scraper toggle and configuration
25
81
  # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
26
- def self.from(parsed_body)
27
- scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
28
- raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
82
+ def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
83
+ scrapers = SCRAPERS.select { |scraper| opts.dig(scraper.options_key, :enabled) }
84
+ scrapers.select! { |scraper| scraper.articles?(parsed_body) }
85
+
86
+ raise no_scraper_found_for(parsed_body) if scrapers.empty?
29
87
 
30
88
  scrapers
31
89
  end
90
+
91
+ # Returns scraper instances ready for extraction.
92
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
93
+ # @param url [String, Html2rss::Url] The page url.
94
+ # @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
95
+ # @param opts [Hash] The options hash.
96
+ # @option opts [Hash] :wordpress_api scraper toggle and configuration
97
+ # @option opts [Hash] :schema scraper toggle and configuration
98
+ # @option opts [Hash] :microdata scraper toggle and configuration
99
+ # @option opts [Hash] :json_state scraper toggle and configuration
100
+ # @option opts [Hash] :semantic_html scraper toggle and configuration
101
+ # @option opts [Hash] :html scraper toggle and configuration
102
+ # @return [Array<Object>] An array of scraper instances that can handle the parsed body.
103
+ #
104
+ # `instances_for` is the main entrypoint for extraction. It lets a scraper
105
+ # decide whether it matches using the same instance that will later yield
106
+ # article hashes, which keeps precomputed state close to the scraper that
107
+ # owns it.
108
+ def self.instances_for(parsed_body, url:, request_session: nil,
109
+ opts: Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
110
+ instances = SCRAPERS.filter_map do |scraper|
111
+ next unless opts.dig(scraper.options_key, :enabled)
112
+
113
+ instance = scraper.new(parsed_body, url:, request_session:, **opts.fetch(scraper.options_key, {}))
114
+ next unless extractable_instance?(instance, parsed_body)
115
+
116
+ instance
117
+ end
118
+
119
+ raise no_scraper_found_for(parsed_body) if instances.empty?
120
+
121
+ instances
122
+ end
123
+
124
+ def self.extractable_instance?(instance, parsed_body)
125
+ return instance.extractable? if instance.respond_to?(:extractable?)
126
+
127
+ instance.class.articles?(parsed_body)
128
+ end
129
+ private_class_method :extractable_instance?
130
+
131
+ def self.no_scraper_found_for(parsed_body)
132
+ NoScraperFound.new(category: classify_no_scraper_surface(parsed_body))
133
+ end
134
+ private_class_method :no_scraper_found_for
135
+
136
+ def self.classify_no_scraper_surface(parsed_body)
137
+ return :blocked_surface if blocked_surface?(parsed_body)
138
+ return :app_shell if app_shell_surface?(parsed_body)
139
+
140
+ :unsupported_surface
141
+ end
142
+ private_class_method :classify_no_scraper_surface
143
+
144
+ def self.blocked_surface?(parsed_body)
145
+ Html2rss::BlockedSurface.interstitial?(parsed_body.to_html)
146
+ end
147
+ private_class_method :blocked_surface?
148
+
149
+ def self.app_shell_surface?(parsed_body)
150
+ root_marker = parsed_body.at_css(APP_SHELL_ROOT_SELECTORS)
151
+ return false unless root_marker
152
+
153
+ sparse_anchor_surface?(parsed_body) &&
154
+ no_article_markers?(parsed_body) &&
155
+ short_visible_text?(parsed_body)
156
+ end
157
+ private_class_method :app_shell_surface?
158
+
159
+ def self.sparse_anchor_surface?(parsed_body)
160
+ parsed_body.css('body a[href]').size <= APP_SHELL_MAX_ANCHORS
161
+ end
162
+ private_class_method :sparse_anchor_surface?
163
+
164
+ def self.no_article_markers?(parsed_body)
165
+ parsed_body.css(
166
+ 'article, main article, [itemtype*="Article"], [itemprop="articleBody"]'
167
+ ).empty?
168
+ end
169
+ private_class_method :no_article_markers?
170
+
171
+ def self.short_visible_text?(parsed_body)
172
+ visible_text_length(parsed_body) <= APP_SHELL_MAX_VISIBLE_TEXT_LENGTH
173
+ end
174
+ private_class_method :short_visible_text?
175
+
176
+ def self.visible_text_length(parsed_body)
177
+ body = parsed_body.at_css('body')
178
+ return 0 unless body
179
+
180
+ text_nodes = body.xpath('.//text()[not(ancestor::script or ancestor::style or ancestor::noscript)]')
181
+ text_nodes.map(&:text).join(' ').gsub(/\s+/, ' ').strip.length
182
+ end
183
+ private_class_method :visible_text_length
32
184
  end
33
185
  end
34
186
  end
@@ -1,73 +1,149 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'nokogiri'
4
3
  require 'parallel'
5
- require 'addressable'
4
+ require 'dry-validation'
6
5
 
7
6
  module Html2rss
8
7
  ##
9
- # The AutoSource class is responsible for extracting channel and articles
10
- # from a given URL.
11
- # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
12
- # marking articles, e.g. schema, microdata, open graph, etc.
8
+ # The AutoSource class automatically extracts articles from a given URL by
9
+ # utilizing a collection of Scrapers. These scrapers analyze and
10
+ # parse popular structured data formats—such as schema, microdata, and
11
+ # open graph—to identify and compile article elements into unified articles.
12
+ #
13
+ # Scrapers supporting plain HTML are also available for sites without structured data,
14
+ # though results may vary based on page markup.
15
+ #
16
+ # @see Html2rss::AutoSource::Scraper::Schema
17
+ # @see Html2rss::AutoSource::Scraper::SemanticHtml
18
+ # @see Html2rss::AutoSource::Scraper::Html
13
19
  class AutoSource
14
- class NoArticlesFound < Html2rss::Error; end
15
-
16
- ##
17
- # @param url [Addressable::URI] The URL to extract articles from.
18
- # @param body [String] The body of the response.
19
- # @param headers [Hash] The headers of the response.
20
- def initialize(url, body:, headers: {})
21
- @url = url
22
- @body = body
23
- @headers = headers
24
- end
25
-
26
- def build
27
- raise NoArticlesFound if articles.empty?
20
+ # Default auto-source configuration shipped for scraper and cleanup behavior.
21
+ DEFAULT_CONFIG = {
22
+ scraper: {
23
+ wordpress_api: {
24
+ enabled: true
25
+ },
26
+ schema: {
27
+ enabled: true
28
+ },
29
+ microdata: {
30
+ enabled: true
31
+ },
32
+ json_state: {
33
+ enabled: true
34
+ },
35
+ semantic_html: {
36
+ enabled: true
37
+ },
38
+ html: {
39
+ enabled: true,
40
+ minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
41
+ use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
42
+ }
43
+ },
44
+ cleanup: Cleanup::DEFAULT_CONFIG
45
+ }.freeze
46
+
47
+ SCRAPER_CONFIG = proc do
48
+ optional(:wordpress_api).hash do
49
+ optional(:enabled).filled(:bool)
50
+ end
51
+ optional(:schema).hash do
52
+ optional(:enabled).filled(:bool)
53
+ end
54
+ optional(:microdata).hash do
55
+ optional(:enabled).filled(:bool)
56
+ end
57
+ optional(:json_state).hash do
58
+ optional(:enabled).filled(:bool)
59
+ end
60
+ optional(:semantic_html).hash do
61
+ optional(:enabled).filled(:bool)
62
+ end
63
+ optional(:html).hash do
64
+ optional(:enabled).filled(:bool)
65
+ optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
66
+ optional(:use_top_selectors).filled(:integer, gt?: 0)
67
+ end
68
+ end.freeze
69
+ private_constant :SCRAPER_CONFIG
28
70
 
29
- Reducer.call(articles, url:)
30
- Cleanup.call(articles, url:, keep_different_domain: true)
71
+ # Runtime schema used to validate auto-source config values.
72
+ Config = Dry::Schema.Params do
73
+ optional(:scraper).hash(&SCRAPER_CONFIG)
31
74
 
32
- channel.articles = articles
75
+ optional(:cleanup).hash do
76
+ optional(:keep_different_domain).filled(:bool)
77
+ optional(:min_words_title).filled(:integer, gt?: 0)
78
+ end
79
+ end
33
80
 
34
- Html2rss::AutoSource::RssBuilder.new(
35
- channel:,
36
- articles:
37
- ).call
81
+ ##
82
+ # @param response [Html2rss::RequestService::Response] initial page response
83
+ # @param opts [Hash] validated auto-source options
84
+ # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
85
+ # @option opts [Hash] :scraper scraper configuration map
86
+ # @option opts [Hash] :cleanup cleanup configuration map
87
+ # @return [void]
88
+ def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
89
+ @parsed_body = response.parsed_body
90
+ @url = response.url
91
+ @opts = opts
92
+ @request_session = request_session
38
93
  end
39
94
 
95
+ ##
96
+ # Extracts article candidates by selecting every scraper that can explain the
97
+ # page shape, running those scrapers, and normalizing the resulting hashes
98
+ # into `RssBuilder::Article` objects.
99
+ #
100
+ # The contributor-facing flow is:
101
+ # 1. choose scraper instances that match the page
102
+ # 2. let each scraper collect its own candidates
103
+ # 3. clean and deduplicate the merged article list
104
+ #
105
+ # Scrapers with expensive precomputation, such as `SemanticHtml`, keep that
106
+ # state on the instance so detection and extraction can reuse the same work.
107
+ #
108
+ # @return [Array<Html2rss::RssBuilder::Article>] extracted articles
40
109
  def articles
41
- @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
42
- instance = scraper.new(parsed_body, url:)
110
+ @articles ||= extract_articles
111
+ rescue Html2rss::AutoSource::Scraper::NoScraperFound => error
112
+ Log.warn "#{self.class}: no scraper matched #{url} (#{error.message})"
113
+ []
114
+ end
43
115
 
44
- articles_in_thread = Parallel.map(instance.each) do |article_hash|
45
- Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
116
+ private
46
117
 
47
- Article.new(**article_hash, scraper:)
48
- end
118
+ attr_reader :url, :parsed_body, :request_session
49
119
 
50
- Reducer.call(articles_in_thread, url:)
120
+ def extract_articles
121
+ scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
122
+ return [] if scraper_instances.empty?
51
123
 
52
- articles_in_thread
124
+ # Scrapers are instantiated and run in parallel threads. Implementations
125
+ # must avoid shared mutable state, treat request_session calls as
126
+ # concurrency-safe from the scraper side, and return no articles when a
127
+ # follow-up would be unsafe or unsupported.
128
+ articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
129
+ run_scraper(instance)
53
130
  end
131
+ Cleanup.call(articles, url:, **cleanup_options)
54
132
  end
55
133
 
56
- def channel
57
- @channel ||= Channel.new(parsed_body, headers: @headers, url:)
134
+ def run_scraper(instance)
135
+ instance.each.map do |article_hash|
136
+ RssBuilder::Article.new(**article_hash, scraper: instance.class)
137
+ end
58
138
  end
59
139
 
60
- private
61
-
62
- attr_reader :url
140
+ def cleanup_options
141
+ @opts.fetch(:cleanup, {})
142
+ end
63
143
 
64
- # @return [Nokogiri::HTML::Document]
65
- def parsed_body
66
- @parsed_body ||= Nokogiri.HTML(@body)
67
- .tap do |doc|
68
- # Remove comments from the document
69
- doc.xpath('//comment()').each(&:remove)
70
- end.freeze
144
+ def thread_count_for(scrapers)
145
+ count = [scrapers.size, Parallel.processor_count].min
146
+ count.zero? ? 1 : count
71
147
  end
72
148
  end
73
149
  end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ ##
5
+ # Shared anti-bot/interstitial signatures used by request and auto-source flows.
6
+ #
7
+ # This module centralizes signature matching so request-time guards and
8
+ # auto-source surface classification stay consistent.
9
+ module BlockedSurface
10
+ # Known interstitial fingerprints used to detect blocked or anti-bot surfaces.
11
+ INTERSTITIAL_SIGNATURES = [
12
+ {
13
+ key: :cloudflare_interstitial,
14
+ min_matches: 2,
15
+ patterns: [
16
+ %r{<title>\s*just a moment\.\.\.\s*</title>}i,
17
+ /checking your browser before accessing/i,
18
+ /please (?:enable|turn on) javascript and cookies/i,
19
+ %r{cdn-cgi/challenge-platform}i,
20
+ /cloudflare ray id/i
21
+ ],
22
+ message: 'Blocked surface detected: Cloudflare anti-bot interstitial page. ' \
23
+ 'Retry with --strategy browserless, try a more specific public listing URL, ' \
24
+ 'or run from an environment that can complete anti-bot checks.'
25
+ }
26
+ ].freeze
27
+
28
+ ##
29
+ # Returns the first matching interstitial signature for the provided body.
30
+ #
31
+ # @param body [String, nil] response body candidate
32
+ # @return [Hash, nil] signature hash when matched, otherwise nil
33
+ def self.interstitial_signature_for(body)
34
+ normalized_body = normalize_body(body)
35
+ INTERSTITIAL_SIGNATURES.find { |signature| interstitial_signature_match?(normalized_body, signature) }
36
+ end
37
+
38
+ ##
39
+ # @param body [String, nil] response body candidate
40
+ # @return [Boolean] true when body matches a known interstitial signature
41
+ def self.interstitial?(body)
42
+ !interstitial_signature_for(body).nil?
43
+ end
44
+
45
+ def self.interstitial_signature_match?(body, signature)
46
+ min_matches = signature.fetch(:min_matches, 1)
47
+ matches = 0
48
+
49
+ signature.fetch(:patterns).each do |pattern|
50
+ matches += 1 if pattern.match?(body)
51
+ return true if matches >= min_matches
52
+ end
53
+
54
+ false
55
+ end
56
+ private_class_method :interstitial_signature_match?
57
+
58
+ def self.normalize_body(body)
59
+ body.to_s.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
60
+ rescue Encoding::CompatibilityError, Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
61
+ body.to_s.force_encoding(Encoding::UTF_8).scrub
62
+ end
63
+ private_class_method :normalize_body
64
+ end
65
+ end
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ ##
5
+ # CategoryExtractor is responsible for extracting categories from HTML elements
6
+ # by looking for CSS class names containing common category-related terms.
7
+ class CategoryExtractor
8
+ # Common category-related terms to look for in class names
9
+ CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze
10
+
11
+ # CSS selectors to find elements with category-related class names
12
+ CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze
13
+
14
+ # Regex pattern for matching category-related attribute names
15
+ CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i
16
+
17
+ ##
18
+ # Extracts categories from the given article tag by looking for elements
19
+ # with class names containing common category-related terms.
20
+ #
21
+ # @param article_tag [Nokogiri::XML::Element] The article element to extract categories from
22
+ # @return [Array<String>] Array of category strings, empty if none found
23
+ def self.call(article_tag)
24
+ return [] unless article_tag
25
+
26
+ # Single optimized traversal that extracts all category types
27
+ extract_all_categories(article_tag)
28
+ .map(&:strip)
29
+ .reject(&:empty?)
30
+ end
31
+
32
+ ##
33
+ # Optimized single DOM traversal that extracts all category types.
34
+ #
35
+ # @param article_tag [Nokogiri::XML::Element] The article element
36
+ # @return [Set<String>] Set of category strings
37
+ def self.extract_all_categories(article_tag)
38
+ Set.new.tap do |categories|
39
+ article_tag.css('*').each do |element|
40
+ # Extract text categories from elements with category-related class names
41
+ categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
42
+
43
+ # Extract data categories from all elements
44
+ categories.merge(extract_element_data_categories(element))
45
+ end
46
+ end
47
+ end
48
+
49
+ ##
50
+ # Extracts categories from data attributes of a single element.
51
+ #
52
+ # @param element [Nokogiri::XML::Element] metadata element that may contain category links
53
+ # @return [Set<String>] Set of category strings
54
+ def self.extract_element_data_categories(element)
55
+ Set.new.tap do |categories|
56
+ element.attributes.each_value do |attr|
57
+ next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
58
+
59
+ value = attr.value&.strip
60
+ categories.add(value) if value && !value.empty?
61
+ end
62
+ end
63
+ end
64
+
65
+ ##
66
+ # Extracts text-based categories from elements, splitting content into discrete values.
67
+ #
68
+ # @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
69
+ # @return [Set<String>] Set of category strings
70
+ def self.extract_text_categories(element)
71
+ anchor_values = element.css('a').filter_map do |node|
72
+ HtmlExtractor.extract_visible_text(node)
73
+ end
74
+ return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?
75
+
76
+ text = HtmlExtractor.extract_visible_text(element)
77
+ return Set.new unless text
78
+
79
+ Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
80
+ end
81
+ end
82
+ end