html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class WordpressApi
7
+ ##
8
+ # Resolves the WordPress posts endpoint for a given page and scope.
9
+ class PostsEndpoint
10
+ POSTS_PATH = 'wp/v2/posts'
11
+
12
+ ##
13
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
14
+ # @param page_url [Html2rss::Url] canonical page URL
15
+ # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
16
+ # @param posts_query [Hash<String, String>] query params for the posts request
17
+ # @param logger [Logger] logger used for operational warnings
18
+ # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
19
+ def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
20
+ new(parsed_body:, page_url:, page_scope:, posts_query:, logger:).call
21
+ end
22
+
23
+ ##
24
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
25
+ # @param page_url [Html2rss::Url] canonical page URL
26
+ # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
27
+ # @param posts_query [Hash<String, String>] query params for the posts request
28
+ # @param logger [Logger] logger used for operational warnings
29
+ def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
30
+ @parsed_body = parsed_body
31
+ @page_url = Html2rss::Url.from_absolute(page_url)
32
+ @page_scope = page_scope
33
+ @posts_query = posts_query
34
+ @logger = logger
35
+ end
36
+
37
+ ##
38
+ # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
39
+ def call
40
+ api_root = api_root_url
41
+ return unless api_root
42
+ return unless fetchable_page_scope?
43
+
44
+ query_style_api_root?(api_root) ? query_root_posts_url(api_root) : posts_collection_url(api_root)
45
+ end
46
+
47
+ private
48
+
49
+ attr_reader :parsed_body, :page_url, :page_scope, :posts_query, :logger
50
+
51
+ def api_root_url
52
+ href = parsed_body.at_css(WordpressApi::API_LINK_SELECTOR)&.[]('href').to_s.strip
53
+ return log_missing_api_root if href.empty?
54
+
55
+ Html2rss::Url.from_relative(href, page_url)
56
+ rescue Addressable::URI::InvalidURIError, ArgumentError => error
57
+ logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
58
+ nil
59
+ end
60
+
61
+ def fetchable_page_scope?
62
+ return true if page_scope.fetchable?
63
+
64
+ if page_scope.reason == :non_archive
65
+ logger.debug(
66
+ "#{WordpressApi}: page advertised WordPress API support " \
67
+ 'without a safe WordPress archive scope'
68
+ )
69
+ return false
70
+ end
71
+
72
+ logger.warn("#{WordpressApi}: unable to derive safe WordPress archive scope for #{page_url}")
73
+ false
74
+ end
75
+
76
+ def log_missing_api_root
77
+ logger.debug("#{WordpressApi}: page advertised WordPress API support without a usable API root")
78
+ nil
79
+ end
80
+
81
+ def query_style_api_root?(api_root)
82
+ api_root.query_values.key?('rest_route')
83
+ end
84
+
85
+ def query_root_posts_url(api_root)
86
+ query = api_root.query_values
87
+ route = normalized_rest_route(query.fetch('rest_route', '/'))
88
+ api_root.with_query_values(
89
+ query.merge(
90
+ 'rest_route' => append_posts_route(route),
91
+ **posts_query
92
+ )
93
+ )
94
+ end
95
+
96
+ def posts_collection_url(api_root)
97
+ Html2rss::Url.from_relative(POSTS_PATH, normalized_api_root(api_root))
98
+ .with_query_values(api_root.query_values.merge(posts_query))
99
+ end
100
+
101
+ def normalized_api_root(api_root)
102
+ api_root.with_path(normalized_api_path(api_root.path))
103
+ end
104
+
105
+ def normalized_api_path(path)
106
+ segments = path.to_s.split('/').reject(&:empty?)
107
+ normalized_path = "/#{segments.join('/')}"
108
+ normalized_path = '/' if normalized_path == '/'
109
+ normalized_path.end_with?('/') ? normalized_path : "#{normalized_path}/"
110
+ end
111
+
112
+ def normalized_rest_route(route)
113
+ value = route.to_s
114
+ value = '/' if value.empty?
115
+ value = "/#{value}" unless value.start_with?('/')
116
+ trim_trailing_slashes(value)
117
+ end
118
+
119
+ def trim_trailing_slashes(value)
120
+ end_index = value.length
121
+ end_index -= 1 while end_index > 1 && value.getbyte(end_index - 1) == 47
122
+ value[0, end_index]
123
+ end
124
+
125
+ def append_posts_route(route)
126
+ return '/wp/v2/posts' if route == '/'
127
+
128
+ "#{route}/wp/v2/posts"
129
+ end
130
+ end
131
+ end
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+ require 'nokogiri'
5
+
6
+ module Html2rss
7
+ class AutoSource
8
+ module Scraper
9
+ # Scrapes WordPress sites through their REST API instead of parsing article HTML.
10
+ class WordpressApi # rubocop:disable Metrics/ClassLength
11
+ include Enumerable
12
+
13
+ API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
14
+ CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
15
+ POSTS_FIELDS = %w[id title excerpt content link date categories].freeze
16
+ def self.options_key = :wordpress_api
17
+
18
+ ##
19
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
20
+ # @return [Boolean] whether the page advertises a WordPress REST API endpoint
21
+ def self.articles?(parsed_body)
22
+ return false unless parsed_body
23
+
24
+ !parsed_body.at_css(API_LINK_SELECTOR).nil?
25
+ end
26
+
27
+ ##
28
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
29
+ # @param url [String, Html2rss::Url] canonical page URL
30
+ # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
31
+ # @param _opts [Hash] unused scraper-specific options
32
+ # @return [void]
33
+ def initialize(parsed_body, url:, request_session: nil, **_opts)
34
+ @parsed_body = parsed_body
35
+ @url = Html2rss::Url.from_absolute(url)
36
+ @request_session = request_session
37
+ @page_scope = PageScope.from(parsed_body:, url: @url)
38
+ end
39
+
40
+ ##
41
+ # Yields article hashes from the WordPress posts API.
42
+ #
43
+ # @yieldparam article [Hash<Symbol, Object>] normalized article hash
44
+ # @return [Enumerator, void] enumerator when no block is given
45
+ def each
46
+ return enum_for(:each) unless block_given?
47
+ return unless (posts = fetch_posts)
48
+
49
+ posts.filter_map { article_from(_1) }.each { yield(_1) }
50
+ end
51
+
52
+ private
53
+
54
+ attr_reader :parsed_body, :url, :request_session, :page_scope
55
+
56
+ def fetch_posts
57
+ response = posts_response
58
+ return unless response
59
+
60
+ Array(response.parsed_body)
61
+ rescue RequestService::UnsupportedResponseContentType => error
62
+ Log.warn("#{self.class}: unsupported WordPress API posts content type (#{error.message})")
63
+ nil
64
+ rescue JSON::ParserError => error
65
+ Log.warn("#{self.class}: failed to parse WordPress API posts JSON (#{error.message})")
66
+ nil
67
+ end
68
+
69
+ def posts_response
70
+ return unless request_session
71
+ return unless (resolved_posts_url = posts_endpoint_url)
72
+
73
+ request_session.follow_up(
74
+ url: resolved_posts_url,
75
+ relation: :auto_source,
76
+ origin_url: url
77
+ )
78
+ rescue Html2rss::Error => error
79
+ Log.warn("#{self.class}: failed to fetch WordPress API posts (#{error.class}: #{error.message})")
80
+ nil
81
+ end
82
+
83
+ def article_from(post)
84
+ return unless post.is_a?(Hash)
85
+
86
+ article_url = article_url(post)
87
+ return unless article_url
88
+
89
+ article_attributes(post, article_url).compact
90
+ end
91
+
92
+ def article_url(post)
93
+ absolute_link(post[:link])
94
+ end
95
+
96
+ def article_id(_post, article_url)
97
+ root_path_query_id(article_url) || string(article_url.path) || article_url.to_s
98
+ end
99
+
100
+ def article_title(post)
101
+ rendered_text(post.dig(:title, :rendered))
102
+ end
103
+
104
+ def article_description(post)
105
+ rendered_html(post.dig(:content, :rendered)) || rendered_html(post.dig(:excerpt, :rendered))
106
+ end
107
+
108
+ def article_published_at(post)
109
+ string(post[:date])
110
+ end
111
+
112
+ def article_categories(post)
113
+ Array(post[:categories]).filter_map { |value| string(value) }
114
+ end
115
+
116
+ def article_attributes(post, article_url)
117
+ {
118
+ id: article_id(post, article_url),
119
+ title: article_title(post),
120
+ description: article_description(post),
121
+ url: article_url,
122
+ published_at: article_published_at(post),
123
+ categories: article_categories(post)
124
+ }
125
+ end
126
+
127
+ def absolute_link(link)
128
+ value = string(link)
129
+ return unless value
130
+
131
+ Html2rss::Url.from_relative(value, url)
132
+ rescue ArgumentError
133
+ nil
134
+ end
135
+
136
+ def rendered_text(value)
137
+ rendered_html(value)&.then { Nokogiri::HTML.fragment(_1).text.strip }
138
+ end
139
+
140
+ def rendered_html(value)
141
+ text = string(value)
142
+ text unless text.nil?
143
+ end
144
+
145
+ def string(value)
146
+ text = value.to_s.strip
147
+ text unless text.empty?
148
+ end
149
+
150
+ def root_path_query_id(article_url)
151
+ query = string(article_url.query)
152
+ return unless query
153
+
154
+ path = article_url.path.to_s
155
+ return unless path.empty? || path == '/'
156
+
157
+ "/?#{query}"
158
+ end
159
+
160
+ def posts_query
161
+ {
162
+ '_fields' => POSTS_FIELDS.join(','),
163
+ 'per_page' => '100'
164
+ }.merge(page_scope.query)
165
+ end
166
+
167
+ def posts_endpoint_url
168
+ PostsEndpoint.resolve(
169
+ parsed_body:,
170
+ page_url: url,
171
+ page_scope:,
172
+ posts_query:,
173
+ logger: Log
174
+ )
175
+ end
176
+ end
177
+ end
178
+ end
179
+ end
@@ -4,31 +4,165 @@ module Html2rss
4
4
  class AutoSource
5
5
  ##
6
6
  # The Scraper module contains all scrapers that can be used to extract articles.
7
- # Each scraper should implement a `call` method that returns an array of article hashes.
7
+ # Each scraper should implement an `each` method that yields article hashes.
8
8
  # Each scraper should also implement an `articles?` method that returns true if the scraper
9
9
  # can potentially be used to extract articles from the given HTML.
10
10
  #
11
+ # Detection is intentionally shallow for most scrapers, but instance-based
12
+ # matching is available for scrapers that need to carry expensive selection
13
+ # state forward into extraction.
14
+ # Scrapers run in parallel threads, so implementations must avoid shared
15
+ # mutable state and degrade by returning no articles when a follow-up would
16
+ # be unsafe or unsupported.
17
+ #
11
18
  module Scraper
19
+ APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
20
+ APP_SHELL_MAX_ANCHORS = 2
21
+ APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
22
+
12
23
  SCRAPERS = [
13
- Html,
24
+ WordpressApi,
14
25
  Schema,
15
- SemanticHtml
26
+ Microdata,
27
+ JsonState,
28
+ SemanticHtml,
29
+ Html
16
30
  ].freeze
17
31
 
18
32
  ##
19
33
  # Error raised when no suitable scraper is found.
20
- class NoScraperFound < Html2rss::Error; end
34
+ class NoScraperFound < Html2rss::Error
35
+ CATEGORY_MESSAGES = {
36
+ blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
37
+ 'Retry with --strategy browserless, try a more specific public listing URL, ' \
38
+ 'or run from an environment that can complete anti-bot checks.',
39
+ app_shell: 'No scrapers found: app-shell surface detected (client-rendered page with little or no ' \
40
+ 'server-rendered article HTML). Retry with --strategy browserless, or target a direct ' \
41
+ 'listing/update URL instead of a homepage or shell entrypoint.',
42
+ unsupported_surface: 'No scrapers found: unsupported extraction surface for auto mode. ' \
43
+ 'Try a direct listing/changelog/category URL, ' \
44
+ 'or use explicit selectors in a feed config.'
45
+ }.freeze
46
+
47
+ def initialize(message = nil, category: :unsupported_surface)
48
+ validate_category!(category)
49
+ @category = category
50
+ super(message || CATEGORY_MESSAGES.fetch(@category))
51
+ end
52
+
53
+ attr_reader :category
54
+
55
+ private
56
+
57
+ def validate_category!(category)
58
+ return if CATEGORY_MESSAGES.key?(category)
59
+
60
+ valid_categories = CATEGORY_MESSAGES.keys.join(', ')
61
+ raise ArgumentError, "Unknown category: #{category.inspect}. Valid categories are: #{valid_categories}"
62
+ end
63
+ end
21
64
 
22
65
  ##
23
- # Returns an array of scrapers that claim to find articles in the parsed body.
66
+ # Returns an array of scraper classes that claim to find articles in the parsed body.
24
67
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
68
+ # @param opts [Hash] The options hash.
25
69
  # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
26
- def self.from(parsed_body)
27
- scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
28
- raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty?
70
+ def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
71
+ scrapers = SCRAPERS.select { |scraper| opts.dig(scraper.options_key, :enabled) }
72
+ scrapers.select! { |scraper| scraper.articles?(parsed_body) }
73
+
74
+ raise no_scraper_found_for(parsed_body) if scrapers.empty?
29
75
 
30
76
  scrapers
31
77
  end
78
+
79
+ # Returns scraper instances ready for extraction.
80
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
81
+ # @param url [String, Html2rss::Url] The page url.
82
+ # @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
83
+ # @param opts [Hash] The options hash.
84
+ # @return [Array<Object>] An array of scraper instances that can handle the parsed body.
85
+ #
86
+ # `instances_for` is the main entrypoint for extraction. It lets a scraper
87
+ # decide whether it matches using the same instance that will later yield
88
+ # article hashes, which keeps precomputed state close to the scraper that
89
+ # owns it.
90
+ def self.instances_for(parsed_body, url:, request_session: nil,
91
+ opts: Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
92
+ instances = SCRAPERS.filter_map do |scraper|
93
+ next unless opts.dig(scraper.options_key, :enabled)
94
+
95
+ instance = scraper.new(parsed_body, url:, request_session:, **opts.fetch(scraper.options_key, {}))
96
+ next unless extractable_instance?(instance, parsed_body)
97
+
98
+ instance
99
+ end
100
+
101
+ raise no_scraper_found_for(parsed_body) if instances.empty?
102
+
103
+ instances
104
+ end
105
+
106
+ def self.extractable_instance?(instance, parsed_body)
107
+ return instance.extractable? if instance.respond_to?(:extractable?)
108
+
109
+ instance.class.articles?(parsed_body)
110
+ end
111
+ private_class_method :extractable_instance?
112
+
113
+ def self.no_scraper_found_for(parsed_body)
114
+ NoScraperFound.new(category: classify_no_scraper_surface(parsed_body))
115
+ end
116
+ private_class_method :no_scraper_found_for
117
+
118
+ def self.classify_no_scraper_surface(parsed_body)
119
+ return :blocked_surface if blocked_surface?(parsed_body)
120
+ return :app_shell if app_shell_surface?(parsed_body)
121
+
122
+ :unsupported_surface
123
+ end
124
+ private_class_method :classify_no_scraper_surface
125
+
126
+ def self.blocked_surface?(parsed_body)
127
+ Html2rss::BlockedSurface.interstitial?(parsed_body.to_html)
128
+ end
129
+ private_class_method :blocked_surface?
130
+
131
+ def self.app_shell_surface?(parsed_body)
132
+ root_marker = parsed_body.at_css(APP_SHELL_ROOT_SELECTORS)
133
+ return false unless root_marker
134
+
135
+ sparse_anchor_surface?(parsed_body) &&
136
+ no_article_markers?(parsed_body) &&
137
+ short_visible_text?(parsed_body)
138
+ end
139
+ private_class_method :app_shell_surface?
140
+
141
+ def self.sparse_anchor_surface?(parsed_body)
142
+ parsed_body.css('body a[href]').size <= APP_SHELL_MAX_ANCHORS
143
+ end
144
+ private_class_method :sparse_anchor_surface?
145
+
146
+ def self.no_article_markers?(parsed_body)
147
+ parsed_body.css(
148
+ 'article, main article, [itemtype*="Article"], [itemprop="articleBody"]'
149
+ ).empty?
150
+ end
151
+ private_class_method :no_article_markers?
152
+
153
+ def self.short_visible_text?(parsed_body)
154
+ visible_text_length(parsed_body) <= APP_SHELL_MAX_VISIBLE_TEXT_LENGTH
155
+ end
156
+ private_class_method :short_visible_text?
157
+
158
+ def self.visible_text_length(parsed_body)
159
+ body = parsed_body.at_css('body')
160
+ return 0 unless body
161
+
162
+ text_nodes = body.xpath('.//text()[not(ancestor::script or ancestor::style or ancestor::noscript)]')
163
+ text_nodes.map(&:text).join(' ').gsub(/\s+/, ' ').strip.length
164
+ end
165
+ private_class_method :visible_text_length
32
166
  end
33
167
  end
34
168
  end
@@ -1,73 +1,145 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'nokogiri'
4
3
  require 'parallel'
5
- require 'addressable'
4
+ require 'dry-validation'
6
5
 
7
6
  module Html2rss
8
7
  ##
9
- # The AutoSource class is responsible for extracting channel and articles
10
- # from a given URL.
11
- # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
12
- # marking articles, e.g. schema, microdata, open graph, etc.
8
+ # The AutoSource class automatically extracts articles from a given URL by
9
+ # utilizing a collection of Scrapers. These scrapers analyze and
10
+ # parse popular structured data formats—such as schema, microdata, and
11
+ # open graph—to identify and compile article elements into unified articles.
12
+ #
13
+ # Scrapers supporting plain HTML are also available for sites without structured data,
14
+ # though results may vary based on page markup.
15
+ #
16
+ # @see Html2rss::AutoSource::Scraper::Schema
17
+ # @see Html2rss::AutoSource::Scraper::SemanticHtml
18
+ # @see Html2rss::AutoSource::Scraper::Html
13
19
  class AutoSource
14
- class NoArticlesFound < Html2rss::Error; end
15
-
16
- ##
17
- # @param url [Addressable::URI] The URL to extract articles from.
18
- # @param body [String] The body of the response.
19
- # @param headers [Hash] The headers of the response.
20
- def initialize(url, body:, headers: {})
21
- @url = url
22
- @body = body
23
- @headers = headers
24
- end
25
-
26
- def build
27
- raise NoArticlesFound if articles.empty?
20
+ DEFAULT_CONFIG = {
21
+ scraper: {
22
+ wordpress_api: {
23
+ enabled: true
24
+ },
25
+ schema: {
26
+ enabled: true
27
+ },
28
+ microdata: {
29
+ enabled: true
30
+ },
31
+ json_state: {
32
+ enabled: true
33
+ },
34
+ semantic_html: {
35
+ enabled: true
36
+ },
37
+ html: {
38
+ enabled: true,
39
+ minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
40
+ use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
41
+ }
42
+ },
43
+ cleanup: Cleanup::DEFAULT_CONFIG
44
+ }.freeze
45
+
46
+ SCRAPER_CONFIG = proc do
47
+ optional(:wordpress_api).hash do
48
+ optional(:enabled).filled(:bool)
49
+ end
50
+ optional(:schema).hash do
51
+ optional(:enabled).filled(:bool)
52
+ end
53
+ optional(:microdata).hash do
54
+ optional(:enabled).filled(:bool)
55
+ end
56
+ optional(:json_state).hash do
57
+ optional(:enabled).filled(:bool)
58
+ end
59
+ optional(:semantic_html).hash do
60
+ optional(:enabled).filled(:bool)
61
+ end
62
+ optional(:html).hash do
63
+ optional(:enabled).filled(:bool)
64
+ optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
65
+ optional(:use_top_selectors).filled(:integer, gt?: 0)
66
+ end
67
+ end.freeze
68
+ private_constant :SCRAPER_CONFIG
28
69
 
29
- Reducer.call(articles, url:)
30
- Cleanup.call(articles, url:, keep_different_domain: true)
70
+ Config = Dry::Schema.Params do
71
+ optional(:scraper).hash(&SCRAPER_CONFIG)
31
72
 
32
- channel.articles = articles
73
+ optional(:cleanup).hash do
74
+ optional(:keep_different_domain).filled(:bool)
75
+ optional(:min_words_title).filled(:integer, gt?: 0)
76
+ end
77
+ end
33
78
 
34
- Html2rss::AutoSource::RssBuilder.new(
35
- channel:,
36
- articles:
37
- ).call
79
+ ##
80
+ # @param response [Html2rss::RequestService::Response] initial page response
81
+ # @param opts [Hash] validated auto-source options
82
+ # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
83
+ # @return [void]
84
+ def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
85
+ @parsed_body = response.parsed_body
86
+ @url = response.url
87
+ @opts = opts
88
+ @request_session = request_session
38
89
  end
39
90
 
91
+ ##
92
+ # Extracts article candidates by selecting every scraper that can explain the
93
+ # page shape, running those scrapers, and normalizing the resulting hashes
94
+ # into `RssBuilder::Article` objects.
95
+ #
96
+ # The contributor-facing flow is:
97
+ # 1. choose scraper instances that match the page
98
+ # 2. let each scraper collect its own candidates
99
+ # 3. clean and deduplicate the merged article list
100
+ #
101
+ # Scrapers with expensive precomputation, such as `SemanticHtml`, keep that
102
+ # state on the instance so detection and extraction can reuse the same work.
103
+ #
104
+ # @return [Array<Html2rss::RssBuilder::Article>] extracted articles
40
105
  def articles
41
- @articles ||= Scraper.from(parsed_body).flat_map do |scraper|
42
- instance = scraper.new(parsed_body, url:)
106
+ @articles ||= extract_articles
107
+ rescue Html2rss::AutoSource::Scraper::NoScraperFound => error
108
+ Log.warn "#{self.class}: no scraper matched #{url} (#{error.message})"
109
+ []
110
+ end
43
111
 
44
- articles_in_thread = Parallel.map(instance.each) do |article_hash|
45
- Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
112
+ private
46
113
 
47
- Article.new(**article_hash, scraper:)
48
- end
114
+ attr_reader :url, :parsed_body, :request_session
49
115
 
50
- Reducer.call(articles_in_thread, url:)
116
+ def extract_articles
117
+ scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
118
+ return [] if scraper_instances.empty?
51
119
 
52
- articles_in_thread
120
+ # Scrapers are instantiated and run in parallel threads. Implementations
121
+ # must avoid shared mutable state, treat request_session calls as
122
+ # concurrency-safe from the scraper side, and return no articles when a
123
+ # follow-up would be unsafe or unsupported.
124
+ articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
125
+ run_scraper(instance)
53
126
  end
127
+ Cleanup.call(articles, url:, **cleanup_options)
54
128
  end
55
129
 
56
- def channel
57
- @channel ||= Channel.new(parsed_body, headers: @headers, url:)
130
+ def run_scraper(instance)
131
+ instance.each.map do |article_hash|
132
+ RssBuilder::Article.new(**article_hash, scraper: instance.class)
133
+ end
58
134
  end
59
135
 
60
- private
61
-
62
- attr_reader :url
136
+ def cleanup_options
137
+ @opts.fetch(:cleanup, {})
138
+ end
63
139
 
64
- # @return [Nokogiri::HTML::Document]
65
- def parsed_body
66
- @parsed_body ||= Nokogiri.HTML(@body)
67
- .tap do |doc|
68
- # Remove comments from the document
69
- doc.xpath('//comment()').each(&:remove)
70
- end.freeze
140
+ def thread_count_for(scrapers)
141
+ count = [scrapers.size, Parallel.processor_count].min
142
+ count.zero? ? 1 : count
71
143
  end
72
144
  end
73
145
  end