html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class SemanticHtml
7
+ ##
8
+ # Selects the best content-like anchor from a semantic container.
9
+ #
10
+ # The selector turns raw DOM anchors into ranked facts so semantic
11
+ # scraping can reason about link intent instead of DOM order. It favors
12
+ # heading-aligned article links and suppresses utility links, duplicate
13
+ # destinations, and weak textless affordances.
14
+ class AnchorSelector # rubocop:disable Metrics/ClassLength
15
+ AnchorFacts = Data.define(
16
+ :anchor,
17
+ :text,
18
+ :url,
19
+ :destination,
20
+ :segments,
21
+ :meaningful_text,
22
+ :content_like_destination,
23
+ :heading_anchor,
24
+ :heading_text_match,
25
+ :score
26
+ )
27
+
28
+ HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
29
+ UTILITY_PATH_SEGMENTS = %w[
30
+ about account author category comment comments contact feedback help
31
+ login newsletter profile register search settings share signup subscribe
32
+ topic topics view-all archive archives
33
+ feed feeds
34
+ recommended
35
+ for-you
36
+ preference preferences
37
+ notification notifications
38
+ privacy terms
39
+ cookie cookies
40
+ logout
41
+ user users
42
+ ].to_set.freeze
43
+ CONTENT_PATH_SEGMENTS = %w[
44
+ article articles news post posts story stories update updates
45
+ ].to_set.freeze
46
+ UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
47
+
48
+ def initialize(base_url)
49
+ @base_url = base_url
50
+ end
51
+
52
+ ##
53
+ # Chooses the single anchor that best represents the story contained
54
+ # in a semantic block.
55
+ #
56
+ # Ranking is scoped to one container at a time. That keeps the logic
57
+ # local, makes duplicate links to the same destination collapse into
58
+ # one candidate, and avoids page-wide heuristics leaking across cards.
59
+ #
60
+ # @param container [Nokogiri::XML::Element] semantic container being evaluated
61
+ # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
62
+ def primary_anchor_for(container)
63
+ facts_for(container).max_by(&:score)&.anchor
64
+ end
65
+
66
+ private
67
+
68
+ attr_reader :base_url
69
+
70
+ def facts_for(container)
71
+ heading = heading_for(container)
72
+ heading_text = visible_text(heading)
73
+
74
+ container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
75
+ next if anchor.path.match?(Html::TAGS_TO_IGNORE)
76
+
77
+ facts = build_facts(anchor, heading, heading_text)
78
+ next unless facts
79
+
80
+ keep_stronger_fact(best_by_destination, facts)
81
+ end.values
82
+ end
83
+
84
+ def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
85
+ text = visible_text(anchor)
86
+ meaningful_text = meaningful_text?(text)
87
+ ancestors = anchor.ancestors.to_a
88
+ url = normalized_destination(anchor)
89
+ return unless url
90
+
91
+ segments = url.path_segments
92
+ content_like_destination = content_like_destination?(segments)
93
+ return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
94
+
95
+ heading_anchor = heading_anchor?(ancestors, heading)
96
+ heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
97
+ return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
98
+
99
+ AnchorFacts.new(
100
+ anchor:,
101
+ text:,
102
+ url:,
103
+ destination: url.to_s,
104
+ segments:,
105
+ meaningful_text:,
106
+ content_like_destination:,
107
+ heading_anchor:,
108
+ heading_text_match:,
109
+ score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
110
+ )
111
+ end
112
+
113
+ def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
114
+ utility_destination?(segments) ||
115
+ utility_text?(text) ||
116
+ icon_only_anchor?(anchor, meaningful_text) ||
117
+ utility_landmark_anchor?(ancestors)
118
+ end
119
+
120
+ def keep_stronger_fact(best_by_destination, facts)
121
+ current = best_by_destination[facts.destination]
122
+ return best_by_destination[facts.destination] = facts unless current
123
+ return if current.score >= facts.score
124
+
125
+ best_by_destination[facts.destination] = facts
126
+ end
127
+
128
+ def content_like_anchor?(meaningful_text, content_like_destination)
129
+ meaningful_text || content_like_destination
130
+ end
131
+
132
+ def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
133
+ score = 0
134
+ score += 100 if heading_anchor
135
+ score += 20 if heading_text_match
136
+ score += 10 if meaningful_text
137
+ score += 10 if content_like_destination
138
+ score
139
+ end
140
+
141
+ def heading_anchor?(ancestors, heading)
142
+ heading && ancestors.include?(heading)
143
+ end
144
+
145
+ def heading_text_match?(heading_text, text, meaningful_text)
146
+ meaningful_text && meaningful_text?(heading_text) && heading_text == text
147
+ end
148
+
149
+ def heading_for(container)
150
+ container.at_css(HEADING_SELECTOR)
151
+ end
152
+
153
+ def icon_only_anchor?(anchor, meaningful_text)
154
+ !meaningful_text && anchor.at_css('img, svg')
155
+ end
156
+
157
+ def utility_destination?(segments)
158
+ segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
159
+ end
160
+
161
+ def content_like_destination?(segments)
162
+ segments.any? do |segment|
163
+ CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
164
+ end
165
+ end
166
+
167
+ def normalized_destination(anchor)
168
+ href = anchor['href'].to_s.split('#').first.to_s.strip
169
+ return if href.empty?
170
+
171
+ Html2rss::Url.from_relative(href, base_url)
172
+ rescue ArgumentError
173
+ nil
174
+ end
175
+
176
+ def meaningful_text?(text)
177
+ text.scan(/\p{Alnum}+/).any?
178
+ end
179
+
180
+ def utility_text?(text)
181
+ text.match?(
182
+ /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
183
+ )
184
+ end
185
+
186
+ def utility_landmark_anchor?(ancestors)
187
+ ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
188
+ end
189
+
190
+ def visible_text(node)
191
+ return '' unless node
192
+
193
+ HtmlExtractor.extract_visible_text(node).to_s.strip
194
+ end
195
+ end
196
+ end
197
+ end
198
+ end
199
+ end
@@ -1,114 +1,120 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'addressable'
4
- require 'parallel'
3
+ require_relative 'semantic_html/anchor_selector'
5
4
 
6
5
  module Html2rss
7
6
  class AutoSource
8
7
  module Scraper
9
8
  ##
10
- # Scrapes articles by looking for common markup tags (article, section, li)
11
- # containing an <a href> tag.
9
+ # Scrapes semantic containers by choosing one primary content link per
10
+ # block before extraction.
12
11
  #
13
- # See:
14
- # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
12
+ # This scraper is intentionally container-first:
13
+ # 1. collect candidate semantic containers once
14
+ # 2. select the strongest content-like anchor within each container
15
+ # 3. extract fields from the container while honoring that anchor choice
16
+ #
17
+ # The result is lower recall on weak-signal blocks, but much better link
18
+ # quality on modern teaser cards that mix headlines, utility links, and
19
+ # duplicate image overlays.
15
20
  class SemanticHtml
16
21
  include Enumerable
17
22
 
23
+ Entry = Data.define(:container, :selected_anchor)
24
+
25
+ CONTAINER_SELECTORS = [
26
+ 'article:not(:has(article))',
27
+ 'section:not(:has(section))',
28
+ 'li:not(:has(li))',
29
+ 'tr:not(:has(tr))',
30
+ 'div:not(:has(div))'
31
+ ].freeze
32
+
18
33
  ##
19
- # Map of parent element names to CSS selectors for finding <a href> tags.
20
- ANCHOR_TAG_SELECTORS = {
21
- 'section' => ['section :not(section) a[href]'],
22
- 'tr' => ['table tr :not(tr) a[href]'],
23
- 'article' => [
24
- 'article :not(article) a[href]',
25
- 'article a[href]'
26
- ],
27
- 'li' => [
28
- 'ul > li :not(li) a[href]',
29
- 'ol > li :not(li) a[href]'
30
- ]
31
- }.freeze
32
-
33
- # Check if the parsed_body contains articles
34
- # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
35
- # @return [Boolean] True if articles are found, otherwise false.
34
+ # @return [Symbol] config key used to enable or configure this scraper
35
+ def self.options_key = :semantic_html
36
+
37
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
38
+ # @return [Boolean] true when at least one semantic container has an eligible anchor
36
39
  def self.articles?(parsed_body)
37
40
  return false unless parsed_body
38
41
 
39
- ANCHOR_TAG_SELECTORS.each_value do |selectors|
40
- return true if selectors.any? { |selector| parsed_body.at_css(selector) }
41
- end
42
- false
42
+ new(parsed_body, url: 'https://example.com').extractable?
43
+ end
44
+
45
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
46
+ # @param url [String, Html2rss::Url] base url
47
+ # @param extractor [Class] extractor class used for article extraction
48
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
49
+ @parsed_body = parsed_body
50
+ @url = url
51
+ @extractor = extractor
52
+ @anchor_selector = AnchorSelector.new(url)
43
53
  end
44
54
 
45
- # Finds the closest ancestor tag matching the specified tag name
46
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
47
- # @param tag_name [String] The tag name to search for
48
- # @param stop_tag [String] The tag name to stop searching at
49
- # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
50
- def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
51
- return current_tag if current_tag.name == tag_name
55
+ attr_reader :parsed_body
52
56
 
53
- stop_tags = Set[tag_name, stop_tag]
57
+ ##
58
+ # Yields extracted article hashes for each semantic container that
59
+ # survives anchor selection.
60
+ #
61
+ # Detection and extraction share the same memoized entry list so this
62
+ # scraper does not rerun anchor ranking once a page has already been
63
+ # accepted as extractable.
64
+ #
65
+ # @yieldparam article_hash [Hash] extracted article hash
66
+ # @return [Enumerator<Hash>]
67
+ def each
68
+ return enum_for(:each) unless block_given?
54
69
 
55
- while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
56
- current_tag = current_tag.parent
70
+ extractable_entries.each do |entry|
71
+ article_hash = @extractor.new(
72
+ entry.container,
73
+ base_url: @url,
74
+ selected_anchor: entry.selected_anchor
75
+ ).call
76
+ yield article_hash if article_hash
57
77
  end
58
-
59
- current_tag
60
78
  end
61
79
 
62
- # Finds the closest matching selector upwards in the DOM tree
63
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
64
- # @param selector [String] The CSS selector to search for
65
- # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
66
- def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
67
- current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
80
+ ##
81
+ # Reports whether the page contains at least one semantic container with
82
+ # a selectable primary anchor.
83
+ #
84
+ # @return [Boolean] true when at least one candidate container yields a primary anchor
85
+ def extractable?
86
+ extractable_entries.any?
68
87
  end
69
88
 
70
- # Helper method to find a matching selector upwards
71
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
72
- # @param selector [String] The CSS selector to search for
73
- # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
74
- def self.find_closest_selector_upwards(current_tag, selector:)
75
- while current_tag
76
- found = current_tag.at_css(selector)
77
- return found if found
78
-
79
- return nil unless current_tag.respond_to?(:parent)
89
+ protected
80
90
 
81
- current_tag = current_tag.parent
82
- end
91
+ def candidate_containers
92
+ @candidate_containers ||= collect_candidate_containers
83
93
  end
84
94
 
85
- # Returns an array of [tag_name, selector] pairs
86
- # @return [Array<[String, String]>] Array of tag name and selector pairs
87
- def self.anchor_tag_selector_pairs
88
- ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
89
- selectors.map { |selector| [tag_name, selector] }
90
- end
95
+ def primary_anchor_for(container)
96
+ @anchor_selector.primary_anchor_for(container)
91
97
  end
92
98
 
93
- def initialize(parsed_body, url:)
94
- @parsed_body = parsed_body
95
- @url = url
96
- end
99
+ def extractable_entries
100
+ @extractable_entries ||= candidate_containers.filter_map do |container|
101
+ selected_anchor = primary_anchor_for(container)
102
+ next unless selected_anchor
97
103
 
98
- attr_reader :parsed_body
104
+ Entry.new(container:, selected_anchor:)
105
+ end
106
+ end
99
107
 
100
- ##
101
- # @yieldparam [Hash] The scraped article hash
102
- # @return [Enumerator] Enumerator for the scraped articles
103
- def each
104
- return enum_for(:each) unless block_given?
108
+ def collect_candidate_containers
109
+ seen = {}.compare_by_identity
105
110
 
106
- SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
107
- parsed_body.css(selector).each do |selected_tag|
108
- article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
109
- article_hash = Extractor.new(article_tag, url: @url).call
111
+ CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
112
+ parsed_body.css(selector).each do |container|
113
+ next if container.path.match?(Html::TAGS_TO_IGNORE)
114
+ next if seen[container]
110
115
 
111
- yield article_hash if article_hash
116
+ seen[container] = true
117
+ containers << container
112
118
  end
113
119
  end
114
120
  end
@@ -0,0 +1,261 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class WordpressApi
7
+ ##
8
+ # Determines whether a WordPress page can safely be mapped to a posts query.
9
+ class PageScope
10
+ CATEGORY_SEGMENT = 'category'
11
+ TAG_SEGMENT = 'tag'
12
+ AUTHOR_SEGMENT = 'author'
13
+
14
+ ##
15
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
16
+ # @param url [Html2rss::Url] canonical page URL
17
+ # @return [PageScope] derived page scope
18
+ def self.from(parsed_body:, url:)
19
+ Resolver.new(parsed_body:, url:).call
20
+ end
21
+
22
+ ##
23
+ # @param query [Hash<String, String>] scoped query params for the posts endpoint
24
+ # @param fetchable [Boolean] whether a posts follow-up is safe for this page
25
+ # @param reason [Symbol] classification of the resolved page scope
26
+ def initialize(query:, fetchable:, reason:)
27
+ @query = query.freeze
28
+ @fetchable = fetchable
29
+ @reason = reason
30
+ freeze
31
+ end
32
+
33
+ ##
34
+ # @return [Hash<String, String>] query params to apply to the posts request
35
+ attr_reader :query
36
+
37
+ ##
38
+ # @return [Boolean] whether the page may safely use the posts API follow-up
39
+ def fetchable?
40
+ @fetchable
41
+ end
42
+
43
+ ##
44
+ # @return [Symbol] classification of the resolved page scope
45
+ attr_reader :reason
46
+
47
+ ##
48
+ # Resolves the page scope from page markup and canonical URL signals.
49
+ class Resolver # rubocop:disable Metrics/ClassLength
50
+ ##
51
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
52
+ # @param url [Html2rss::Url] canonical page URL
53
+ def initialize(parsed_body:, url:)
54
+ @parsed_body = parsed_body
55
+ @url = Html2rss::Url.from_absolute(url)
56
+ end
57
+
58
+ ##
59
+ # @return [PageScope] derived page scope
60
+ def call
61
+ category_scope ||
62
+ tag_scope ||
63
+ author_scope ||
64
+ date_scope ||
65
+ fallback_scope
66
+ end
67
+
68
+ private
69
+
70
+ attr_reader :parsed_body, :url
71
+
72
+ def category_scope
73
+ return unless category_archive?
74
+
75
+ scoped_scope('categories' => archive_id('category'))
76
+ end
77
+
78
+ def tag_scope
79
+ return unless tag_archive?
80
+
81
+ scoped_scope('tags' => archive_id('tag'))
82
+ end
83
+
84
+ def author_scope
85
+ return unless author_archive?
86
+
87
+ scoped_scope('author' => archive_id('author'))
88
+ end
89
+
90
+ def date_scope
91
+ return unless date_archive?
92
+
93
+ range = date_archive_range
94
+ return unknown_archive_scope unless range
95
+
96
+ PageScope.new(query: range, fetchable: true, reason: :archive)
97
+ end
98
+
99
+ def fallback_scope
100
+ return unknown_archive_scope if archive_like?
101
+ return non_archive_scope if singular_like?
102
+
103
+ PageScope.new(query: {}, fetchable: true, reason: :unscoped)
104
+ end
105
+
106
+ def scoped_scope(query)
107
+ return unknown_archive_scope if query.values.any?(&:nil?)
108
+
109
+ PageScope.new(query:, fetchable: true, reason: :archive)
110
+ end
111
+
112
+ def unknown_archive_scope
113
+ PageScope.new(query: {}, fetchable: false, reason: :unsupported_archive)
114
+ end
115
+
116
+ def non_archive_scope
117
+ PageScope.new(query: {}, fetchable: false, reason: :non_archive)
118
+ end
119
+
120
+ def category_archive?
121
+ body_classes.include?('category') || leading_path_segment == CATEGORY_SEGMENT
122
+ end
123
+
124
+ def tag_archive?
125
+ body_classes.include?('tag') || leading_path_segment == TAG_SEGMENT
126
+ end
127
+
128
+ def author_archive?
129
+ body_classes.include?('author') || leading_path_segment == AUTHOR_SEGMENT
130
+ end
131
+
132
+ def date_archive?
133
+ body_classes.include?('date') || date_archive_path?
134
+ end
135
+
136
+ def archive_like?
137
+ category_archive? || tag_archive? || author_archive? || date_archive? || body_classes.include?('archive')
138
+ end
139
+
140
+ def singular_like?
141
+ body_classes.intersect?(%w[page single singular attachment]) ||
142
+ body_classes.any? { _1.match?(/\A(?:page-id|postid)-\d+\z/) }
143
+ end
144
+
145
+ def body_classes
146
+ @body_classes ||= parsed_body.at_css('body')&.[]('class').to_s.split
147
+ end
148
+
149
+ def archive_id(prefix)
150
+ body_classes.filter_map do |klass|
151
+ klass[Regexp.new("^#{Regexp.escape(prefix)}-(\\d+)$"), 1]
152
+ end.first
153
+ end
154
+
155
+ def canonical_or_current_url
156
+ href = parsed_body.at_css(WordpressApi::CANONICAL_LINK_SELECTOR)&.[]('href').to_s.strip
157
+ return url if href.empty?
158
+
159
+ canonical_url = Html2rss::Url.from_relative(href, url)
160
+ same_origin_url?(canonical_url, url) ? canonical_url : url
161
+ rescue ArgumentError
162
+ url
163
+ end
164
+
165
+ def path_segments
166
+ @path_segments ||= canonical_or_current_url.path_segments
167
+ end
168
+
169
+ def leading_path_segment
170
+ path_segments.first
171
+ end
172
+
173
+ def date_archive_path?
174
+ !date_archive_segments.nil?
175
+ end
176
+
177
+ def date_archive_range
178
+ components = date_archive_components
179
+ return unless components
180
+
181
+ start_date = Date.new(*components.fetch(:start_date_parts))
182
+ {
183
+ 'after' => iso8601_start(start_date),
184
+ 'before' => iso8601_start(next_archive_boundary(start_date, components.fetch(:precision)))
185
+ }
186
+ rescue Date::Error
187
+ nil
188
+ end
189
+
190
+ def date_archive_components
191
+ segments = date_archive_segments
192
+ return unless segments
193
+
194
+ year = segments.fetch(0).to_i
195
+ month = parse_archive_segment(segments[1], 1, 12)
196
+ day = parse_archive_segment(segments[2], 1, 31)
197
+
198
+ {
199
+ start_date_parts: [year, month || 1, day || 1],
200
+ precision: archive_precision(month:, day:)
201
+ }
202
+ end
203
+
204
+ def date_archive_segments
205
+ year_index = path_segments.find_index { _1.match?(/\A\d{4}\z/) }
206
+ return unless year_index
207
+
208
+ segments = path_segments.drop(year_index)
209
+ return unless segments.length.between?(1, 3)
210
+ return unless archive_segment_shape?(segments)
211
+
212
+ segments
213
+ end
214
+
215
+ def archive_segment_shape?(segments)
216
+ month = segments[1]
217
+ day = segments[2]
218
+ return false if day && month.nil?
219
+ return false unless month.nil? || month.match?(/\A\d+\z/)
220
+ return false unless day.nil? || day.match?(/\A\d+\z/)
221
+
222
+ true
223
+ end
224
+
225
+ def same_origin_url?(left, right)
226
+ [left.scheme, left.host, left.port] == [right.scheme, right.host, right.port]
227
+ end
228
+
229
+ def archive_precision(month:, day:)
230
+ return :day if day
231
+ return :month if month
232
+
233
+ :year
234
+ end
235
+
236
+ def next_archive_boundary(start_date, precision)
237
+ {
238
+ year: start_date.next_year,
239
+ month: start_date.next_month,
240
+ day: start_date.next_day
241
+ }.fetch(precision)
242
+ end
243
+
244
+ def iso8601_start(date)
245
+ date.strftime('%Y-%m-%dT00:00:00Z')
246
+ end
247
+
248
+ def parse_archive_segment(value, minimum, maximum)
249
+ return nil unless value&.match?(/\A\d+\z/)
250
+
251
+ number = value.to_i
252
+ return nil if number < minimum || number > maximum
253
+
254
+ number
255
+ end
256
+ end
257
+ end
258
+ end
259
+ end
260
+ end
261
+ end