html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -11,6 +11,7 @@ module Html2rss
11
11
  #
12
12
  # @see https://schema.org/Thing
13
13
  class Thing
14
+ # Supported Schema.org `@type` values mapped to article extraction.
14
15
  SUPPORTED_TYPES = %w[
15
16
  AdvertiserContentArticle
16
17
  AnalysisNewsArticle
@@ -32,11 +33,14 @@ module Html2rss
32
33
  TechArticle
33
34
  ].to_set.freeze
34
35
 
35
- DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
36
+ # Attributes exposed by `#call` in generated article hashes.
37
+ DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
36
38
 
39
+ # @param schema_object [Hash{Symbol => Object}] parsed schema.org object
40
+ # @param url [String, Html2rss::Url, nil] base URL used for relative normalization
37
41
  def initialize(schema_object, url:)
38
42
  @schema_object = schema_object
39
- @url = url
43
+ @base_url = normalized_base_url(url)
40
44
  end
41
45
 
42
46
  # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
@@ -46,24 +50,27 @@ module Html2rss
46
50
  end
47
51
  end
48
52
 
53
+ # @return [String, nil] stable schema object identifier
49
54
  def id
50
55
  return @id if defined?(@id)
51
56
 
52
- id = (schema_object[:@id] || url&.path).to_s
57
+ id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
53
58
 
54
59
  return if id.empty?
55
60
 
56
61
  @id = id
57
62
  end
58
63
 
64
+ # @return [String, nil] article title
59
65
  def title = schema_object[:title]
60
66
 
67
+ # @return [String, nil] longest available description field
61
68
  def description
62
69
  schema_object.values_at(:description, :schema_object_body, :abstract)
63
70
  .max_by { |string| string.to_s.size }
64
71
  end
65
72
 
66
- # @return [Addressable::URI, nil] the URL of the schema object
73
+ # @return [Html2rss::Url, nil] the URL of the schema object
67
74
  def url
68
75
  url = schema_object[:url]
69
76
  if url.to_s.empty?
@@ -71,21 +78,29 @@ module Html2rss
71
78
  return
72
79
  end
73
80
 
74
- Utils.build_absolute_url_from_relative(url, @url)
81
+ Url.from_relative(url, base_url || url)
75
82
  end
76
83
 
84
+ # @return [Html2rss::Url, nil] normalized article image URL
77
85
  def image
78
86
  if (image_url = image_urls.first)
79
- Utils.build_absolute_url_from_relative(image_url, @url)
87
+ Url.from_relative(image_url, base_url || image_url)
80
88
  end
81
89
  end
82
90
 
91
+ # @return [String, nil] published-at timestamp string
83
92
  def published_at = schema_object[:datePublished]
84
93
 
85
- private
94
+ # @return [Array<String>, nil] extracted category labels
95
+ def categories
96
+ return @categories if defined?(@categories)
86
97
 
87
- attr_reader :schema_object
98
+ @categories = CategoryExtractor.call(schema_object)
99
+ end
100
+
101
+ attr_reader :schema_object, :base_url
88
102
 
103
+ # @return [Array<String>] normalized image URL candidates
89
104
  def image_urls
90
105
  schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
91
106
  next unless object
@@ -97,6 +112,52 @@ module Html2rss
97
112
  end
98
113
  end
99
114
  end
115
+
116
+ # @param value [String, Symbol, nil] candidate schema identifier
117
+ # @param reference_url [Html2rss::Url, nil] URL used for same-origin normalization
118
+ # @return [String, nil] normalized identifier value
119
+ def normalized_id(value, reference_url:)
120
+ text = value.to_s
121
+ return if text.empty?
122
+
123
+ normalized_url = normalized_id_url(text, reference_url:)
124
+ return text unless reference_url && normalized_url.host == reference_url.host
125
+
126
+ normalized_id_value(normalized_url)
127
+ rescue ArgumentError
128
+ text
129
+ end
130
+
131
+ # @param text [String] raw identifier text
132
+ # @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
133
+ # @return [Html2rss::Url] normalized identifier URL
134
+ def normalized_id_url(text, reference_url:)
135
+ if text.start_with?('/')
136
+ Url.from_relative(text, reference_url || text)
137
+ else
138
+ Url.from_absolute(text)
139
+ end
140
+ end
141
+
142
+ # @param url [Html2rss::Url] normalized identifier URL
143
+ # @return [String, nil] path/query portion used as stable ID
144
+ def normalized_id_value(url)
145
+ path = url.path.to_s
146
+ return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
147
+ return path unless path.empty?
148
+
149
+ url.query
150
+ end
151
+
152
+ # @param url [String, Html2rss::Url, nil] candidate page URL
153
+ # @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
154
+ def normalized_base_url(url)
155
+ return if url.to_s.strip.empty?
156
+
157
+ Url.from_absolute(url)
158
+ rescue ArgumentError
159
+ nil
160
+ end
100
161
  end
101
162
  end
102
163
  end
@@ -8,24 +8,31 @@ module Html2rss
8
8
  module Scraper
9
9
  ##
10
10
  # Scrapes articles from Schema.org objects, by looking for the objects in:
11
-
12
11
  # <script type="application/ld+json"> "schema" tags.
13
12
  #
14
- # See:
15
- # 1. https://schema.org/docs/full.html
16
- # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
13
+ # @see https://schema.org/docs/full.html
14
+ # @see https://developers.google.com/search/docs/appearance/structured-data/article#microdata
17
15
  class Schema
18
16
  include Enumerable
19
17
 
18
+ # Selector for JSON-LD script tags containing Schema.org objects.
20
19
  TAG_SELECTOR = 'script[type="application/ld+json"]'
21
20
 
21
+ # @return [Symbol] scraper config key
22
+ def self.options_key = :schema
23
+
22
24
  class << self
25
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
26
+ # @return [Boolean] whether the page includes supported schema types
23
27
  def articles?(parsed_body)
24
- parsed_body.css(TAG_SELECTOR).any? do |script|
25
- (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
26
- script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
27
- end
28
- end
28
+ parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
29
+ end
30
+
31
+ # @param script [Nokogiri::XML::Element] schema JSON-LD script tag
32
+ # @return [Boolean] whether the tag references a supported schema type
33
+ def supported_schema_type?(script)
34
+ supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
35
+ supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
29
36
  end
30
37
 
31
38
  ##
@@ -49,11 +56,14 @@ module Html2rss
49
56
  end
50
57
  end
51
58
 
59
+ # @param object [Hash{Symbol => Object}] schema candidate object
60
+ # @return [Boolean] whether an extractor exists for the candidate object
52
61
  def supported_schema_object?(object)
53
62
  scraper_for_schema_object(object) ? true : false
54
63
  end
55
64
 
56
65
  ##
66
+ # @param schema_object [Hash{Symbol => Object}] schema object with an @type key
57
67
  # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
58
68
  def scraper_for_schema_object(schema_object)
59
69
  type = schema_object[:@type]
@@ -63,7 +73,7 @@ module Html2rss
63
73
  elsif ItemList::SUPPORTED_TYPES.member?(type)
64
74
  ItemList
65
75
  else
66
- Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
76
+ Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
67
77
  nil
68
78
  end
69
79
  end
@@ -73,14 +83,19 @@ module Html2rss
73
83
  def parse_script_tag(script_tag)
74
84
  JSON.parse(script_tag.text, symbolize_names: true)
75
85
  rescue JSON::ParserError => error
76
- Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
86
+ Log.warn("#{name}: failed to parse JSON", error: error.message)
77
87
  []
78
88
  end
79
89
  end
80
90
 
81
- def initialize(parsed_body, url:)
91
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
92
+ # @param url [String, Html2rss::Url] base page URL
93
+ # @param opts [Hash] scraper-specific options
94
+ # @option opts [Object] :_reserved reserved for future scraper-specific options
95
+ def initialize(parsed_body, url:, **opts)
82
96
  @parsed_body = parsed_body
83
97
  @url = url
98
+ @opts = opts
84
99
  end
85
100
 
86
101
  ##
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class SemanticHtml
7
+ ##
8
+ # Selects the best content-like anchor from a semantic container.
9
+ #
10
+ # The selector turns raw DOM anchors into ranked facts so semantic
11
+ # scraping can reason about link intent instead of DOM order. It favors
12
+ # heading-aligned article links and suppresses utility links, duplicate
13
+ # destinations, and weak textless affordances.
14
+ class AnchorSelector # rubocop:disable Metrics/ClassLength
15
+ AnchorFacts = Data.define(
16
+ :anchor,
17
+ :text,
18
+ :url,
19
+ :destination,
20
+ :segments,
21
+ :meaningful_text,
22
+ :content_like_destination,
23
+ :heading_anchor,
24
+ :heading_text_match,
25
+ :score
26
+ )
27
+
28
+ # Comma-separated heading selector used for heading/anchor matching.
29
+ HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
30
+ # Path segments that usually represent utility navigation rather than article content.
31
+ UTILITY_PATH_SEGMENTS = %w[
32
+ about account author category comment comments contact feedback help
33
+ login newsletter profile register search settings share signup subscribe
34
+ topic topics view-all archive archives
35
+ feed feeds
36
+ recommended
37
+ for-you
38
+ preference preferences
39
+ notification notifications
40
+ privacy terms
41
+ cookie cookies
42
+ logout
43
+ user users
44
+ ].to_set.freeze
45
+ # Path segments that signal content-like destinations.
46
+ CONTENT_PATH_SEGMENTS = %w[
47
+ article articles news post posts story stories update updates
48
+ ].to_set.freeze
49
+ # Ancestor tags that usually indicate navigation/utility regions.
50
+ UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
51
+
52
+ # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
53
+ def initialize(base_url)
54
+ @base_url = base_url
55
+ end
56
+
57
+ ##
58
+ # Chooses the single anchor that best represents the story contained
59
+ # in a semantic block.
60
+ #
61
+ # Ranking is scoped to one container at a time. That keeps the logic
62
+ # local, makes duplicate links to the same destination collapse into
63
+ # one candidate, and avoids page-wide heuristics leaking across cards.
64
+ #
65
+ # @param container [Nokogiri::XML::Element] semantic container being evaluated
66
+ # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
67
+ def primary_anchor_for(container)
68
+ facts_for(container).max_by(&:score)&.anchor
69
+ end
70
+
71
+ private
72
+
73
+ attr_reader :base_url
74
+
75
+ def facts_for(container)
76
+ heading = heading_for(container)
77
+ heading_text = visible_text(heading)
78
+
79
+ container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
80
+ next if anchor.path.match?(Html::TAGS_TO_IGNORE)
81
+
82
+ facts = build_facts(anchor, heading, heading_text)
83
+ next unless facts
84
+
85
+ keep_stronger_fact(best_by_destination, facts)
86
+ end.values
87
+ end
88
+
89
+ def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
90
+ text = visible_text(anchor)
91
+ meaningful_text = meaningful_text?(text)
92
+ ancestors = anchor.ancestors.to_a
93
+ url = normalized_destination(anchor)
94
+ return unless url
95
+
96
+ segments = url.path_segments
97
+ content_like_destination = content_like_destination?(segments)
98
+ return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
99
+
100
+ heading_anchor = heading_anchor?(ancestors, heading)
101
+ heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
102
+ return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
103
+
104
+ AnchorFacts.new(
105
+ anchor:,
106
+ text:,
107
+ url:,
108
+ destination: url.to_s,
109
+ segments:,
110
+ meaningful_text:,
111
+ content_like_destination:,
112
+ heading_anchor:,
113
+ heading_text_match:,
114
+ score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
115
+ )
116
+ end
117
+
118
+ def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
119
+ utility_destination?(segments) ||
120
+ utility_text?(text) ||
121
+ icon_only_anchor?(anchor, meaningful_text) ||
122
+ utility_landmark_anchor?(ancestors)
123
+ end
124
+
125
+ def keep_stronger_fact(best_by_destination, facts)
126
+ current = best_by_destination[facts.destination]
127
+ return best_by_destination[facts.destination] = facts unless current
128
+ return if current.score >= facts.score
129
+
130
+ best_by_destination[facts.destination] = facts
131
+ end
132
+
133
+ def content_like_anchor?(meaningful_text, content_like_destination)
134
+ meaningful_text || content_like_destination
135
+ end
136
+
137
+ def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
138
+ score = 0
139
+ score += 100 if heading_anchor
140
+ score += 20 if heading_text_match
141
+ score += 10 if meaningful_text
142
+ score += 10 if content_like_destination
143
+ score
144
+ end
145
+
146
+ def heading_anchor?(ancestors, heading)
147
+ heading && ancestors.include?(heading)
148
+ end
149
+
150
+ def heading_text_match?(heading_text, text, meaningful_text)
151
+ meaningful_text && meaningful_text?(heading_text) && heading_text == text
152
+ end
153
+
154
+ def heading_for(container)
155
+ container.at_css(HEADING_SELECTOR)
156
+ end
157
+
158
+ def icon_only_anchor?(anchor, meaningful_text)
159
+ !meaningful_text && anchor.at_css('img, svg')
160
+ end
161
+
162
+ def utility_destination?(segments)
163
+ segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
164
+ end
165
+
166
+ def content_like_destination?(segments)
167
+ segments.any? do |segment|
168
+ CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
169
+ end
170
+ end
171
+
172
+ def normalized_destination(anchor)
173
+ href = anchor['href'].to_s.split('#').first.to_s.strip
174
+ return if href.empty?
175
+
176
+ Html2rss::Url.from_relative(href, base_url)
177
+ rescue ArgumentError
178
+ nil
179
+ end
180
+
181
+ def meaningful_text?(text)
182
+ text.scan(/\p{Alnum}+/).any?
183
+ end
184
+
185
+ def utility_text?(text)
186
+ text.match?(
187
+ /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
188
+ )
189
+ end
190
+
191
+ def utility_landmark_anchor?(ancestors)
192
+ ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
193
+ end
194
+
195
+ def visible_text(node)
196
+ return '' unless node
197
+
198
+ HtmlExtractor.extract_visible_text(node).to_s.strip
199
+ end
200
+ end
201
+ end
202
+ end
203
+ end
204
+ end
@@ -1,115 +1,124 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'addressable'
4
- require 'parallel'
3
+ require_relative 'semantic_html/anchor_selector'
5
4
 
6
5
  module Html2rss
7
6
  class AutoSource
8
7
  module Scraper
9
8
  ##
10
- # Scrapes articles by looking for common markup tags (article, section, li)
11
- # containing an <a href> tag.
9
+ # Scrapes semantic containers by choosing one primary content link per
10
+ # block before extraction.
12
11
  #
13
- # See:
14
- # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
12
+ # This scraper is intentionally container-first:
13
+ # 1. collect candidate semantic containers once
14
+ # 2. select the strongest content-like anchor within each container
15
+ # 3. extract fields from the container while honoring that anchor choice
16
+ #
17
+ # The result is lower recall on weak-signal blocks, but much better link
18
+ # quality on modern teaser cards that mix headlines, utility links, and
19
+ # duplicate image overlays.
15
20
  class SemanticHtml
16
21
  include Enumerable
17
22
 
23
+ # Container plus selected anchor chosen for extraction.
24
+ Entry = Data.define(:container, :selected_anchor)
25
+
26
+ # Candidate semantic container selectors used to locate extractable blocks.
27
+ CONTAINER_SELECTORS = [
28
+ 'article:not(:has(article))',
29
+ 'section:not(:has(section))',
30
+ 'li:not(:has(li))',
31
+ 'tr:not(:has(tr))',
32
+ 'div:not(:has(div))'
33
+ ].freeze
34
+
18
35
  ##
19
- # Map of parent element names to CSS selectors for finding <a href> tags.
20
- ANCHOR_TAG_SELECTORS = {
21
- 'section' => ['section :not(section) a[href]'],
22
- 'tr' => ['table tr :not(tr) a[href]'],
23
- 'article' => [
24
- 'article :not(article) a[href]',
25
- 'article a[href]'
26
- ],
27
- 'li' => [
28
- 'ul > li :not(li) a[href]',
29
- 'ol > li :not(li) a[href]'
30
- ]
31
- }.freeze
32
-
33
- # Check if the parsed_body contains articles
34
- # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
35
- # @return [Boolean] True if articles are found, otherwise false.
36
+ # @return [Symbol] config key used to enable or configure this scraper
37
+ def self.options_key = :semantic_html
38
+
39
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
40
+ # @return [Boolean] true when at least one semantic container has an eligible anchor
36
41
  def self.articles?(parsed_body)
37
42
  return false unless parsed_body
38
43
 
39
- ANCHOR_TAG_SELECTORS.each_value do |selectors|
40
- return true if selectors.any? { |selector| parsed_body.at_css(selector) }
41
- end
42
- false
44
+ new(parsed_body, url: 'https://example.com').extractable?
45
+ end
46
+
47
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
48
+ # @param url [String, Html2rss::Url] base url
49
+ # @param extractor [Class] extractor class used for article extraction
50
+ # @param _opts [Hash] scraper-specific options
51
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
52
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
53
+ @parsed_body = parsed_body
54
+ @url = url
55
+ @extractor = extractor
56
+ @anchor_selector = AnchorSelector.new(url)
43
57
  end
44
58
 
45
- # Finds the closest ancestor tag matching the specified tag name
46
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
47
- # @param tag_name [String] The tag name to search for
48
- # @param stop_tag [String] The tag name to stop searching at
49
- # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
50
- def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
51
- return current_tag if current_tag.name == tag_name
59
+ attr_reader :parsed_body
52
60
 
53
- stop_tags = Set[tag_name, stop_tag]
61
+ ##
62
+ # Yields extracted article hashes for each semantic container that
63
+ # survives anchor selection.
64
+ #
65
+ # Detection and extraction share the same memoized entry list so this
66
+ # scraper does not rerun anchor ranking once a page has already been
67
+ # accepted as extractable.
68
+ #
69
+ # @yieldparam article_hash [Hash] extracted article hash
70
+ # @return [Enumerator<Hash>]
71
+ def each
72
+ return enum_for(:each) unless block_given?
54
73
 
55
- while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
56
- current_tag = current_tag.parent
74
+ extractable_entries.each do |entry|
75
+ article_hash = @extractor.new(
76
+ entry.container,
77
+ base_url: @url,
78
+ selected_anchor: entry.selected_anchor
79
+ ).call
80
+ yield article_hash if article_hash
57
81
  end
58
-
59
- current_tag
60
82
  end
61
83
 
62
- # Finds the closest matching selector upwards in the DOM tree
63
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
64
- # @param selector [String] The CSS selector to search for
65
- # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
66
- def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
67
- current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
84
+ ##
85
+ # Reports whether the page contains at least one semantic container with
86
+ # a selectable primary anchor.
87
+ #
88
+ # @return [Boolean] true when at least one candidate container yields a primary anchor
89
+ def extractable?
90
+ extractable_entries.any?
68
91
  end
69
92
 
70
- # Helper method to find a matching selector upwards
71
- # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
72
- # @param selector [String] The CSS selector to search for
73
- # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
74
- def self.find_closest_selector_upwards(current_tag, selector:)
75
- while current_tag
76
- found = current_tag.at_css(selector)
77
- return found if found
78
-
79
- return nil unless current_tag.respond_to?(:parent)
93
+ protected
80
94
 
81
- current_tag = current_tag.parent
82
- end
95
+ def candidate_containers
96
+ @candidate_containers ||= collect_candidate_containers
83
97
  end
84
98
 
85
- # Returns an array of [tag_name, selector] pairs
86
- # @return [Array<[String, String]>] Array of tag name and selector pairs
87
- def self.anchor_tag_selector_pairs
88
- ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
89
- selectors.map { |selector| [tag_name, selector] }
90
- end
99
+ def primary_anchor_for(container)
100
+ @anchor_selector.primary_anchor_for(container)
91
101
  end
92
102
 
93
- def initialize(parsed_body, url:)
94
- @parsed_body = parsed_body
95
- @url = url
96
- end
103
+ def extractable_entries
104
+ @extractable_entries ||= candidate_containers.filter_map do |container|
105
+ selected_anchor = primary_anchor_for(container)
106
+ next unless selected_anchor
97
107
 
98
- attr_reader :parsed_body
108
+ Entry.new(container:, selected_anchor:)
109
+ end
110
+ end
99
111
 
100
- ##
101
- # @yieldparam [Hash] The scraped article hash
102
- # @return [Enumerator] Enumerator for the scraped articles
103
- def each
104
- return enum_for(:each) unless block_given?
112
+ def collect_candidate_containers
113
+ seen = {}.compare_by_identity
105
114
 
106
- SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
107
- parsed_body.css(selector).each do |selected_tag|
108
- article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
115
+ CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
116
+ parsed_body.css(selector).each do |container|
117
+ next if container.path.match?(Html::TAGS_TO_IGNORE)
118
+ next if seen[container]
109
119
 
110
- if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
111
- yield article_hash
112
- end
120
+ seen[container] = true
121
+ containers << container
113
122
  end
114
123
  end
115
124
  end