html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class AutoSource
    module Scraper
      class SemanticHtml
        ##
        # Selects the best content-like anchor from a semantic container.
        #
        # The selector turns raw DOM anchors into ranked facts so semantic
        # scraping can reason about link intent instead of DOM order. It favors
        # heading-aligned article links and suppresses utility links, duplicate
        # destinations, and weak textless affordances.
        class AnchorSelector # rubocop:disable Metrics/ClassLength
          # Immutable record of everything learned about one candidate anchor.
          # `destination` is the string form of `url` and is the key used to
          # collapse several anchors that point at the same place.
          AnchorFacts = Data.define(
            :anchor,
            :text,
            :url,
            :destination,
            :segments,
            :meaningful_text,
            :content_like_destination,
            :heading_anchor,
            :heading_text_match,
            :score
          )

          HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze

          # Path segments that mark a destination as navigation/utility rather
          # than content. A single matching segment disqualifies the anchor.
          UTILITY_PATH_SEGMENTS = %w[
            about account author category comment comments contact feedback help
            login newsletter profile register search settings share signup subscribe
            topic topics view-all archive archives
            feed feeds
            recommended
            for-you
            preference preferences
            notification notifications
            privacy terms
            cookie cookies
            logout
            user users
          ].to_set.freeze

          # Path segments that hint at an article-like destination.
          CONTENT_PATH_SEGMENTS = %w[
            article articles news post posts story stories update updates
          ].to_set.freeze

          # Ancestor tag names whose anchors are never considered content.
          UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze

          # @param base_url [String, Html2rss::Url] base used to resolve relative hrefs
          def initialize(base_url)
            @base_url = base_url
          end

          ##
          # Chooses the single anchor that best represents the story contained
          # in a semantic block.
          #
          # Ranking is scoped to one container at a time. That keeps the logic
          # local, makes duplicate links to the same destination collapse into
          # one candidate, and avoids page-wide heuristics leaking across cards.
          #
          # @param container [Nokogiri::XML::Element] semantic container being evaluated
          # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
          def primary_anchor_for(container)
            facts_for(container).max_by(&:score)&.anchor
          end

          private

          attr_reader :base_url

          # Builds one AnchorFacts per distinct destination found in the container.
          #
          # @param container [Nokogiri::XML::Element]
          # @return [Array<AnchorFacts>] strongest fact for each destination
          def facts_for(container)
            heading = heading_for(container)
            heading_text = visible_text(heading)

            container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
              # Anchors whose node path matches the ignore pattern are skipped outright.
              next if anchor.path.match?(Html::TAGS_TO_IGNORE)

              facts = build_facts(anchor, heading, heading_text)
              next unless facts

              keep_stronger_fact(best_by_destination, facts)
            end.values
          end

          # Derives the facts for a single anchor, or nil when the anchor has no
          # usable destination, is a utility link, or shows no content signal.
          #
          # @param anchor [Nokogiri::XML::Element]
          # @param heading [Nokogiri::XML::Element, nil] first heading of the container
          # @param heading_text [String] visible text of that heading ('' when absent)
          # @return [AnchorFacts, nil]
          def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
            # Resolve the destination first: without one, nothing else is needed.
            url = normalized_destination(anchor)
            return unless url

            text = visible_text(anchor)
            meaningful_text = meaningful_text?(text)
            ancestors = anchor.ancestors.to_a
            segments = url.path_segments
            content_like_destination = content_like_destination?(segments)
            return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)

            heading_anchor = heading_anchor?(ancestors, heading)
            heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
            return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)

            AnchorFacts.new(
              anchor:,
              text:,
              url:,
              destination: url.to_s,
              segments:,
              meaningful_text:,
              content_like_destination:,
              heading_anchor:,
              heading_text_match:,
              score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
            )
          end

          # @return [Boolean] true when any utility heuristic rejects the anchor
          def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
            utility_destination?(segments) ||
              utility_text?(text) ||
              icon_only_anchor?(anchor, meaningful_text) ||
              utility_landmark_anchor?(ancestors)
          end

          # Stores `facts` unless an equally or higher scored fact already exists
          # for the same destination (on a tie the earlier anchor is kept).
          #
          # @param best_by_destination [Hash{String => AnchorFacts}] mutated in place
          # @param facts [AnchorFacts]
          def keep_stronger_fact(best_by_destination, facts)
            current = best_by_destination[facts.destination]
            return best_by_destination[facts.destination] = facts unless current
            return if current.score >= facts.score

            best_by_destination[facts.destination] = facts
          end

          # @return [Boolean] true when the anchor has readable text or a content-like path
          def content_like_anchor?(meaningful_text, content_like_destination)
            meaningful_text || content_like_destination
          end

          # Sums the ranking weights. Sitting inside the heading dominates
          # (100); the remaining signals only break ties between such anchors
          # or rank anchors outside the heading.
          #
          # @return [Integer]
          def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
            score = 0
            score += 100 if heading_anchor
            score += 20 if heading_text_match
            score += 10 if meaningful_text
            score += 10 if content_like_destination
            score
          end

          # @param ancestors [Array<Nokogiri::XML::Node>] ancestors of the anchor
          # @param heading [Nokogiri::XML::Element, nil]
          # @return [Boolean] true when the anchor is nested inside the heading
          def heading_anchor?(ancestors, heading)
            # Strict boolean: the result is stored on AnchorFacts.
            !heading.nil? && ancestors.include?(heading)
          end

          # @return [Boolean] true when anchor text and heading text are both
          #   meaningful and exactly equal
          def heading_text_match?(heading_text, text, meaningful_text)
            meaningful_text && meaningful_text?(heading_text) && heading_text == text
          end

          # @return [Nokogiri::XML::Element, nil] first heading element in the container
          def heading_for(container)
            container.at_css(HEADING_SELECTOR)
          end

          # @return [Boolean] true when the anchor has no readable text but wraps an image or svg
          def icon_only_anchor?(anchor, meaningful_text)
            !meaningful_text && !anchor.at_css('img, svg').nil?
          end

          # A destination with no path segments at all, or with any known
          # utility segment, is treated as navigation.
          #
          # @param segments [Array<String>]
          # @return [Boolean]
          def utility_destination?(segments)
            segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
          end

          # @param segments [Array<String>]
          # @return [Boolean] true when a segment is a known content word or
          #   starts with a digit (ids, dates, numeric slugs)
          def content_like_destination?(segments)
            segments.any? do |segment|
              CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
            end
          end

          # Resolves the anchor's href against the base URL, ignoring any
          # fragment. Returns nil for blank hrefs and when resolution raises
          # ArgumentError.
          #
          # @param anchor [Nokogiri::XML::Element]
          # @return [Html2rss::Url, nil]
          def normalized_destination(anchor)
            href = anchor['href'].to_s.split('#').first.to_s.strip
            return if href.empty?

            Html2rss::Url.from_relative(href, base_url)
          rescue ArgumentError
            nil
          end

          # @param text [String]
          # @return [Boolean] true when the text holds at least one letter or digit
          def meaningful_text?(text)
            # match? stops at the first hit and allocates nothing.
            text.match?(/\p{Alnum}/)
          end

          # NOTE(review): this is a prefix match, so a headline that merely
          # starts with one of these words (e.g. "Share prices ...") is also
          # treated as utility text - confirm that is intended.
          #
          # @param text [String]
          # @return [Boolean]
          def utility_text?(text)
            text.match?(
              /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
            )
          end

          # @param ancestors [Array<Nokogiri::XML::Node>]
          # @return [Boolean] true when the anchor sits inside nav/aside/footer/menu
          def utility_landmark_anchor?(ancestors)
            ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
          end

          # @param node [Nokogiri::XML::Node, nil]
          # @return [String] stripped visible text, '' when the node is nil
          def visible_text(node)
            return '' unless node

            HtmlExtractor.extract_visible_text(node).to_s.strip
          end
        end
      end
    end
  end
end
|
|
@@ -1,114 +1,120 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
require 'parallel'
|
|
3
|
+
require_relative 'semantic_html/anchor_selector'
|
|
5
4
|
|
|
6
5
|
module Html2rss
|
|
7
6
|
class AutoSource
|
|
8
7
|
module Scraper
|
|
9
8
|
##
|
|
10
|
-
# Scrapes
|
|
11
|
-
#
|
|
9
|
+
# Scrapes semantic containers by choosing one primary content link per
|
|
10
|
+
# block before extraction.
|
|
12
11
|
#
|
|
13
|
-
#
|
|
14
|
-
# 1.
|
|
12
|
+
# This scraper is intentionally container-first:
|
|
13
|
+
# 1. collect candidate semantic containers once
|
|
14
|
+
# 2. select the strongest content-like anchor within each container
|
|
15
|
+
# 3. extract fields from the container while honoring that anchor choice
|
|
16
|
+
#
|
|
17
|
+
# The result is lower recall on weak-signal blocks, but much better link
|
|
18
|
+
# quality on modern teaser cards that mix headlines, utility links, and
|
|
19
|
+
# duplicate image overlays.
|
|
15
20
|
class SemanticHtml
|
|
16
21
|
include Enumerable
|
|
17
22
|
|
|
23
|
+
Entry = Data.define(:container, :selected_anchor)
|
|
24
|
+
|
|
25
|
+
CONTAINER_SELECTORS = [
|
|
26
|
+
'article:not(:has(article))',
|
|
27
|
+
'section:not(:has(section))',
|
|
28
|
+
'li:not(:has(li))',
|
|
29
|
+
'tr:not(:has(tr))',
|
|
30
|
+
'div:not(:has(div))'
|
|
31
|
+
].freeze
|
|
32
|
+
|
|
18
33
|
##
|
|
19
|
-
#
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
'article :not(article) a[href]',
|
|
25
|
-
'article a[href]'
|
|
26
|
-
],
|
|
27
|
-
'li' => [
|
|
28
|
-
'ul > li :not(li) a[href]',
|
|
29
|
-
'ol > li :not(li) a[href]'
|
|
30
|
-
]
|
|
31
|
-
}.freeze
|
|
32
|
-
|
|
33
|
-
# Check if the parsed_body contains articles
|
|
34
|
-
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
|
|
35
|
-
# @return [Boolean] True if articles are found, otherwise false.
|
|
34
|
+
# @return [Symbol] config key used to enable or configure this scraper
|
|
35
|
+
def self.options_key = :semantic_html
|
|
36
|
+
|
|
37
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
38
|
+
# @return [Boolean] true when at least one semantic container has an eligible anchor
|
|
36
39
|
def self.articles?(parsed_body)
|
|
37
40
|
return false unless parsed_body
|
|
38
41
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
42
|
+
new(parsed_body, url: 'https://example.com').extractable?
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
46
|
+
# @param url [String, Html2rss::Url] base url
|
|
47
|
+
# @param extractor [Class] extractor class used for article extraction
|
|
48
|
+
def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
|
|
49
|
+
@parsed_body = parsed_body
|
|
50
|
+
@url = url
|
|
51
|
+
@extractor = extractor
|
|
52
|
+
@anchor_selector = AnchorSelector.new(url)
|
|
43
53
|
end
|
|
44
54
|
|
|
45
|
-
|
|
46
|
-
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
|
|
47
|
-
# @param tag_name [String] The tag name to search for
|
|
48
|
-
# @param stop_tag [String] The tag name to stop searching at
|
|
49
|
-
# @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
|
|
50
|
-
def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
|
|
51
|
-
return current_tag if current_tag.name == tag_name
|
|
55
|
+
attr_reader :parsed_body
|
|
52
56
|
|
|
53
|
-
|
|
57
|
+
##
|
|
58
|
+
# Yields extracted article hashes for each semantic container that
|
|
59
|
+
# survives anchor selection.
|
|
60
|
+
#
|
|
61
|
+
# Detection and extraction share the same memoized entry list so this
|
|
62
|
+
# scraper does not rerun anchor ranking once a page has already been
|
|
63
|
+
# accepted as extractable.
|
|
64
|
+
#
|
|
65
|
+
# @yieldparam article_hash [Hash] extracted article hash
|
|
66
|
+
# @return [Enumerator<Hash>]
|
|
67
|
+
def each
|
|
68
|
+
return enum_for(:each) unless block_given?
|
|
54
69
|
|
|
55
|
-
|
|
56
|
-
|
|
70
|
+
extractable_entries.each do |entry|
|
|
71
|
+
article_hash = @extractor.new(
|
|
72
|
+
entry.container,
|
|
73
|
+
base_url: @url,
|
|
74
|
+
selected_anchor: entry.selected_anchor
|
|
75
|
+
).call
|
|
76
|
+
yield article_hash if article_hash
|
|
57
77
|
end
|
|
58
|
-
|
|
59
|
-
current_tag
|
|
60
78
|
end
|
|
61
79
|
|
|
62
|
-
|
|
63
|
-
#
|
|
64
|
-
#
|
|
65
|
-
#
|
|
66
|
-
|
|
67
|
-
|
|
80
|
+
##
|
|
81
|
+
# Reports whether the page contains at least one semantic container with
|
|
82
|
+
# a selectable primary anchor.
|
|
83
|
+
#
|
|
84
|
+
# @return [Boolean] true when at least one candidate container yields a primary anchor
|
|
85
|
+
def extractable?
|
|
86
|
+
extractable_entries.any?
|
|
68
87
|
end
|
|
69
88
|
|
|
70
|
-
|
|
71
|
-
# @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
|
|
72
|
-
# @param selector [String] The CSS selector to search for
|
|
73
|
-
# @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
|
|
74
|
-
def self.find_closest_selector_upwards(current_tag, selector:)
|
|
75
|
-
while current_tag
|
|
76
|
-
found = current_tag.at_css(selector)
|
|
77
|
-
return found if found
|
|
78
|
-
|
|
79
|
-
return nil unless current_tag.respond_to?(:parent)
|
|
89
|
+
protected
|
|
80
90
|
|
|
81
|
-
|
|
82
|
-
|
|
91
|
+
def candidate_containers
|
|
92
|
+
@candidate_containers ||= collect_candidate_containers
|
|
83
93
|
end
|
|
84
94
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def self.anchor_tag_selector_pairs
|
|
88
|
-
ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
|
|
89
|
-
selectors.map { |selector| [tag_name, selector] }
|
|
90
|
-
end
|
|
95
|
+
def primary_anchor_for(container)
|
|
96
|
+
@anchor_selector.primary_anchor_for(container)
|
|
91
97
|
end
|
|
92
98
|
|
|
93
|
-
def
|
|
94
|
-
@
|
|
95
|
-
|
|
96
|
-
|
|
99
|
+
def extractable_entries
|
|
100
|
+
@extractable_entries ||= candidate_containers.filter_map do |container|
|
|
101
|
+
selected_anchor = primary_anchor_for(container)
|
|
102
|
+
next unless selected_anchor
|
|
97
103
|
|
|
98
|
-
|
|
104
|
+
Entry.new(container:, selected_anchor:)
|
|
105
|
+
end
|
|
106
|
+
end
|
|
99
107
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
# @return [Enumerator] Enumerator for the scraped articles
|
|
103
|
-
def each
|
|
104
|
-
return enum_for(:each) unless block_given?
|
|
108
|
+
def collect_candidate_containers
|
|
109
|
+
seen = {}.compare_by_identity
|
|
105
110
|
|
|
106
|
-
|
|
107
|
-
parsed_body.css(selector).each do |
|
|
108
|
-
|
|
109
|
-
|
|
111
|
+
CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
|
|
112
|
+
parsed_body.css(selector).each do |container|
|
|
113
|
+
next if container.path.match?(Html::TAGS_TO_IGNORE)
|
|
114
|
+
next if seen[container]
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
seen[container] = true
|
|
117
|
+
containers << container
|
|
112
118
|
end
|
|
113
119
|
end
|
|
114
120
|
end
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class AutoSource
    module Scraper
      class WordpressApi
        ##
        # Determines whether a WordPress page can safely be mapped to a posts query.
        class PageScope
          CATEGORY_SEGMENT = 'category'
          TAG_SEGMENT = 'tag'
          AUTHOR_SEGMENT = 'author'

          ##
          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
          # @param url [Html2rss::Url] canonical page URL
          # @return [PageScope] derived page scope
          def self.from(parsed_body:, url:)
            Resolver.new(parsed_body:, url:).call
          end

          ##
          # The instance, and the given query hash itself, are frozen.
          #
          # @param query [Hash<String, String>] scoped query params for the posts endpoint
          # @param fetchable [Boolean] whether a posts follow-up is safe for this page
          # @param reason [Symbol] classification of the resolved page scope
          def initialize(query:, fetchable:, reason:)
            @query = query.freeze
            @fetchable = fetchable
            @reason = reason
            freeze
          end

          ##
          # @return [Hash<String, String>] query params to apply to the posts request
          attr_reader :query

          ##
          # @return [Boolean] whether the page may safely use the posts API follow-up
          def fetchable?
            @fetchable
          end

          ##
          # @return [Symbol] classification of the resolved page scope
          attr_reader :reason

          ##
          # Resolves the page scope from page markup and canonical URL signals.
          class Resolver # rubocop:disable Metrics/ClassLength
            ##
            # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
            # @param url [Html2rss::Url] canonical page URL
            def initialize(parsed_body:, url:)
              @parsed_body = parsed_body
              @url = Html2rss::Url.from_absolute(url)
            end

            ##
            # Tries the archive kinds in a fixed order (category, tag, author,
            # date) and falls back to a generic classification.
            #
            # @return [PageScope] derived page scope
            def call
              category_scope ||
                tag_scope ||
                author_scope ||
                date_scope ||
                fallback_scope
            end

            private

            attr_reader :parsed_body, :url

            # @return [PageScope, nil] nil when the page is not a category archive
            def category_scope
              return unless category_archive?

              scoped_scope('categories' => archive_id('category'))
            end

            # @return [PageScope, nil] nil when the page is not a tag archive
            def tag_scope
              return unless tag_archive?

              scoped_scope('tags' => archive_id('tag'))
            end

            # @return [PageScope, nil] nil when the page is not an author archive
            def author_scope
              return unless author_archive?

              scoped_scope('author' => archive_id('author'))
            end

            # @return [PageScope, nil] nil when the page is not a date archive;
            #   an unsupported-archive scope when no valid date range can be derived
            def date_scope
              return unless date_archive?

              range = date_archive_range
              return unknown_archive_scope unless range

              PageScope.new(query: range, fetchable: true, reason: :archive)
            end

            # @return [PageScope] classification used when no specific archive kind matched
            def fallback_scope
              return unknown_archive_scope if archive_like?
              return non_archive_scope if singular_like?

              PageScope.new(query: {}, fetchable: true, reason: :unscoped)
            end

            # An archive whose numeric id could not be read from the body
            # classes cannot be queried safely.
            #
            # @param query [Hash<String, String, nil>]
            # @return [PageScope]
            def scoped_scope(query)
              return unknown_archive_scope if query.values.any?(&:nil?)

              PageScope.new(query:, fetchable: true, reason: :archive)
            end

            def unknown_archive_scope
              PageScope.new(query: {}, fetchable: false, reason: :unsupported_archive)
            end

            def non_archive_scope
              PageScope.new(query: {}, fetchable: false, reason: :non_archive)
            end

            def category_archive?
              body_classes.include?('category') || leading_path_segment == CATEGORY_SEGMENT
            end

            def tag_archive?
              body_classes.include?('tag') || leading_path_segment == TAG_SEGMENT
            end

            def author_archive?
              body_classes.include?('author') || leading_path_segment == AUTHOR_SEGMENT
            end

            def date_archive?
              body_classes.include?('date') || date_archive_path?
            end

            def archive_like?
              category_archive? || tag_archive? || author_archive? || date_archive? || body_classes.include?('archive')
            end

            def singular_like?
              body_classes.intersect?(%w[page single singular attachment]) ||
                body_classes.any? { _1.match?(/\A(?:page-id|postid)-\d+\z/) }
            end

            # @return [Array<String>] whitespace-separated class names of <body> ([] when absent)
            def body_classes
              @body_classes ||= parsed_body.at_css('body')&.[]('class').to_s.split
            end

            # Reads the numeric id from a body class shaped like "<prefix>-<digits>".
            #
            # @param prefix [String] e.g. 'category', 'tag', 'author'
            # @return [String, nil] the digits of the first matching class
            def archive_id(prefix)
              # Built once per call rather than once per class name.
              pattern = /\A#{Regexp.escape(prefix)}-(\d+)\z/
              body_classes.filter_map { |klass| klass[pattern, 1] }.first
            end

            # Prefers the page's canonical link when it is same-origin with the
            # requested URL; otherwise (missing, cross-origin, or unresolvable)
            # the requested URL is used.
            #
            # @return [Html2rss::Url]
            def canonical_or_current_url
              href = parsed_body.at_css(WordpressApi::CANONICAL_LINK_SELECTOR)&.[]('href').to_s.strip
              return url if href.empty?

              canonical_url = Html2rss::Url.from_relative(href, url)
              same_origin_url?(canonical_url, url) ? canonical_url : url
            rescue ArgumentError
              url
            end

            def path_segments
              @path_segments ||= canonical_or_current_url.path_segments
            end

            def leading_path_segment
              path_segments.first
            end

            def date_archive_path?
              !date_archive_segments.nil?
            end

            # @return [Hash<String, String>, nil] 'after'/'before' bounds covering the
            #   archive period, or nil when the path does not name a valid date
            def date_archive_range
              components = date_archive_components
              return unless components

              start_date = Date.new(*components.fetch(:start_date_parts))
              {
                'after' => iso8601_start(start_date),
                'before' => iso8601_start(next_archive_boundary(start_date, components.fetch(:precision)))
              }
            rescue Date::Error
              # e.g. /2024/02/30/ - in range per segment but not a real date
              nil
            end

            # @return [Hash, nil] :start_date_parts ([year, month, day]) and :precision,
            #   or nil when there is no date path or a month/day segment is out of range
            def date_archive_components
              segments = date_archive_segments
              return unless segments

              year = segments.fetch(0).to_i
              month = parse_archive_segment(segments[1], 1, 12)
              day = parse_archive_segment(segments[2], 1, 31)
              # A segment that is present but out of range (e.g. /2024/13/) must
              # not silently widen the query to the whole year or month.
              return if segments[1] && month.nil?
              return if segments[2] && day.nil?

              {
                start_date_parts: [year, month || 1, day || 1],
                precision: archive_precision(month:, day:)
              }
            end

            # Takes the path from the first four-digit segment onwards and
            # accepts it only as year, year/month or year/month/day.
            #
            # @return [Array<String>, nil]
            def date_archive_segments
              year_index = path_segments.find_index { _1.match?(/\A\d{4}\z/) }
              return unless year_index

              segments = path_segments.drop(year_index)
              return unless segments.length.between?(1, 3)
              return unless archive_segment_shape?(segments)

              segments
            end

            # @return [Boolean] true when the optional month and day segments are all digits
            def archive_segment_shape?(segments)
              month = segments[1]
              day = segments[2]
              return false if day && month.nil?
              return false unless month.nil? || month.match?(/\A\d+\z/)
              return false unless day.nil? || day.match?(/\A\d+\z/)

              true
            end

            def same_origin_url?(left, right)
              [left.scheme, left.host, left.port] == [right.scheme, right.host, right.port]
            end

            # @return [Symbol] :day, :month or :year - the finest component present
            def archive_precision(month:, day:)
              return :day if day
              return :month if month

              :year
            end

            # @param start_date [Date] first day of the archive period
            # @param precision [Symbol] :year, :month or :day
            # @return [Date] first day after the archive period
            def next_archive_boundary(start_date, precision)
              {
                year: start_date.next_year,
                month: start_date.next_month,
                day: start_date.next_day
              }.fetch(precision)
            end

            # @return [String] midnight of the given date with a literal Z suffix
            def iso8601_start(date)
              date.strftime('%Y-%m-%dT00:00:00Z')
            end

            # @param value [String, nil] raw path segment
            # @return [Integer, nil] the number when it is all digits and within
            #   minimum..maximum, otherwise nil
            def parse_archive_segment(value, minimum, maximum)
              return nil unless value&.match?(/\A\d+\z/)

              number = value.to_i
              return nil if number < minimum || number > maximum

              number
            end
          end
        end
      end
    end
  end
end
|