html2rss 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html.rb +48 -56
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
- data/lib/html2rss/config/class_methods.rb +2 -2
- data/lib/html2rss/config/request_headers.rb +18 -9
- data/lib/html2rss/configuration.rb +176 -0
- data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
- data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
- data/lib/html2rss/html_extractor.rb +11 -0
- data/lib/html2rss/rss_builder/channel.rb +10 -7
- data/lib/html2rss/url.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +54 -5
- metadata +9 -3
|
@@ -11,47 +11,13 @@ module Html2rss
|
|
|
11
11
|
# scraping can reason about link intent instead of DOM order. It favors
|
|
12
12
|
# heading-aligned article links and suppresses utility links, duplicate
|
|
13
13
|
# destinations, and weak textless affordances.
|
|
14
|
-
class AnchorSelector
|
|
15
|
-
AnchorFacts = Data.define(
|
|
16
|
-
:anchor,
|
|
17
|
-
:text,
|
|
18
|
-
:url,
|
|
19
|
-
:destination,
|
|
20
|
-
:segments,
|
|
21
|
-
:meaningful_text,
|
|
22
|
-
:content_like_destination,
|
|
23
|
-
:heading_anchor,
|
|
24
|
-
:heading_text_match,
|
|
25
|
-
:score
|
|
26
|
-
)
|
|
27
|
-
|
|
14
|
+
class AnchorSelector
|
|
28
15
|
# Comma-separated heading selector used for heading/anchor matching.
|
|
29
16
|
HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
|
|
30
|
-
# Path segments that usually represent utility navigation rather than article content.
|
|
31
|
-
UTILITY_PATH_SEGMENTS = %w[
|
|
32
|
-
about account author category comment comments contact feedback help
|
|
33
|
-
login newsletter profile register search settings share signup subscribe
|
|
34
|
-
topic topics view-all archive archives
|
|
35
|
-
feed feeds
|
|
36
|
-
recommended
|
|
37
|
-
for-you
|
|
38
|
-
preference preferences
|
|
39
|
-
notification notifications
|
|
40
|
-
privacy terms
|
|
41
|
-
cookie cookies
|
|
42
|
-
logout
|
|
43
|
-
user users
|
|
44
|
-
].to_set.freeze
|
|
45
|
-
# Path segments that signal content-like destinations.
|
|
46
|
-
CONTENT_PATH_SEGMENTS = %w[
|
|
47
|
-
article articles news post posts story stories update updates
|
|
48
|
-
].to_set.freeze
|
|
49
|
-
# Ancestor tags that usually indicate navigation/utility regions.
|
|
50
|
-
UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
|
|
51
17
|
|
|
52
18
|
# @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
|
|
53
19
|
def initialize(base_url)
|
|
54
|
-
@
|
|
20
|
+
@link_heuristics = LinkHeuristics.new(base_url)
|
|
55
21
|
end
|
|
56
22
|
|
|
57
23
|
##
|
|
@@ -70,132 +36,11 @@ module Html2rss
|
|
|
70
36
|
|
|
71
37
|
private
|
|
72
38
|
|
|
73
|
-
attr_reader :base_url
|
|
74
|
-
|
|
75
39
|
def facts_for(container)
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
next if anchor.path.match?(Html::TAGS_TO_IGNORE)
|
|
81
|
-
|
|
82
|
-
facts = build_facts(anchor, heading, heading_text)
|
|
83
|
-
next unless facts
|
|
84
|
-
|
|
85
|
-
keep_stronger_fact(best_by_destination, facts)
|
|
86
|
-
end.values
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
|
|
90
|
-
text = visible_text(anchor)
|
|
91
|
-
meaningful_text = meaningful_text?(text)
|
|
92
|
-
ancestors = anchor.ancestors.to_a
|
|
93
|
-
url = normalized_destination(anchor)
|
|
94
|
-
return unless url
|
|
95
|
-
|
|
96
|
-
segments = url.path_segments
|
|
97
|
-
content_like_destination = content_like_destination?(segments)
|
|
98
|
-
return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
|
|
99
|
-
|
|
100
|
-
heading_anchor = heading_anchor?(ancestors, heading)
|
|
101
|
-
heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
|
|
102
|
-
return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
|
|
103
|
-
|
|
104
|
-
AnchorFacts.new(
|
|
105
|
-
anchor:,
|
|
106
|
-
text:,
|
|
107
|
-
url:,
|
|
108
|
-
destination: url.to_s,
|
|
109
|
-
segments:,
|
|
110
|
-
meaningful_text:,
|
|
111
|
-
content_like_destination:,
|
|
112
|
-
heading_anchor:,
|
|
113
|
-
heading_text_match:,
|
|
114
|
-
score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
|
|
115
|
-
)
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
|
|
119
|
-
utility_destination?(segments) ||
|
|
120
|
-
utility_text?(text) ||
|
|
121
|
-
icon_only_anchor?(anchor, meaningful_text) ||
|
|
122
|
-
utility_landmark_anchor?(ancestors)
|
|
123
|
-
end
|
|
124
|
-
|
|
125
|
-
def keep_stronger_fact(best_by_destination, facts)
|
|
126
|
-
current = best_by_destination[facts.destination]
|
|
127
|
-
return best_by_destination[facts.destination] = facts unless current
|
|
128
|
-
return if current.score >= facts.score
|
|
129
|
-
|
|
130
|
-
best_by_destination[facts.destination] = facts
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
def content_like_anchor?(meaningful_text, content_like_destination)
|
|
134
|
-
meaningful_text || content_like_destination
|
|
135
|
-
end
|
|
136
|
-
|
|
137
|
-
def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
|
|
138
|
-
score = 0
|
|
139
|
-
score += 100 if heading_anchor
|
|
140
|
-
score += 20 if heading_text_match
|
|
141
|
-
score += 10 if meaningful_text
|
|
142
|
-
score += 10 if content_like_destination
|
|
143
|
-
score
|
|
144
|
-
end
|
|
145
|
-
|
|
146
|
-
def heading_anchor?(ancestors, heading)
|
|
147
|
-
heading && ancestors.include?(heading)
|
|
148
|
-
end
|
|
149
|
-
|
|
150
|
-
def heading_text_match?(heading_text, text, meaningful_text)
|
|
151
|
-
meaningful_text && meaningful_text?(heading_text) && heading_text == text
|
|
152
|
-
end
|
|
153
|
-
|
|
154
|
-
def heading_for(container)
|
|
155
|
-
container.at_css(HEADING_SELECTOR)
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
def icon_only_anchor?(anchor, meaningful_text)
|
|
159
|
-
!meaningful_text && anchor.at_css('img, svg')
|
|
160
|
-
end
|
|
161
|
-
|
|
162
|
-
def utility_destination?(segments)
|
|
163
|
-
segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
|
|
164
|
-
end
|
|
165
|
-
|
|
166
|
-
def content_like_destination?(segments)
|
|
167
|
-
segments.any? do |segment|
|
|
168
|
-
CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
|
|
169
|
-
end
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
def normalized_destination(anchor)
|
|
173
|
-
href = anchor['href'].to_s.split('#').first.to_s.strip
|
|
174
|
-
return if href.empty?
|
|
175
|
-
|
|
176
|
-
Html2rss::Url.from_relative(href, base_url)
|
|
177
|
-
rescue ArgumentError
|
|
178
|
-
nil
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
def meaningful_text?(text)
|
|
182
|
-
text.scan(/\p{Alnum}+/).any?
|
|
183
|
-
end
|
|
184
|
-
|
|
185
|
-
def utility_text?(text)
|
|
186
|
-
text.match?(
|
|
187
|
-
/\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
|
|
188
|
-
)
|
|
189
|
-
end
|
|
190
|
-
|
|
191
|
-
def utility_landmark_anchor?(ancestors)
|
|
192
|
-
ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
|
|
193
|
-
end
|
|
194
|
-
|
|
195
|
-
def visible_text(node)
|
|
196
|
-
return '' unless node
|
|
197
|
-
|
|
198
|
-
HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
40
|
+
HtmlExtractor::SemanticAnchorCandidates.new(
|
|
41
|
+
container,
|
|
42
|
+
link_heuristics: @link_heuristics
|
|
43
|
+
).to_a
|
|
199
44
|
end
|
|
200
45
|
end
|
|
201
46
|
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class SemanticHtml
|
|
7
|
+
##
|
|
8
|
+
# Collapses nested containers and deduplicates entries pointing to the same destination.
|
|
9
|
+
# It resolves ties using scoring precedence and payload richness comparison.
|
|
10
|
+
class Deduplicator
|
|
11
|
+
# @param url [String, Html2rss::Url] base url used to resolve relative hrefs
|
|
12
|
+
# @param extractor [Class] extractor class used to materialize articles
|
|
13
|
+
def initialize(url, extractor)
|
|
14
|
+
@url = url
|
|
15
|
+
@extractor = extractor
|
|
16
|
+
@article_cache = {}.compare_by_identity
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
# Collapses and deduplicates the given entries.
|
|
20
|
+
#
|
|
21
|
+
# @param entries [Array<Entry>] list of scraper entries
|
|
22
|
+
# @return [Array<Entry>] deduplicated list of scraper entries
|
|
23
|
+
def call(entries)
|
|
24
|
+
destination_groups(entries).filter_map do |group|
|
|
25
|
+
collapsed_group = collapse_nested_destination_group(group)
|
|
26
|
+
collapsed_group.reduce do |best, entry|
|
|
27
|
+
stronger_entry?(entry, best) ? entry : best
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
# Returns the materialized article hash for the entry, using the cache.
|
|
33
|
+
#
|
|
34
|
+
# @param entry [Entry] scraper entry
|
|
35
|
+
# @return [Hash, nil] article payload
|
|
36
|
+
def article_for(entry)
|
|
37
|
+
return entry.article if entry.article
|
|
38
|
+
|
|
39
|
+
@article_cache.fetch(entry) do
|
|
40
|
+
@article_cache[entry] = @extractor.new(
|
|
41
|
+
entry.container, base_url: @url, selected_anchor: entry.selected_anchor
|
|
42
|
+
).call
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Compares two entries to determine which is stronger.
|
|
47
|
+
#
|
|
48
|
+
# @param left [Entry] left entry
|
|
49
|
+
# @param right [Entry] right entry
|
|
50
|
+
# @return [Boolean] true if left is stronger than right
|
|
51
|
+
def stronger_entry?(left, right) # rubocop:disable Metrics/AbcSize
|
|
52
|
+
final_delta = left.final_score <=> right.final_score
|
|
53
|
+
return final_delta.positive? unless final_delta.zero?
|
|
54
|
+
|
|
55
|
+
quality_delta = left.quality_score <=> right.quality_score
|
|
56
|
+
return quality_delta.positive? unless quality_delta.zero?
|
|
57
|
+
|
|
58
|
+
left_article = article_for(left)
|
|
59
|
+
right_article = article_for(right)
|
|
60
|
+
return !right_article if left_article.nil? || right_article.nil?
|
|
61
|
+
|
|
62
|
+
richness_delta = payload_richness_signature(left_article) <=> payload_richness_signature(right_article)
|
|
63
|
+
richness_delta.zero? ? left.position < right.position : richness_delta.positive?
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
private
|
|
67
|
+
|
|
68
|
+
def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
|
|
69
|
+
|
|
70
|
+
def collapse_nested_destination_group(entries)
|
|
71
|
+
return entries if entries.size <= 1
|
|
72
|
+
|
|
73
|
+
entries.reject do |entry|
|
|
74
|
+
entries.any? do |other|
|
|
75
|
+
next if entry.equal?(other)
|
|
76
|
+
next unless nested_container_pair?(entry.container, other.container)
|
|
77
|
+
|
|
78
|
+
stronger_entry?(other, entry)
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
|
|
84
|
+
|
|
85
|
+
def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
|
|
86
|
+
|
|
87
|
+
def payload_richness_signature(article)
|
|
88
|
+
[
|
|
89
|
+
article[:published_at] ? 1 : 0,
|
|
90
|
+
word_count(article[:description]),
|
|
91
|
+
article[:image] ? 1 : 0,
|
|
92
|
+
Array(article[:categories]).length,
|
|
93
|
+
Array(article[:enclosures]).length
|
|
94
|
+
]
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require_relative 'semantic_html/anchor_selector'
|
|
4
|
+
require_relative 'semantic_html/deduplicator'
|
|
4
5
|
|
|
5
6
|
module Html2rss
|
|
6
7
|
class AutoSource
|
|
@@ -17,20 +18,20 @@ module Html2rss
|
|
|
17
18
|
# The result is lower recall on weak-signal blocks, but much better link
|
|
18
19
|
# quality on modern teaser cards that mix headlines, utility links, and
|
|
19
20
|
# duplicate image overlays.
|
|
20
|
-
class SemanticHtml
|
|
21
|
+
class SemanticHtml # rubocop:disable Metrics/ClassLength
|
|
21
22
|
include Enumerable
|
|
22
23
|
|
|
23
|
-
# Container plus selected anchor
|
|
24
|
-
Entry = Data.define(
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
24
|
+
# Container plus selected anchor, scoring metadata, and extracted article.
|
|
25
|
+
Entry = Data.define(
|
|
26
|
+
:container,
|
|
27
|
+
:selected_anchor,
|
|
28
|
+
:destination_facts,
|
|
29
|
+
:quality_score,
|
|
30
|
+
:junk_score,
|
|
31
|
+
:final_score,
|
|
32
|
+
:position,
|
|
33
|
+
:article
|
|
34
|
+
)
|
|
34
35
|
|
|
35
36
|
##
|
|
36
37
|
# @return [Symbol] config key used to enable or configure this scraper
|
|
@@ -53,6 +54,7 @@ module Html2rss
|
|
|
53
54
|
@parsed_body = parsed_body
|
|
54
55
|
@url = url
|
|
55
56
|
@extractor = extractor
|
|
57
|
+
@link_heuristics = LinkHeuristics.new(url)
|
|
56
58
|
@anchor_selector = AnchorSelector.new(url)
|
|
57
59
|
end
|
|
58
60
|
|
|
@@ -71,14 +73,7 @@ module Html2rss
|
|
|
71
73
|
def each
|
|
72
74
|
return enum_for(:each) unless block_given?
|
|
73
75
|
|
|
74
|
-
|
|
75
|
-
article_hash = @extractor.new(
|
|
76
|
-
entry.container,
|
|
77
|
-
base_url: @url,
|
|
78
|
-
selected_anchor: entry.selected_anchor
|
|
79
|
-
).call
|
|
80
|
-
yield article_hash if article_hash
|
|
81
|
-
end
|
|
76
|
+
ranked_entries.each { yield _1.article }
|
|
82
77
|
end
|
|
83
78
|
|
|
84
79
|
##
|
|
@@ -100,28 +95,175 @@ module Html2rss
|
|
|
100
95
|
@anchor_selector.primary_anchor_for(container)
|
|
101
96
|
end
|
|
102
97
|
|
|
103
|
-
def extractable_entries
|
|
98
|
+
def extractable_entries # rubocop:disable Metrics/MethodLength
|
|
104
99
|
@extractable_entries ||= candidate_containers.filter_map do |container|
|
|
105
100
|
selected_anchor = primary_anchor_for(container)
|
|
101
|
+
|
|
106
102
|
next unless selected_anchor
|
|
107
103
|
|
|
108
|
-
|
|
104
|
+
destination_facts = normalized_destination(selected_anchor)
|
|
105
|
+
next unless destination_facts
|
|
106
|
+
next if hard_junk_entry?(container, selected_anchor, destination_facts)
|
|
107
|
+
|
|
108
|
+
quality = quality_score(container, selected_anchor, destination_facts)
|
|
109
|
+
junk = junk_score(container, selected_anchor, destination_facts)
|
|
110
|
+
|
|
111
|
+
Entry.new(
|
|
112
|
+
container:,
|
|
113
|
+
selected_anchor:,
|
|
114
|
+
destination_facts:,
|
|
115
|
+
quality_score: quality,
|
|
116
|
+
junk_score: junk,
|
|
117
|
+
final_score: quality - junk,
|
|
118
|
+
position: document_position(container),
|
|
119
|
+
article: nil
|
|
120
|
+
)
|
|
109
121
|
end
|
|
110
122
|
end
|
|
111
123
|
|
|
112
|
-
|
|
113
|
-
|
|
124
|
+
# rubocop:disable Metrics/MethodLength
|
|
125
|
+
def ranked_entries
|
|
126
|
+
@ranked_entries ||= begin
|
|
127
|
+
deduplicator = Deduplicator.new(@url, @extractor)
|
|
128
|
+
entries = deduplicator.call(extractable_entries)
|
|
129
|
+
entries = stable_rank(entries)
|
|
114
130
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
next
|
|
118
|
-
next if seen[container]
|
|
131
|
+
entries.filter_map do |entry|
|
|
132
|
+
article = deduplicator.article_for(entry)
|
|
133
|
+
next unless article
|
|
119
134
|
|
|
120
|
-
|
|
121
|
-
|
|
135
|
+
Entry.new(
|
|
136
|
+
container: entry.container,
|
|
137
|
+
selected_anchor: entry.selected_anchor,
|
|
138
|
+
destination_facts: entry.destination_facts,
|
|
139
|
+
quality_score: entry.quality_score,
|
|
140
|
+
junk_score: entry.junk_score,
|
|
141
|
+
final_score: entry.final_score,
|
|
142
|
+
position: entry.position,
|
|
143
|
+
article:
|
|
144
|
+
)
|
|
122
145
|
end
|
|
123
146
|
end
|
|
124
147
|
end
|
|
148
|
+
# rubocop:enable Metrics/MethodLength
|
|
149
|
+
|
|
150
|
+
def collect_candidate_containers
|
|
151
|
+
HtmlExtractor::SemanticContainers.call(parsed_body)
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
private
|
|
155
|
+
|
|
156
|
+
def document_position(container)
|
|
157
|
+
(@document_positions ||= candidate_containers.each_with_index.to_h).fetch(container)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def quality_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
161
|
+
title = entry_title(container, selected_anchor)
|
|
162
|
+
words = word_count(title)
|
|
163
|
+
container_text = visible_text(container)
|
|
164
|
+
score = 0
|
|
165
|
+
|
|
166
|
+
score += 40 if words >= 3
|
|
167
|
+
score += 15 if words >= 7
|
|
168
|
+
score += 20 if destination_facts.url.path.to_s.length > 6
|
|
169
|
+
score += 15 if destination_facts.content_path
|
|
170
|
+
score += 15 if publish_marker?(container)
|
|
171
|
+
score += 10 if descriptive_context?(container_text, title)
|
|
172
|
+
score += 10 if article_container?(container)
|
|
173
|
+
score += 10 if content_tokens?(container_tokens(container))
|
|
174
|
+
score
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
def junk_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
178
|
+
title = entry_title(container, selected_anchor)
|
|
179
|
+
utility_text = @link_heuristics.utility_prefix_text?(title)
|
|
180
|
+
recommended_text = @link_heuristics.recommended_text?(title)
|
|
181
|
+
content_signal = destination_facts.content_path
|
|
182
|
+
no_content_signal = !content_signal
|
|
183
|
+
non_content_utility_path =
|
|
184
|
+
destination_facts.utility_path &&
|
|
185
|
+
no_content_signal &&
|
|
186
|
+
!destination_facts.strong_post_suffix
|
|
187
|
+
publish_signal = publish_marker?(container)
|
|
188
|
+
descriptive_signal = descriptive_context?(visible_text(container), title)
|
|
189
|
+
weak_container = !publish_signal && !descriptive_signal
|
|
190
|
+
score = 0
|
|
191
|
+
|
|
192
|
+
score += 25 if non_content_utility_path
|
|
193
|
+
score += 15 if utility_text && word_count(title) <= 6
|
|
194
|
+
score += 10 if destination_facts.shallow
|
|
195
|
+
score += 10 if weak_container
|
|
196
|
+
score += 10 if recommended_text && no_content_signal
|
|
197
|
+
score += 5 if destination_facts.high_confidence_junk_path
|
|
198
|
+
score += 15 if junk_tokens?(container_tokens(container))
|
|
199
|
+
score
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
def hard_junk_entry?(container, selected_anchor, destination_facts) # rubocop:disable Metrics/MethodLength
|
|
203
|
+
title = entry_title(container, selected_anchor)
|
|
204
|
+
publish_signal = publish_marker?(container)
|
|
205
|
+
descriptive_signal = descriptive_context?(visible_text(container), title)
|
|
206
|
+
content_signal = destination_facts.content_path
|
|
207
|
+
weak_article_candidate = article_signal_count(
|
|
208
|
+
container,
|
|
209
|
+
publish_signal:,
|
|
210
|
+
descriptive_signal:,
|
|
211
|
+
content_signal:
|
|
212
|
+
) < 2
|
|
213
|
+
|
|
214
|
+
destination_facts.high_confidence_junk_path ||
|
|
215
|
+
(@link_heuristics.recommended_text?(title) && destination_facts.shallow && weak_article_candidate) ||
|
|
216
|
+
(@link_heuristics.utility_prefix_text?(title) &&
|
|
217
|
+
destination_facts.high_confidence_utility_destination &&
|
|
218
|
+
weak_article_candidate)
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def publish_marker?(container)
|
|
222
|
+
container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
|
|
226
|
+
[article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
def article_container?(container) = container.name == 'article'
|
|
230
|
+
|
|
231
|
+
def descriptive_context?(container_text, title)
|
|
232
|
+
snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
|
|
233
|
+
word_count(snippet) >= 8
|
|
234
|
+
end
|
|
235
|
+
|
|
236
|
+
def heading_for(container) = container.at_css(AnchorSelector::HEADING_SELECTOR)
|
|
237
|
+
|
|
238
|
+
def normalized_destination(anchor) = @link_heuristics.destination_facts(anchor)
|
|
239
|
+
|
|
240
|
+
def visible_text(node)
|
|
241
|
+
return '' unless node
|
|
242
|
+
|
|
243
|
+
HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
|
|
247
|
+
|
|
248
|
+
def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
|
|
249
|
+
|
|
250
|
+
def container_tokens(container)
|
|
251
|
+
classes = container['class'].to_s.split
|
|
252
|
+
id = container['id'].to_s
|
|
253
|
+
(classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
def content_tokens?(tokens)
|
|
257
|
+
(@content_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)).intersect?(tokens.to_set)
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
def junk_tokens?(tokens)
|
|
261
|
+
(@junk_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)).intersect?(tokens.to_set)
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
def stable_rank(entries)
|
|
265
|
+
entries.sort_by { |entry| [-entry.final_score, entry.position] }
|
|
266
|
+
end
|
|
125
267
|
end
|
|
126
268
|
end
|
|
127
269
|
end
|
|
@@ -54,7 +54,7 @@ module Html2rss
|
|
|
54
54
|
return log_missing_api_root if href.empty?
|
|
55
55
|
|
|
56
56
|
Html2rss::Url.from_relative(href, page_url)
|
|
57
|
-
rescue
|
|
57
|
+
rescue ArgumentError => error
|
|
58
58
|
logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
|
|
59
59
|
nil
|
|
60
60
|
end
|
|
@@ -138,13 +138,13 @@ module Html2rss
|
|
|
138
138
|
},
|
|
139
139
|
channel: { time_zone: 'UTC' },
|
|
140
140
|
headers: RequestHeaders.browser_defaults,
|
|
141
|
-
stylesheets: []
|
|
141
|
+
stylesheets: Html2rss.configuration.stylesheets || []
|
|
142
142
|
}
|
|
143
143
|
end
|
|
144
144
|
|
|
145
145
|
# @return [Symbol] the default strategy for feed orchestration
|
|
146
146
|
def default_strategy_name
|
|
147
|
-
:auto
|
|
147
|
+
Html2rss.configuration.default_strategy || :auto
|
|
148
148
|
end
|
|
149
149
|
|
|
150
150
|
private
|
|
@@ -17,13 +17,8 @@ module Html2rss
|
|
|
17
17
|
*/*;q=0.8
|
|
18
18
|
].join(',')
|
|
19
19
|
|
|
20
|
-
#
|
|
21
|
-
DEFAULT_USER_AGENT =
|
|
22
|
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
|
|
23
|
-
'AppleWebKit/537.36 (KHTML, like Gecko)',
|
|
24
|
-
'Chrome/123.0.0.0',
|
|
25
|
-
'Safari/537.36'
|
|
26
|
-
].join(' ')
|
|
20
|
+
# Default `User-Agent` header value.
|
|
21
|
+
DEFAULT_USER_AGENT = "html2rss/#{Html2rss::VERSION}".freeze
|
|
27
22
|
|
|
28
23
|
# Baseline browser-like header set used for outbound requests.
|
|
29
24
|
DEFAULT_HEADERS = {
|
|
@@ -40,9 +35,23 @@ module Html2rss
|
|
|
40
35
|
|
|
41
36
|
class << self
|
|
42
37
|
##
|
|
43
|
-
#
|
|
38
|
+
# :reek:ManualDispatch
|
|
39
|
+
# :reek:TooManyStatements
|
|
40
|
+
#
|
|
41
|
+
# @return [Hash{String => String}] the default header set merged with global defaults
|
|
44
42
|
def browser_defaults
|
|
45
|
-
DEFAULT_HEADERS.dup
|
|
43
|
+
defaults = DEFAULT_HEADERS.dup
|
|
44
|
+
global_headers = Html2rss.configuration.headers
|
|
45
|
+
global_headers = global_headers.call if global_headers.respond_to?(:call)
|
|
46
|
+
|
|
47
|
+
if global_headers.is_a?(Hash)
|
|
48
|
+
global_headers.each do |key, value|
|
|
49
|
+
canonical_key = key.to_s.split('-').map(&:capitalize).join('-')
|
|
50
|
+
defaults[canonical_key] = value.to_s
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
defaults
|
|
46
55
|
end
|
|
47
56
|
|
|
48
57
|
##
|