html2rss 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html.rb +48 -56
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
- data/lib/html2rss/config/class_methods.rb +2 -2
- data/lib/html2rss/config/request_headers.rb +18 -9
- data/lib/html2rss/configuration.rb +176 -0
- data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
- data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
- data/lib/html2rss/html_extractor.rb +11 -0
- data/lib/html2rss/rss_builder/channel.rb +10 -7
- data/lib/html2rss/url.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +54 -5
- metadata +9 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1e4867b7a4906d0e4bb9d6cb9facfe96da516175a82fe10824e9ed579cf4aa3d
|
|
4
|
+
data.tar.gz: 53a8f699b87817b2b62cbe5d5d1761f33004f0b3be1ddb6c7e2428d449923a7c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a38e85afebf7bd17739915cf9f59846a76212690dc5309a02f37b24f4910247a1e34e3c3eeede165f52ea8c77dff39de472a65b5b5d3b6d8b99c02a19f6dfb0a
|
|
7
|
+
data.tar.gz: e3a0ad5868a070adf65a0cd78b560d196df7ed932fa424372397f906e0e2afc362eeb0afa864ddcb9bef10cf344efc2f6d9505c8515be502df342fc2a0975252
|
|
@@ -19,9 +19,8 @@ module Html2rss
|
|
|
19
19
|
class Html
|
|
20
20
|
include Enumerable
|
|
21
21
|
|
|
22
|
-
#
|
|
23
|
-
|
|
24
|
-
|
|
22
|
+
# Absolute base URL used when probe-time detection needs to normalize relative hrefs.
|
|
23
|
+
DETECTION_BASE_URL = 'https://example.com'
|
|
25
24
|
# Minimum selector frequency required to treat a path as a stable list signal.
|
|
26
25
|
DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
|
|
27
26
|
# Number of most frequent selectors kept for container extraction.
|
|
@@ -39,7 +38,7 @@ module Html2rss
|
|
|
39
38
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
40
39
|
# @return [Boolean] true when the scraper can likely extract articles
|
|
41
40
|
def self.articles?(parsed_body)
|
|
42
|
-
new(parsed_body, url:
|
|
41
|
+
new(parsed_body, url: DETECTION_BASE_URL).any?
|
|
43
42
|
end
|
|
44
43
|
|
|
45
44
|
##
|
|
@@ -49,7 +48,7 @@ module Html2rss
|
|
|
49
48
|
# @param xpath [String] original XPath
|
|
50
49
|
# @return [String] XPath without positional indexes
|
|
51
50
|
def self.simplify_xpath(xpath)
|
|
52
|
-
|
|
51
|
+
HtmlExtractor::ListCandidates.simplify_xpath(xpath)
|
|
53
52
|
end
|
|
54
53
|
|
|
55
54
|
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
|
@@ -63,6 +62,7 @@ module Html2rss
|
|
|
63
62
|
@url = url
|
|
64
63
|
@extractor = extractor
|
|
65
64
|
@opts = opts
|
|
65
|
+
@link_heuristics = LinkHeuristics.new(url)
|
|
66
66
|
end
|
|
67
67
|
|
|
68
68
|
attr_reader :parsed_body
|
|
@@ -73,8 +73,8 @@ module Html2rss
|
|
|
73
73
|
def each
|
|
74
74
|
return enum_for(:each) unless block_given?
|
|
75
75
|
|
|
76
|
-
each_article_tag do |article_tag|
|
|
77
|
-
article_hash = extract_article(article_tag)
|
|
76
|
+
each_article_tag do |article_tag, selected_anchor|
|
|
77
|
+
article_hash = extract_article(article_tag, selected_anchor:)
|
|
78
78
|
yield article_hash if article_hash
|
|
79
79
|
end
|
|
80
80
|
end
|
|
@@ -90,8 +90,8 @@ module Html2rss
|
|
|
90
90
|
# @param node [Nokogiri::XML::Node] candidate boundary node
|
|
91
91
|
# @return [Boolean] true when the node is a good extraction boundary
|
|
92
92
|
def article_tag_condition?(node)
|
|
93
|
-
# Ignore tags that are below
|
|
94
|
-
return false if
|
|
93
|
+
# Ignore tags that are below ignored DOM chrome.
|
|
94
|
+
return false if HtmlExtractor.ignored_container_path?(node)
|
|
95
95
|
return true if %w[body html].include?(node.name)
|
|
96
96
|
return false unless (parent = node.parent)
|
|
97
97
|
|
|
@@ -100,24 +100,6 @@ module Html2rss
|
|
|
100
100
|
|
|
101
101
|
private
|
|
102
102
|
|
|
103
|
-
##
|
|
104
|
-
# Find relevant anchors in root.
|
|
105
|
-
# @return [Set<String>] The set of XPath selectors
|
|
106
|
-
def selectors
|
|
107
|
-
@selectors ||= Hash.new(0).tap do |selectors|
|
|
108
|
-
each_relevant_anchor { |node| increment_selector_count(selectors, node) }
|
|
109
|
-
end
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
##
|
|
113
|
-
# Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
|
|
114
|
-
# @return [Array<String>] The filtered selectors
|
|
115
|
-
def filtered_selectors
|
|
116
|
-
selectors.select { |_selector, count| count >= minimum_selector_frequency }
|
|
117
|
-
.max_by(use_top_selectors, &:last)
|
|
118
|
-
.map(&:first)
|
|
119
|
-
end
|
|
120
|
-
|
|
121
103
|
def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
|
|
122
104
|
def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
|
|
123
105
|
|
|
@@ -126,49 +108,59 @@ module Html2rss
|
|
|
126
108
|
@anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
|
|
127
109
|
end
|
|
128
110
|
|
|
129
|
-
def
|
|
130
|
-
|
|
111
|
+
def relevant_anchor?(node)
|
|
112
|
+
destination_facts = @link_heuristics.destination_facts(node)
|
|
113
|
+
return false unless destination_facts
|
|
131
114
|
|
|
132
|
-
|
|
133
|
-
yield node if relevant_anchor?(node)
|
|
134
|
-
end
|
|
115
|
+
!noise_anchor?(node, destination_facts)
|
|
135
116
|
end
|
|
136
117
|
|
|
137
|
-
def
|
|
138
|
-
|
|
139
|
-
end
|
|
118
|
+
def each_article_tag(&block)
|
|
119
|
+
return enum_for(:each_article_tag) unless block
|
|
140
120
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
121
|
+
list_candidates.each_article_tag(anchor_filter: method(:relevant_anchor?),
|
|
122
|
+
boundary_condition: method(:article_tag_condition?),
|
|
123
|
+
&block)
|
|
144
124
|
end
|
|
145
125
|
|
|
146
|
-
def
|
|
147
|
-
|
|
126
|
+
def extract_article(article_tag, selected_anchor: nil)
|
|
127
|
+
selected_anchor ||= preferred_anchor_for(article_tag)
|
|
128
|
+
return unless selected_anchor
|
|
129
|
+
return if noise_anchor?(selected_anchor, @link_heuristics.destination_facts(selected_anchor))
|
|
130
|
+
|
|
131
|
+
@extractor.new(article_tag, base_url: @url, selected_anchor:).call
|
|
148
132
|
end
|
|
149
133
|
|
|
150
|
-
def
|
|
151
|
-
return
|
|
134
|
+
def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
135
|
+
return true unless destination_facts
|
|
152
136
|
|
|
153
|
-
|
|
154
|
-
parsed_body.xpath(selector).each do |selected_tag|
|
|
155
|
-
article_tag = article_tag_for(selected_tag)
|
|
156
|
-
yield article_tag if article_tag
|
|
157
|
-
end
|
|
158
|
-
end
|
|
159
|
-
end
|
|
137
|
+
text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
|
|
160
138
|
|
|
161
|
-
|
|
162
|
-
|
|
139
|
+
destination_facts.taxonomy_path ||
|
|
140
|
+
short_utility_label?(text, destination_facts) ||
|
|
141
|
+
(@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
|
|
142
|
+
(@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
|
|
143
|
+
(@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
|
|
144
|
+
end
|
|
163
145
|
|
|
164
|
-
|
|
146
|
+
def short_utility_label?(text, destination_facts)
|
|
147
|
+
destination_facts.utility_path &&
|
|
148
|
+
!destination_facts.content_path &&
|
|
149
|
+
!destination_facts.strong_post_suffix &&
|
|
150
|
+
text.scan(/\p{Alnum}+/).size <= 3
|
|
165
151
|
end
|
|
166
152
|
|
|
167
|
-
def
|
|
168
|
-
|
|
169
|
-
|
|
153
|
+
def preferred_anchor_for(article_tag)
|
|
154
|
+
article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).find { relevant_anchor?(_1) } ||
|
|
155
|
+
HtmlExtractor.main_anchor_for(article_tag)
|
|
156
|
+
end
|
|
170
157
|
|
|
171
|
-
|
|
158
|
+
def list_candidates
|
|
159
|
+
HtmlExtractor::ListCandidates.new(
|
|
160
|
+
parsed_body,
|
|
161
|
+
minimum_selector_frequency:,
|
|
162
|
+
use_top_selectors:
|
|
163
|
+
)
|
|
172
164
|
end
|
|
173
165
|
end
|
|
174
166
|
end
|
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
##
|
|
7
|
+
# Shared link-level heuristics used by scraper-local selection and
|
|
8
|
+
# scoring. This keeps normalization and route/text classification
|
|
9
|
+
# consistent without moving scraper policy into higher orchestration.
|
|
10
|
+
class LinkHeuristics
|
|
11
|
+
# Normalized URL plus reusable route-classification facts for one link.
|
|
12
|
+
DestinationFacts = Data.define(
  :url, :destination, :segments,
  :content_path, :utility_path, :taxonomy_path, :vanity_path,
  :shallow, :strong_post_suffix,
  :high_confidence_junk_path, :high_confidence_utility_destination
) do
  # Classifies the URL's path segments and bundles the resulting route
  # attributes with the URL object and its string form.
  #
  # @param url [Html2rss::Url] normalized destination URL
  # @return [DestinationFacts] route facts for downstream link scoring
  def self.build(url)
    route_facts = PathClassifier.new(url.path_segments).destination_attributes

    new(url:, destination: url.to_s, **route_facts)
  end
end
|
|
37
|
+
|
|
38
|
+
# Extracts a normalized href from a Nokogiri anchor or raw href value.
|
|
39
|
+
class HrefExtractor
  # Convenience entry point: builds an extractor and runs it.
  #
  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
  # @return [String, nil] href without fragment, or nil when blank
  def self.call(anchor_or_href)
    new(anchor_or_href).call
  end

  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
  def initialize(anchor_or_href)
    @anchor_or_href = anchor_or_href
  end

  # Keeps everything before the first '#', trims surrounding whitespace and
  # reports nil when nothing is left.
  #
  # @return [String, nil] href without fragment, or nil when blank
  def call
    candidate = raw_href.to_s.partition('#').first.strip
    candidate unless candidate.empty?
  end

  private

  # Nokogiri nodes are asked for their href attribute; any other value is
  # treated as the href itself.
  def raw_href
    return @anchor_or_href['href'] if @anchor_or_href.is_a?(Nokogiri::XML::Node)

    @anchor_or_href
  end
end
|
|
67
|
+
|
|
68
|
+
# Classifies visible anchor text for utility and recommendation chrome.
|
|
69
|
+
class TextClassifier
  # Prefix labels that usually identify navigation or subscription links.
  UTILITY_PREFIX_PATTERN = /
    \A\s*(
    # English
    view\s+all|see\s+all|all\s+news|subscribe|newsletter|comment\s+feed|comments\s+feed|join|premium|plus|
    # German
    alle\s+anzeigen|alle\s+news|abonnieren|newsletter|kommentar\s+feed|mitmachen|
    # Spanish
    ver\s+todos|ver\s+todo|todas\s+las\s+noticias|suscribirse|bolet(i|í)n|comentarios\s+feed|unirse|
    # French
    voir\s+tout|voir\s+tous|toutes\s+les\s+nouvelles|s['’]abonner|flux\s+de\s+commentaires|rejoindre
    )\b
  /ix

  # Short labels that usually identify non-article navigation links.
  UTILITY_PATTERN = /
    \A\s*(
    # English
    about|contact|comments?|join|log\s+in|login|member(ship)?|
    plus|premium|pricing|recommended(\s+for\s+you)?|
    see\s+all|share|sign\s+up|signup|subscribe|view\s+all|
    # German
    (ue|ü)ber(\s+uns)?|kontakt|kommentare?|mitmachen|anmelden|login|
    mitglied(schaft)?|empfohlen(\s+f(ue|ü)r\s+dich)?|alle\s+anzeigen|
    teilen|registrieren|abonnieren|newsletter|
    # Spanish
    sobre(\s+nosotros)?|contacto|comentarios?|unirse|iniciar\s+sesion|
    login|miembro|membres(i|í)a|recomendado(\s+para\s+ti)?|ver\s+todo|
    compartir|registrarse|suscribirse|bolet(i|í)n|
    # French
    (a|à)\s+propos|(a|à)propos|contact|commentaires?|rejoindre|
    se\s+connecter|login|membre|abonnement|recommand(e|é)(\s+pour\s+vous)?|
    voir\s+tout|partager|s['’]inscrire|s['’]abonner|newsletter
    )\b
  /ix

  # Labels for recommendation chrome rather than source articles.
  RECOMMENDED_PATTERN = /
    \A\s*(
    recommended(\s+for\s+you)?|
    empfohlen(\s+f(ue|ü)r\s+dich)?|
    recomendado(\s+para\s+ti)?|
    recommand(e|é)(\s+pour\s+vous)?
    )\b
  /ix

  # Checks the stringified text against UTILITY_PATTERN, which is anchored
  # at the start of the text.
  #
  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text matches a utility label
  def utility?(text)
    UTILITY_PATTERN.match?(text.to_s)
  end

  # Checks the stringified text against UTILITY_PREFIX_PATTERN.
  #
  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text begins with a utility label
  def utility_prefix?(text)
    UTILITY_PREFIX_PATTERN.match?(text.to_s)
  end

  # Checks the stringified text against RECOMMENDED_PATTERN.
  #
  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text identifies recommendation chrome
  def recommended?(text)
    RECOMMENDED_PATTERN.match?(text.to_s)
  end
end
|
|
126
|
+
|
|
127
|
+
# Classifies normalized destination path segments for scoring.
|
|
128
|
+
# rubocop:disable Metrics/ClassLength
|
|
129
|
+
class PathClassifier
  attr_reader :segments

  # Segment groups used to classify article, taxonomy, utility, and vanity routes.
  SEGMENT_SETS = {
    content: %w[
      article articles blog blogs changelog changelogs insight insights
      launch launches news post posts release releases story stories update updates
      artikel beitrag beitraege nachrichten neuigkeiten aktuelles
      articulo articulos noticia noticias entrada entradas publicacion publicaciones
      actualite actualites nouvelle nouvelles
      teaser teasers card cards
    ].to_set.freeze,
    utility: %w[
      about account archive archives author authors category categories comment comments
      contact feedback help login logout newsletter newsletters notification notifications
      preference preferences profile register search settings share signup subscribe
      tag tags topic topics
      feed feeds comment-feed comments-feed
      recommended
      for-you
      privacy terms cookie cookies
      join member members membership plus premium plans pricing user users
      kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
      ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
      konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
      categoria categorias etiqueta etiquetas tema temas autores archivos
      sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
      registrarse registro cuenta suscribirse boletin privacidad condiciones
      categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
      a-propos apropos recherche rechercher aide connexion s-inscrire
      sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
      menu sidebar widget social modal popup banner promo ad ads
      related recommendation recommendations pagination pager
    ].to_set.freeze,
    high_confidence_junk: %w[
      about account archive archives author authors category categories comment comments
      contact cookie cookies feedback feed feeds help login logout notification notifications
      preference preferences privacy profile register search settings share signup subscribe
      tag tags terms topic topics comment-feed comments-feed user users
      kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
      ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
      konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
      categoria categorias etiqueta etiquetas tema temas autores archivos
      sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
      registrarse registro cuenta suscribirse boletin privacidad condiciones
      categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
      a-propos apropos recherche rechercher aide connexion s-inscrire
      sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
      menu sidebar widget social modal popup banner promo ad ads
      related recommendation recommendations pagination pager
    ].to_set.freeze,
    taxonomy: %w[
      category categories tag tags topic topics
      kategorie kategorien schlagwort schlagworte thema themen
      categoria categorias etiqueta etiquetas tema temas
      categorie etiquette etiquettes sujet sujets theme themes
    ].to_set.freeze,
    vanity: %w[
      join membership plus premium pricing plans subscribe signup
      abonnieren abo
      suscribirse boletin
      s-abonner saboner
    ].to_set.freeze,
    deep_post_context: %w[
      press newsroom
      presse pressemitteilungen
      prensa
    ].to_set.freeze
  }.freeze
  # Path segment that begins with a year-like publishing marker.
  YEARISH_SEGMENT = /\A\d{4,}[\w-]*\z/
  # Hyphenated slug shape common to article permalinks.
  POST_SLUG_SEGMENT = /\A[a-z0-9]+(?:-[a-z0-9]+){2,}\z/i

  # The classifier caches its answers, so +segments+ must not be mutated
  # after construction.
  #
  # @param segments [Array<String>] normalized URL path segments
  def initialize(segments)
    @segments = segments
  end

  # @return [Hash] destination attributes consumed by DestinationFacts
  def destination_attributes
    route_attributes.merge(confidence_attributes)
  end

  # @return [Hash] baseline path classification attributes
  def route_attributes
    {
      segments:,
      content_path: content_path?,
      utility_path: utility_path?,
      taxonomy_path: taxonomy_path?,
      vanity_path: vanity_path?,
      shallow: shallow?,
      strong_post_suffix: strong_post_suffix?
    }
  end

  # @return [Hash] high-confidence noise classification attributes
  def confidence_attributes
    ConfidenceClassifier.new(self).attributes
  end

  # Memoized with `defined?` rather than `||=`: `||=` re-evaluates whenever
  # the cached value is false, which is the common answer for these checks.
  #
  # @return [Boolean] true when the route has article-like path evidence
  def content_path?
    return @content_path if defined?(@content_path)

    @content_path = any_segment_in?(:content) || yearish_content_context?
  end

  # @return [Boolean] true when the route includes utility/navigation evidence
  def utility_path?
    return @utility_path if defined?(@utility_path)

    @utility_path = any_segment_in?(:utility)
  end

  # @return [Boolean] true when the route points at conversion or account chrome
  def vanity_path?
    return @vanity_path if defined?(@vanity_path)

    @vanity_path = any_segment_in?(:vanity)
  end

  # @return [Boolean] true when the route points at taxonomy/listing chrome
  def taxonomy_path?
    return @taxonomy_path if defined?(@taxonomy_path)

    @taxonomy_path = any_segment_in?(:taxonomy)
  end

  # A route is shallow with at most one segment, or with exactly two when
  # the last one is a high-confidence junk segment.
  #
  # @return [Boolean] true when the route is too shallow to strongly indicate an article
  def shallow?
    segment_count = segments.size
    junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)

    segment_count <= 1 || (segment_count == 2 && junk_segments.include?(segments.last))
  end

  # @return [Boolean] true when the final path segment looks like a post slug
  def strong_post_suffix?
    return @strong_post_suffix if defined?(@strong_post_suffix)

    @strong_post_suffix = PostSuffixClassifier.new(segments).strong?
  end

  # Note: vacuously true for an empty segment list.
  #
  # @return [Boolean] true when every path segment is utility chrome
  def utility_only_route?
    junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)

    segments.all? { |segment| junk_segments.include?(segment) }
  end

  # @return [Boolean] true when the route is shallow and contains high-confidence noise
  def shallow_high_confidence_route?
    junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
    vanity_segments = SEGMENT_SETS.fetch(:vanity)

    shallow? && segments.any? { |segment| junk_segments.include?(segment) || vanity_segments.include?(segment) }
  end

  # @return [Boolean] true when the leading segments are all utility chrome
  def deep_utility_context_route?
    LeadingSegments.new(segments).all_junk?
  end

  private

  # Set view of the segments, built once and shared by all membership checks.
  def segment_set
    @segment_set ||= segments.to_set
  end

  # @param group [Symbol] key into SEGMENT_SETS
  # @return [Boolean] true when at least one segment belongs to the group
  def any_segment_in?(group)
    SEGMENT_SETS.fetch(group).intersect?(segment_set)
  end

  # A year-like segment only counts as content evidence when the route also
  # ends in a strong post suffix or has trusted leading context.
  def yearish_content_context?
    segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
      (strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
  end
end
|
|
295
|
+
# rubocop:enable Metrics/ClassLength
|
|
296
|
+
|
|
297
|
+
# Classifies high-confidence junk and utility routes from path facts.
|
|
298
|
+
class ConfidenceClassifier
  # @param path [PathClassifier] classified destination path
  def initialize(path)
    @path = path
  end

  # @return [Hash] high-confidence route classification attributes
  def attributes
    junk = junk_path?
    utility = utility_destination?

    { high_confidence_junk_path: junk, high_confidence_utility_destination: utility }
  end

  private

  # Junk requires a non-content route that is either structural noise or a
  # shallow route containing junk/vanity segments.
  def junk_path?
    !excluded_content_route? &&
      (structural_noise_route? || @path.shallow_high_confidence_route?)
  end

  # Utility destinations are non-content routes that are vanity paths or
  # utility routes.
  def utility_destination?
    !excluded_content_route? && (@path.vanity_path? || utility_route?)
  end

  # Empty paths, content paths and strong post suffixes are never
  # classified as noise here.
  def excluded_content_route?
    @path.segments.empty? || @path.content_path? || @path.strong_post_suffix?
  end

  def utility_route?
    structural_noise_route? || shallow_utility_route?
  end

  # Checks shared by the junk and utility classifications.
  def structural_noise_route?
    @path.taxonomy_path? ||
      @path.utility_only_route? ||
      @path.deep_utility_context_route?
  end

  def shallow_utility_route?
    @path.shallow? && @path.utility_path?
  end
end
|
|
344
|
+
|
|
345
|
+
# Classifies route context before the final segment.
|
|
346
|
+
class LeadingSegments
  # Keeps every segment except the final one.
  #
  # @param segments [Array<String>] normalized URL path segments
  def initialize(segments)
    @segments = segments[0..-2]
  end

  # False when there are no leading segments at all.
  #
  # @return [Boolean] true when every leading segment is utility chrome
  def all_junk?
    return false unless @segments.any?

    junk = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
    @segments.all? { |name| junk.include?(name) }
  end

  # A leading segment gives article context when it is a content segment,
  # a year-like segment, or a deep-post-context segment.
  #
  # @return [Boolean] true when leading segments provide article context
  def trusted_post_context?
    article_names = PathClassifier::SEGMENT_SETS.fetch(:content)
    context_names = PathClassifier::SEGMENT_SETS.fetch(:deep_post_context)

    @segments.each do |name|
      return true if article_names.include?(name) ||
                     name.match?(PathClassifier::YEARISH_SEGMENT) ||
                     context_names.include?(name)
    end

    false
  end
end
|
|
371
|
+
|
|
372
|
+
# Classifies whether the final segment is a strong post-like suffix.
|
|
373
|
+
class PostSuffixClassifier
  # @param segments [Array<String>] normalized URL path segments
  def initialize(segments)
    @segments = segments
  end

  # Requires a non-empty path whose last segment is slug-shaped and not a
  # junk/vanity word, plus trusted context in the leading segments.
  #
  # @return [Boolean] true when the final path segment looks like a post slug
  def strong?
    return false unless @segments.any?
    return false unless included_last_segment?

    LeadingSegments.new(@segments).trusted_post_context?
  end

  private

  def included_last_segment?
    return false if excluded_last_segment?

    slug_last_segment?
  end

  # The last segment is excluded when it is a high-confidence junk or
  # vanity segment.
  def excluded_last_segment?
    excluded_segments.any? { |segment_set| segment_set.include?(last_segment) }
  end

  def excluded_segments
    %i[high_confidence_junk vanity].map { |group| PathClassifier::SEGMENT_SETS.fetch(group) }
  end

  # Slug-shaped means year-like or a hyphenated post slug.
  def slug_last_segment?
    [PathClassifier::YEARISH_SEGMENT, PathClassifier::POST_SLUG_SEGMENT].any? do |shape|
      last_segment.match?(shape)
    end
  end

  def last_segment
    @segments.last
  end
end
|
|
412
|
+
|
|
413
|
+
# @param base_url [String, Html2rss::Url] page URL used to resolve relative hrefs
|
|
414
|
+
def initialize(base_url)
  # One text classifier is created per instance and reused by the *_text? predicates.
  @text_classifier = TextClassifier.new
  @base_url = base_url
end
|
|
418
|
+
|
|
419
|
+
# Builds normalized destination facts for an anchor element or href string.
|
|
420
|
+
#
|
|
421
|
+
# @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
|
|
422
|
+
# @return [DestinationFacts, nil] normalized destination facts, or nil for blank/invalid URLs
|
|
423
|
+
def destination_facts(anchor_or_href)
  href = HrefExtractor.call(anchor_or_href)

  # A blank href yields nil; otherwise resolve it against the base URL and classify it.
  href && DestinationFacts.build(Html2rss::Url.from_relative(href, @base_url))
rescue ArgumentError
  # An ArgumentError raised while resolving or classifying is reported as "no facts".
  nil
end
|
|
432
|
+
|
|
433
|
+
# @param text [String, #to_s] visible anchor text
|
|
434
|
+
# @return [Boolean] true when text matches a utility label
|
|
435
|
+
def utility_text?(text)
  @text_classifier.utility?(text)
end
|
|
436
|
+
|
|
437
|
+
# @param text [String, #to_s] visible anchor text
|
|
438
|
+
# @return [Boolean] true when text begins with a utility label
|
|
439
|
+
def utility_prefix_text?(text)
  @text_classifier.utility_prefix?(text)
end
|
|
440
|
+
|
|
441
|
+
# @param text [String, #to_s] visible anchor text
|
|
442
|
+
# @return [Boolean] true when text identifies recommendation chrome
|
|
443
|
+
def recommended_text?(text)
  @text_classifier.recommended?(text)
end
|
|
444
|
+
end
|
|
445
|
+
end
|
|
446
|
+
end
|
|
447
|
+
end
|