html2rss 0.20.1 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2rss.gemspec +1 -2
- data/lib/html2rss/auto_source/scraper/html.rb +61 -16
- data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
- data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
- data/lib/html2rss/auto_source/scraper.rb +0 -3
- data/lib/html2rss/auto_source.rb +2 -11
- data/lib/html2rss/category_extractor.rb +54 -20
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
- data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
- data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
- data/lib/html2rss/html_extractor.rb +51 -30
- data/lib/html2rss/rendering/description_builder.rb +3 -3
- data/lib/html2rss/rss_builder/article.rb +44 -23
- data/lib/html2rss/rss_builder/enclosure.rb +4 -2
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
- data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
- data/lib/html2rss/selectors/post_processors/template.rb +3 -2
- data/lib/html2rss/selectors.rb +18 -4
- data/lib/html2rss/url.rb +4 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +3 -17
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8168109d2cc60920d8a18b6b99970a5558e43163ad5cd11cb3d3f0d944d46943
|
|
4
|
+
data.tar.gz: 833a936f89f9ce31c0b4fb0036020c7962a4ac77e0dfa72f1134a0bae8bea4c4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 734f286a486d49c86ab7baf48d157cdee9d988fdc8b693ac7d79bf3c64c661fcd54538d5e94dc19bdc8a6f3021168c1ecac2d8e34417f56879392d71600c7340
|
|
7
|
+
data.tar.gz: f008a767b452557cff1b45b1abb0eccb26f38d839417e95d21b8cf74f4546f9143b067e2d11f9fc00f5955c3f145f8933a1b1d4912ad25756c890280d4bb1a37
|
data/html2rss.gemspec
CHANGED
|
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
|
|
|
14
14
|
spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
|
|
15
15
|
spec.homepage = 'https://github.com/html2rss/html2rss'
|
|
16
16
|
spec.license = 'MIT'
|
|
17
|
-
spec.required_ruby_version = '>= 3.
|
|
17
|
+
spec.required_ruby_version = '>= 3.3'
|
|
18
18
|
|
|
19
19
|
if spec.respond_to?(:metadata)
|
|
20
20
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
|
@@ -41,7 +41,6 @@ Gem::Specification.new do |spec|
|
|
|
41
41
|
spec.add_dependency 'kramdown'
|
|
42
42
|
spec.add_dependency 'mime-types', '> 3.0'
|
|
43
43
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
|
44
|
-
spec.add_dependency 'parallel'
|
|
45
44
|
spec.add_dependency 'puppeteer-ruby'
|
|
46
45
|
spec.add_dependency 'regexp_parser'
|
|
47
46
|
spec.add_dependency 'reverse_markdown', '~> 3.0'
|
|
@@ -63,6 +63,7 @@ module Html2rss
|
|
|
63
63
|
@extractor = extractor
|
|
64
64
|
@opts = opts
|
|
65
65
|
@link_heuristics = LinkHeuristics.new(url)
|
|
66
|
+
@ignored_cache = {}.compare_by_identity
|
|
66
67
|
end
|
|
67
68
|
|
|
68
69
|
attr_reader :parsed_body
|
|
@@ -73,10 +74,13 @@ module Html2rss
|
|
|
73
74
|
def each
|
|
74
75
|
return enum_for(:each) unless block_given?
|
|
75
76
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
77
|
+
articles.each { yield _1 }
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
##
|
|
81
|
+
# @return [Boolean] true when the scraper can likely extract articles
|
|
82
|
+
def extractable?
|
|
83
|
+
articles.any?
|
|
80
84
|
end
|
|
81
85
|
|
|
82
86
|
##
|
|
@@ -91,7 +95,7 @@ module Html2rss
|
|
|
91
95
|
# @return [Boolean] true when the node is a good extraction boundary
|
|
92
96
|
def article_tag_condition?(node)
|
|
93
97
|
# Ignore tags that are below ignored DOM chrome.
|
|
94
|
-
return false if HtmlExtractor.ignored_container_path?(node)
|
|
98
|
+
return false if HtmlExtractor.ignored_container_path?(node, @ignored_cache)
|
|
95
99
|
return true if %w[body html].include?(node.name)
|
|
96
100
|
return false unless (parent = node.parent)
|
|
97
101
|
|
|
@@ -100,14 +104,30 @@ module Html2rss
|
|
|
100
104
|
|
|
101
105
|
private
|
|
102
106
|
|
|
107
|
+
def articles
|
|
108
|
+
@articles ||= each_article_tag.filter_map do |article_tag, selected_anchor|
|
|
109
|
+
extract_article(article_tag, selected_anchor:)
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
##
|
|
114
|
+
# @return [Integer]
|
|
103
115
|
def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
|
|
116
|
+
|
|
117
|
+
##
|
|
118
|
+
# @return [Boolean]
|
|
104
119
|
def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
|
|
105
120
|
|
|
121
|
+
##
|
|
122
|
+
# @param node [Nokogiri::XML::Node]
|
|
123
|
+
# @return [Integer]
|
|
106
124
|
def anchor_count(node)
|
|
107
|
-
@anchor_counts ||= {}
|
|
108
|
-
@anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
|
|
125
|
+
(@anchor_counts ||= {}.compare_by_identity)[node] ||= node.name == 'a' ? 1 : node.css('a').size
|
|
109
126
|
end
|
|
110
127
|
|
|
128
|
+
##
|
|
129
|
+
# @param node [Nokogiri::XML::Node]
|
|
130
|
+
# @return [Boolean]
|
|
111
131
|
def relevant_anchor?(node)
|
|
112
132
|
destination_facts = @link_heuristics.destination_facts(node)
|
|
113
133
|
return false unless destination_facts
|
|
@@ -115,14 +135,24 @@ module Html2rss
|
|
|
115
135
|
!noise_anchor?(node, destination_facts)
|
|
116
136
|
end
|
|
117
137
|
|
|
138
|
+
##
|
|
139
|
+
# @yield [article_tag, selected_anchor]
|
|
140
|
+
# @yieldparam article_tag [Nokogiri::XML::Node]
|
|
141
|
+
# @yieldparam selected_anchor [Nokogiri::XML::Node]
|
|
142
|
+
# @return [Enumerator, nil]
|
|
118
143
|
def each_article_tag(&block)
|
|
119
144
|
return enum_for(:each_article_tag) unless block
|
|
120
145
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
146
|
+
anchor_filter = ->(node) { relevant_anchor?(node) }
|
|
147
|
+
boundary_condition = ->(node) { article_tag_condition?(node) }
|
|
148
|
+
|
|
149
|
+
list_candidates.each_article_tag(anchor_filter:, boundary_condition:, &block)
|
|
124
150
|
end
|
|
125
151
|
|
|
152
|
+
##
|
|
153
|
+
# @param article_tag [Nokogiri::XML::Node]
|
|
154
|
+
# @param selected_anchor [Nokogiri::XML::Node, nil]
|
|
155
|
+
# @return [Hash, nil]
|
|
126
156
|
def extract_article(article_tag, selected_anchor: nil)
|
|
127
157
|
selected_anchor ||= preferred_anchor_for(article_tag)
|
|
128
158
|
return unless selected_anchor
|
|
@@ -131,18 +161,28 @@ module Html2rss
|
|
|
131
161
|
@extractor.new(article_tag, base_url: @url, selected_anchor:).call
|
|
132
162
|
end
|
|
133
163
|
|
|
164
|
+
##
|
|
165
|
+
# @param anchor [Nokogiri::XML::Node]
|
|
166
|
+
# @param destination_facts [DestinationFacts]
|
|
167
|
+
# @return [Boolean]
|
|
134
168
|
def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
135
169
|
return true unless destination_facts
|
|
136
170
|
|
|
137
|
-
|
|
171
|
+
(@noise_anchors ||= {}.compare_by_identity)[anchor] ||= begin
|
|
172
|
+
text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
|
|
138
173
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
174
|
+
destination_facts.taxonomy_path ||
|
|
175
|
+
short_utility_label?(text, destination_facts) ||
|
|
176
|
+
(@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
|
|
177
|
+
(@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
|
|
178
|
+
(@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
|
|
179
|
+
end
|
|
144
180
|
end
|
|
145
181
|
|
|
182
|
+
##
|
|
183
|
+
# @param text [String]
|
|
184
|
+
# @param destination_facts [DestinationFacts]
|
|
185
|
+
# @return [Boolean]
|
|
146
186
|
def short_utility_label?(text, destination_facts)
|
|
147
187
|
destination_facts.utility_path &&
|
|
148
188
|
!destination_facts.content_path &&
|
|
@@ -150,11 +190,16 @@ module Html2rss
|
|
|
150
190
|
text.scan(/\p{Alnum}+/).size <= 3
|
|
151
191
|
end
|
|
152
192
|
|
|
193
|
+
##
|
|
194
|
+
# @param article_tag [Nokogiri::XML::Node]
|
|
195
|
+
# @return [Nokogiri::XML::Node, nil]
|
|
153
196
|
def preferred_anchor_for(article_tag)
|
|
154
197
|
article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).find { relevant_anchor?(_1) } ||
|
|
155
198
|
HtmlExtractor.main_anchor_for(article_tag)
|
|
156
199
|
end
|
|
157
200
|
|
|
201
|
+
##
|
|
202
|
+
# @return [HtmlExtractor::ListCandidates]
|
|
158
203
|
def list_candidates
|
|
159
204
|
HtmlExtractor::ListCandidates.new(
|
|
160
205
|
parsed_body,
|
|
@@ -30,6 +30,9 @@ module Html2rss
|
|
|
30
30
|
/(?:window|self|globalThis)\.angular\s*=\s*/m
|
|
31
31
|
].freeze
|
|
32
32
|
|
|
33
|
+
# Combined regex for faster matching of global assignments.
|
|
34
|
+
GLOBAL_ASSIGNMENT_REGEXP = Regexp.union(GLOBAL_ASSIGNMENT_PATTERNS).freeze
|
|
35
|
+
|
|
33
36
|
# Preferred keys when extracting title-like values from state payloads.
|
|
34
37
|
TITLE_KEYS = %i[title headline name text].freeze
|
|
35
38
|
# Preferred keys when extracting URL-like values from state payloads.
|
|
@@ -53,7 +56,12 @@ module Html2rss
|
|
|
53
56
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
54
57
|
# @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
|
|
55
58
|
def json_documents(parsed_body)
|
|
56
|
-
|
|
59
|
+
# Use identity-based cache to avoid double-parsing of the same document.
|
|
60
|
+
# WeakMap allows the Nokogiri Document (key) to be garbage collected.
|
|
61
|
+
# rubocop:disable ThreadSafety/ClassInstanceVariable
|
|
62
|
+
(@cache ||= ObjectSpace::WeakMap.new)[parsed_body] ||=
|
|
63
|
+
script_documents(parsed_body) + assignment_documents(parsed_body)
|
|
64
|
+
# rubocop:enable ThreadSafety/ClassInstanceVariable
|
|
57
65
|
end
|
|
58
66
|
|
|
59
67
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
@@ -80,15 +88,10 @@ module Html2rss
|
|
|
80
88
|
def assignment_payload(text)
|
|
81
89
|
trimmed = text.to_s.strip
|
|
82
90
|
return if trimmed.empty?
|
|
91
|
+
return unless trimmed.match?(GLOBAL_ASSIGNMENT_REGEXP)
|
|
83
92
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
payload = trimmed.sub(pattern, '')
|
|
88
|
-
return extract_assignment_payload(payload)
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
nil
|
|
93
|
+
payload = trimmed.sub(GLOBAL_ASSIGNMENT_REGEXP, '')
|
|
94
|
+
extract_assignment_payload(payload)
|
|
92
95
|
end
|
|
93
96
|
|
|
94
97
|
# @param text [String] text potentially containing JSON-like payloads
|
|
@@ -116,8 +119,10 @@ module Html2rss
|
|
|
116
119
|
in_string = false
|
|
117
120
|
escape = false
|
|
118
121
|
|
|
119
|
-
|
|
120
|
-
|
|
122
|
+
i = start_index
|
|
123
|
+
len = text.length
|
|
124
|
+
while i < len
|
|
125
|
+
char = text[i]
|
|
121
126
|
|
|
122
127
|
if in_string
|
|
123
128
|
if escape
|
|
@@ -127,24 +132,22 @@ module Html2rss
|
|
|
127
132
|
elsif char == '"'
|
|
128
133
|
in_string = false
|
|
129
134
|
end
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
stack << ']'
|
|
140
|
-
when '}', ']'
|
|
141
|
-
expected = stack.pop
|
|
142
|
-
return index if expected == char && stack.empty?
|
|
135
|
+
else
|
|
136
|
+
case char
|
|
137
|
+
when '"' then in_string = true
|
|
138
|
+
when '{' then stack << '}'
|
|
139
|
+
when '[' then stack << ']'
|
|
140
|
+
when '}', ']'
|
|
141
|
+
expected = stack.pop
|
|
142
|
+
return i if expected == char && stack.empty?
|
|
143
|
+
end
|
|
143
144
|
end
|
|
145
|
+
i += 1
|
|
144
146
|
end
|
|
145
147
|
|
|
146
148
|
nil
|
|
147
149
|
end
|
|
150
|
+
|
|
148
151
|
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
149
152
|
|
|
150
153
|
# @param payload [String, nil] JSON payload to parse
|
|
@@ -184,8 +187,9 @@ module Html2rss
|
|
|
184
187
|
# @param jsonish [String] JSON-like string with potentially unquoted keys
|
|
185
188
|
# @return [String] payload with unquoted object keys quoted
|
|
186
189
|
def quote_unquoted_keys(jsonish)
|
|
187
|
-
jsonish.gsub(/(
|
|
188
|
-
|
|
190
|
+
jsonish.gsub(/(?<prefix>\A\s*|[{,\[]\s*)(?<key>[A-Za-z_]\w*)(?<suffix>\s*:)/) do
|
|
191
|
+
captures = Regexp.last_match.named_captures(symbolize_names: true)
|
|
192
|
+
"#{captures[:prefix]}\"#{captures[:key]}\"#{captures[:suffix]}"
|
|
189
193
|
end
|
|
190
194
|
end
|
|
191
195
|
|
|
@@ -415,12 +419,17 @@ module Html2rss
|
|
|
415
419
|
|
|
416
420
|
attr_reader :parsed_body
|
|
417
421
|
|
|
422
|
+
# @return [Boolean] true when the page contains article-like arrays in JSON state
|
|
423
|
+
def extractable?
|
|
424
|
+
json_documents.any? { CandidateDetector.candidate_array?(_1) }
|
|
425
|
+
end
|
|
426
|
+
|
|
418
427
|
# @yield [Hash{Symbol => Object}] normalized article hash
|
|
419
428
|
# @return [Enumerator, void] article enumerator when no block is given
|
|
420
429
|
def each
|
|
421
430
|
return enum_for(:each) unless block_given?
|
|
422
431
|
|
|
423
|
-
|
|
432
|
+
json_documents.each do |document|
|
|
424
433
|
discover_articles(document) do |article|
|
|
425
434
|
yield article if article
|
|
426
435
|
end
|
|
@@ -431,6 +440,10 @@ module Html2rss
|
|
|
431
440
|
|
|
432
441
|
attr_reader :url
|
|
433
442
|
|
|
443
|
+
def json_documents
|
|
444
|
+
self.class.json_documents(parsed_body)
|
|
445
|
+
end
|
|
446
|
+
|
|
434
447
|
def discover_articles(document, &block)
|
|
435
448
|
case document
|
|
436
449
|
when Array then handle_array(document, &block)
|
|
@@ -24,19 +24,30 @@ module Html2rss
|
|
|
24
24
|
) do
|
|
25
25
|
# @param url [Html2rss::Url] normalized destination URL
|
|
26
26
|
# @return [DestinationFacts] route facts for downstream link scoring
|
|
27
|
-
def self.build(url)
|
|
27
|
+
def self.build(url) # rubocop:disable Metrics/MethodLength
|
|
28
28
|
classifier = PathClassifier.new(url.path_segments)
|
|
29
29
|
|
|
30
30
|
new(
|
|
31
31
|
url:,
|
|
32
32
|
destination: url.to_s,
|
|
33
|
-
|
|
33
|
+
segments: classifier.segments,
|
|
34
|
+
strong_post_suffix: classifier.strong_post_suffix?,
|
|
35
|
+
content_path: classifier.content_path?,
|
|
36
|
+
utility_path: classifier.utility_path?,
|
|
37
|
+
taxonomy_path: classifier.taxonomy_path?,
|
|
38
|
+
vanity_path: classifier.vanity_path?,
|
|
39
|
+
shallow: classifier.shallow?,
|
|
40
|
+
high_confidence_junk_path: classifier.junk_path?,
|
|
41
|
+
high_confidence_utility_destination: classifier.utility_destination?
|
|
34
42
|
)
|
|
35
43
|
end
|
|
36
44
|
end
|
|
37
45
|
|
|
38
46
|
# Extracts a normalized href from a Nokogiri anchor or raw href value.
|
|
39
47
|
class HrefExtractor
|
|
48
|
+
# Regexp to capture everything before the first '#'
|
|
49
|
+
HREF_BASE_PATTERN = /\A([^#]*)/
|
|
50
|
+
|
|
40
51
|
# @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
|
|
41
52
|
# @return [String, nil] href without fragment, or nil when blank
|
|
42
53
|
def self.call(anchor_or_href) = new(anchor_or_href).call
|
|
@@ -48,20 +59,18 @@ module Html2rss
|
|
|
48
59
|
|
|
49
60
|
# @return [String, nil] href without fragment, or nil when blank
|
|
50
61
|
def call
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
62
|
+
href = case @anchor_or_href
|
|
63
|
+
when Nokogiri::XML::Node
|
|
64
|
+
@anchor_or_href['href']
|
|
65
|
+
else
|
|
66
|
+
@anchor_or_href
|
|
67
|
+
end
|
|
55
68
|
|
|
56
|
-
|
|
69
|
+
return unless href
|
|
57
70
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
@anchor_or_href['href']
|
|
62
|
-
else
|
|
63
|
-
@anchor_or_href
|
|
64
|
-
end
|
|
71
|
+
# Extract base part before # and strip whitespace
|
|
72
|
+
base = href.to_s[HREF_BASE_PATTERN, 1].strip
|
|
73
|
+
base unless base.empty?
|
|
65
74
|
end
|
|
66
75
|
end
|
|
67
76
|
|
|
@@ -125,8 +134,7 @@ module Html2rss
|
|
|
125
134
|
end
|
|
126
135
|
|
|
127
136
|
# Classifies normalized destination path segments for scoring.
|
|
128
|
-
# rubocop:disable Metrics/ClassLength
|
|
129
|
-
class PathClassifier
|
|
137
|
+
class PathClassifier # rubocop:disable Metrics/ClassLength
|
|
130
138
|
attr_reader :segments
|
|
131
139
|
|
|
132
140
|
# Segment groups used to classify article, taxonomy, utility, and vanity routes.
|
|
@@ -206,48 +214,25 @@ module Html2rss
|
|
|
206
214
|
@segments = segments
|
|
207
215
|
end
|
|
208
216
|
|
|
209
|
-
# @return [Hash] destination attributes consumed by DestinationFacts
|
|
210
|
-
def destination_attributes
|
|
211
|
-
route_attributes.merge(confidence_attributes)
|
|
212
|
-
end
|
|
213
|
-
|
|
214
|
-
# @return [Hash] baseline path classification attributes
|
|
215
|
-
def route_attributes
|
|
216
|
-
{
|
|
217
|
-
segments:,
|
|
218
|
-
content_path: content_path?,
|
|
219
|
-
utility_path: utility_path?,
|
|
220
|
-
taxonomy_path: taxonomy_path?,
|
|
221
|
-
vanity_path: vanity_path?,
|
|
222
|
-
shallow: shallow?,
|
|
223
|
-
strong_post_suffix: strong_post_suffix?
|
|
224
|
-
}
|
|
225
|
-
end
|
|
226
|
-
|
|
227
|
-
# @return [Hash] high-confidence noise classification attributes
|
|
228
|
-
def confidence_attributes
|
|
229
|
-
ConfidenceClassifier.new(self).attributes
|
|
230
|
-
end
|
|
231
|
-
|
|
232
217
|
# @return [Boolean] true when the route has article-like path evidence
|
|
233
218
|
def content_path?
|
|
234
|
-
@content_path ||= SEGMENT_SETS
|
|
219
|
+
@content_path ||= segments.any? { |s| SEGMENT_SETS[:content].include?(s) } ||
|
|
235
220
|
yearish_content_context?
|
|
236
221
|
end
|
|
237
222
|
|
|
238
223
|
# @return [Boolean] true when the route includes utility/navigation evidence
|
|
239
224
|
def utility_path?
|
|
240
|
-
@utility_path ||= SEGMENT_SETS
|
|
225
|
+
@utility_path ||= segments.any? { |s| SEGMENT_SETS[:utility].include?(s) }
|
|
241
226
|
end
|
|
242
227
|
|
|
243
228
|
# @return [Boolean] true when the route points at conversion or account chrome
|
|
244
229
|
def vanity_path?
|
|
245
|
-
@vanity_path ||= SEGMENT_SETS
|
|
230
|
+
@vanity_path ||= segments.any? { |s| SEGMENT_SETS[:vanity].include?(s) }
|
|
246
231
|
end
|
|
247
232
|
|
|
248
233
|
# @return [Boolean] true when the route points at taxonomy/listing chrome
|
|
249
234
|
def taxonomy_path?
|
|
250
|
-
@taxonomy_path ||= SEGMENT_SETS
|
|
235
|
+
@taxonomy_path ||= segments.any? { |s| SEGMENT_SETS[:taxonomy].include?(s) }
|
|
251
236
|
end
|
|
252
237
|
|
|
253
238
|
# @return [Boolean] true when the route is too shallow to strongly indicate an article
|
|
@@ -260,7 +245,9 @@ module Html2rss
|
|
|
260
245
|
|
|
261
246
|
# @return [Boolean] true when the final path segment looks like a post slug
|
|
262
247
|
def strong_post_suffix?
|
|
263
|
-
|
|
248
|
+
@strong_post_suffix ||= segments.any? &&
|
|
249
|
+
included_last_segment? &&
|
|
250
|
+
trusted_post_context?(segments.size - 1)
|
|
264
251
|
end
|
|
265
252
|
|
|
266
253
|
# @return [Boolean] true when every path segment is utility chrome
|
|
@@ -282,131 +269,81 @@ module Html2rss
|
|
|
282
269
|
|
|
283
270
|
# @return [Boolean] true when the leading segments are all utility chrome
|
|
284
271
|
def deep_utility_context_route?
|
|
285
|
-
|
|
272
|
+
all_junk?(segments.size - 1)
|
|
286
273
|
end
|
|
287
274
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
def yearish_content_context?
|
|
291
|
-
segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
|
|
292
|
-
(strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
|
|
293
|
-
end
|
|
294
|
-
end
|
|
295
|
-
# rubocop:enable Metrics/ClassLength
|
|
296
|
-
|
|
297
|
-
# Classifies high-confidence junk and utility routes from path facts.
|
|
298
|
-
class ConfidenceClassifier
|
|
299
|
-
# @param path [PathClassifier] classified destination path
|
|
300
|
-
def initialize(path)
|
|
301
|
-
@path = path
|
|
302
|
-
end
|
|
303
|
-
|
|
304
|
-
# @return [Hash] high-confidence route classification attributes
|
|
305
|
-
def attributes
|
|
306
|
-
{
|
|
307
|
-
high_confidence_junk_path: junk_path?,
|
|
308
|
-
high_confidence_utility_destination: utility_destination?
|
|
309
|
-
}
|
|
310
|
-
end
|
|
311
|
-
|
|
312
|
-
private
|
|
313
|
-
|
|
275
|
+
# @return [Boolean] true when the route is shallow and contains high-confidence noise
|
|
314
276
|
def junk_path?
|
|
315
277
|
return false if excluded_content_route?
|
|
316
278
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
279
|
+
taxonomy_path? ||
|
|
280
|
+
utility_only_route? ||
|
|
281
|
+
deep_utility_context_route? ||
|
|
282
|
+
shallow_high_confidence_route?
|
|
321
283
|
end
|
|
322
284
|
|
|
285
|
+
# @return [Boolean] true when the route points at conversion or account chrome
|
|
323
286
|
def utility_destination?
|
|
324
287
|
return false if excluded_content_route?
|
|
325
288
|
|
|
326
|
-
|
|
289
|
+
vanity_path? || utility_route?
|
|
290
|
+
end
|
|
291
|
+
|
|
292
|
+
private
|
|
293
|
+
|
|
294
|
+
def yearish_content_context?
|
|
295
|
+
segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
|
|
296
|
+
(strong_post_suffix? || trusted_post_context?(segments.size - 1))
|
|
327
297
|
end
|
|
328
298
|
|
|
329
299
|
def excluded_content_route?
|
|
330
|
-
|
|
300
|
+
segments.empty? || content_path? || strong_post_suffix?
|
|
331
301
|
end
|
|
332
302
|
|
|
333
303
|
def utility_route?
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
304
|
+
taxonomy_path? ||
|
|
305
|
+
utility_only_route? ||
|
|
306
|
+
deep_utility_context_route? ||
|
|
337
307
|
shallow_utility_route?
|
|
338
308
|
end
|
|
339
309
|
|
|
340
310
|
def shallow_utility_route?
|
|
341
|
-
|
|
342
|
-
end
|
|
343
|
-
end
|
|
344
|
-
|
|
345
|
-
# Classifies route context before the final segment.
|
|
346
|
-
class LeadingSegments
|
|
347
|
-
# @param segments [Array<String>] normalized URL path segments
|
|
348
|
-
def initialize(segments)
|
|
349
|
-
@segments = segments[0...-1]
|
|
311
|
+
shallow? && utility_path?
|
|
350
312
|
end
|
|
351
313
|
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
junk_segments = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
|
|
314
|
+
def all_junk?(limit)
|
|
315
|
+
return false if limit <= 0
|
|
355
316
|
|
|
356
|
-
|
|
317
|
+
junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
|
|
318
|
+
(0...limit).all? { |i| junk_segments.include?(segments[i]) }
|
|
357
319
|
end
|
|
358
320
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
321
|
+
def trusted_post_context?(limit)
|
|
322
|
+
return false if limit <= 0
|
|
323
|
+
|
|
324
|
+
content_segments = SEGMENT_SETS.fetch(:content)
|
|
325
|
+
context_segments = SEGMENT_SETS.fetch(:deep_post_context)
|
|
363
326
|
|
|
364
|
-
|
|
327
|
+
(0...limit).any? do |i|
|
|
328
|
+
segment = segments[i]
|
|
365
329
|
content_segments.include?(segment) ||
|
|
366
330
|
segment.match?(PathClassifier::YEARISH_SEGMENT) ||
|
|
367
331
|
context_segments.include?(segment)
|
|
368
332
|
end
|
|
369
333
|
end
|
|
370
|
-
end
|
|
371
|
-
|
|
372
|
-
# Classifies whether the final segment is a strong post-like suffix.
|
|
373
|
-
class PostSuffixClassifier
|
|
374
|
-
# @param segments [Array<String>] normalized URL path segments
|
|
375
|
-
def initialize(segments)
|
|
376
|
-
@segments = segments
|
|
377
|
-
end
|
|
378
|
-
|
|
379
|
-
# @return [Boolean] true when the final path segment looks like a post slug
|
|
380
|
-
def strong?
|
|
381
|
-
@segments.any? &&
|
|
382
|
-
included_last_segment? &&
|
|
383
|
-
LeadingSegments.new(@segments).trusted_post_context?
|
|
384
|
-
end
|
|
385
|
-
|
|
386
|
-
private
|
|
387
334
|
|
|
388
335
|
def included_last_segment?
|
|
389
336
|
!excluded_last_segment? && slug_last_segment?
|
|
390
337
|
end
|
|
391
338
|
|
|
392
339
|
def excluded_last_segment?
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
def excluded_segments
|
|
397
|
-
[
|
|
398
|
-
PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk),
|
|
399
|
-
PathClassifier::SEGMENT_SETS.fetch(:vanity)
|
|
400
|
-
]
|
|
340
|
+
last = segments.last
|
|
341
|
+
[SEGMENT_SETS[:high_confidence_junk], SEGMENT_SETS[:vanity]].any? { |set| set.include?(last) }
|
|
401
342
|
end
|
|
402
343
|
|
|
403
344
|
def slug_last_segment?
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
end
|
|
407
|
-
|
|
408
|
-
def last_segment
|
|
409
|
-
@segments.last
|
|
345
|
+
last = segments.last
|
|
346
|
+
last.match?(YEARISH_SEGMENT) || last.match?(POST_SLUG_SEGMENT)
|
|
410
347
|
end
|
|
411
348
|
end
|
|
412
349
|
|
|
@@ -421,11 +358,15 @@ module Html2rss
|
|
|
421
358
|
# @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
|
|
422
359
|
# @return [DestinationFacts, nil] normalized destination facts, or nil for blank/invalid URLs
|
|
423
360
|
def destination_facts(anchor_or_href)
|
|
361
|
+
return node_facts[anchor_or_href] if node_facts.key?(anchor_or_href)
|
|
362
|
+
|
|
424
363
|
href = HrefExtractor.call(anchor_or_href)
|
|
425
364
|
return unless href
|
|
426
365
|
|
|
427
|
-
|
|
428
|
-
|
|
366
|
+
res = memoized_destination_facts(href)
|
|
367
|
+
|
|
368
|
+
node_facts[anchor_or_href] = res if anchor_or_href.is_a?(Nokogiri::XML::Node)
|
|
369
|
+
res
|
|
429
370
|
rescue ArgumentError
|
|
430
371
|
nil
|
|
431
372
|
end
|
|
@@ -441,6 +382,19 @@ module Html2rss
|
|
|
441
382
|
# @param text [String, #to_s] visible anchor text
|
|
442
383
|
# @return [Boolean] true when text identifies recommendation chrome
|
|
443
384
|
def recommended_text?(text) = @text_classifier.recommended?(text)
|
|
385
|
+
|
|
386
|
+
private
|
|
387
|
+
|
|
388
|
+
def node_facts
|
|
389
|
+
@node_facts ||= {}.compare_by_identity
|
|
390
|
+
end
|
|
391
|
+
|
|
392
|
+
def memoized_destination_facts(href)
|
|
393
|
+
(@destination_facts ||= {})[href] ||= begin
|
|
394
|
+
url = Html2rss::Url.from_relative(href, @base_url)
|
|
395
|
+
DestinationFacts.build(url)
|
|
396
|
+
end
|
|
397
|
+
end
|
|
444
398
|
end
|
|
445
399
|
end
|
|
446
400
|
end
|