html2rss 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html.rb +48 -56
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
- data/lib/html2rss/config/class_methods.rb +2 -2
- data/lib/html2rss/config/request_headers.rb +18 -9
- data/lib/html2rss/configuration.rb +176 -0
- data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
- data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
- data/lib/html2rss/html_extractor.rb +11 -0
- data/lib/html2rss/rss_builder/channel.rb +10 -7
- data/lib/html2rss/url.rb +2 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +54 -5
- metadata +9 -3
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
# Global configuration defaults for the Html2rss gem.
#
# Every writer validates and normalizes its argument before storing it and
# raises ArgumentError for input it cannot accept.
class Configuration
  # The valid symbol log levels.
  VALID_LOG_LEVELS = %i[debug info warn error fatal unknown].freeze

  # @return [Object] the logger
  attr_reader :logger

  # @return [Proc, #call, nil] the logger formatter
  attr_reader :logger_formatter

  # @return [Integer] the current log level as an Integer severity (0..5)
  attr_reader :log_level

  # @return [Hash, Proc, #call, nil] the globally configured headers
  attr_reader :headers

  # @return [Symbol, nil] the default strategy name
  attr_reader :default_strategy

  # @return [Integer, nil] the minimum TTL in minutes
  attr_reader :min_ttl

  # @return [Array<Hash>] the globally configured stylesheets
  attr_reader :stylesheets

  ##
  # Initializes a new Configuration instance with defaults.
  #
  # The logger writes to $stdout. The log level is read from the LOG_LEVEL
  # environment variable and falls back to :warn.
  #
  # @raise [ArgumentError] if LOG_LEVEL holds an invalid log level
  def initialize
    @logger_formatter = proc do |severity, datetime, _progname, msg|
      "#{datetime} [#{severity}] #{msg}\n"
    end
    @logger = Logger.new($stdout)
    @logger.formatter = @logger_formatter
    self.log_level = ENV.fetch('LOG_LEVEL', :warn)
    @headers = nil
    @default_strategy = nil
    @min_ttl = nil
    @stylesheets = [].freeze
  end

  ##
  # Sets the logger.
  #
  # The configured level and formatter are pushed onto the new logger when it
  # exposes the matching writers.
  #
  # @param logger [Object]
  # @return [Object] the logger
  def logger=(logger)
    @logger = logger
    @logger.level = @log_level if @logger.respond_to?(:level=)
    @logger.formatter = @logger_formatter if @logger_formatter && @logger.respond_to?(:formatter=)
  end

  ##
  # Sets the log level.
  #
  # @param level [Symbol, String, Integer] the new log level
  # @return [Integer] the normalized log level
  # @raise [ArgumentError] if the log level is invalid
  def log_level=(level)
    @log_level = normalize_log_level(level)
    @logger.level = @log_level if @logger.respond_to?(:level=)
  end

  ##
  # Sets the logger formatter.
  #
  # @param formatter [Proc, #call, nil] the new logger formatter
  # @return [Proc, #call, nil] the new logger formatter
  # @raise [ArgumentError] if formatter does not respond to #call
  def logger_formatter=(formatter)
    raise ArgumentError, 'formatter must respond to #call or be nil' if formatter && !formatter.respond_to?(:call)

    @logger_formatter = formatter
    @logger.formatter = @logger_formatter if @logger.respond_to?(:formatter=)
  end

  ##
  # Sets the global request headers.
  #
  # A Hash is stored as a frozen shallow copy; a callable is stored as given.
  #
  # @param headers [Hash, Proc, #call, nil] the HTTP request headers to globally apply
  # @return [Hash, Proc, #call, nil] the assigned headers
  # @raise [ArgumentError] if headers is not a Hash or callable
  def headers=(headers)
    if headers && !headers.is_a?(Hash) && !headers.respond_to?(:call)
      raise ArgumentError, 'headers must be a Hash or respond to #call'
    end

    @headers = headers.is_a?(Hash) ? headers.dup.freeze : headers
  end

  ##
  # Sets the default strategy.
  #
  # @param strategy [Symbol, String, nil] the strategy name
  # @return [Symbol, nil] the normalized strategy name
  # @raise [ArgumentError] if the strategy is not a Symbol/String or is not registered
  def default_strategy=(strategy)
    if strategy.nil?
      @default_strategy = nil
    else
      unless strategy.is_a?(Symbol) || strategy.is_a?(String)
        raise ArgumentError, 'strategy must be a Symbol or String'
      end

      normalized = strategy.to_sym
      raise ArgumentError, "unknown strategy: #{strategy}" unless RequestService.strategy_registered?(normalized)

      @default_strategy = normalized
    end
  end

  ##
  # Sets the minimum TTL in minutes.
  #
  # The value is coerced with Kernel#Integer. Any coercion failure, as well as
  # a zero or negative result, is reported as ArgumentError.
  #
  # @param ttl [Integer, String, nil] the minimum TTL
  # @return [Integer, nil] the normalized minimum TTL
  # @raise [ArgumentError] if ttl is not a positive integer
  def min_ttl=(ttl)
    if ttl.nil?
      @min_ttl = nil
    else
      val = Integer(ttl)
      raise ArgumentError unless val.positive?

      @min_ttl = val
    end
  rescue ArgumentError, TypeError, FloatDomainError
    # FloatDomainError covers Integer(Float::INFINITY) and Integer(Float::NAN),
    # which would otherwise escape as a RangeError subclass.
    raise ArgumentError, "min_ttl must be a positive integer, got #{ttl.inspect}"
  end

  ##
  # Sets the global stylesheets.
  #
  # The stylesheets are stored as a frozen Array of frozen shallow Hash copies.
  #
  # @param stylesheets [Array<Hash>] the XML stylesheet processing instructions to include in the generated feed
  # @return [Array<Hash>] the assigned stylesheets
  # @raise [ArgumentError] if stylesheets is not an Array of hashes
  def stylesheets=(stylesheets)
    raise ArgumentError, 'stylesheets must be an Array' unless stylesheets.is_a?(Array)
    raise ArgumentError, 'stylesheets must be an Array of Hashes' unless stylesheets.all?(Hash)

    @stylesheets = stylesheets.map { |h| h.dup.freeze }.freeze
  end

  protected

  ##
  # Copy constructor for duplicating configuration.
  #
  # Gives the copy its own headers Hash and stylesheet Hashes. Because #dup
  # does not carry over the frozen state, these copies are mutable. All other
  # instance variables, including the logger, are shared with the original.
  #
  # @param other [Html2rss::Configuration] the original configuration
  # @return [void]
  def initialize_copy(other)
    super
    @headers = @headers.dup if @headers.is_a?(Hash)
    @stylesheets = @stylesheets.map(&:dup) if @stylesheets.is_a?(Array)
  end

  private

  # Converts a log level into its Integer severity.
  #
  # Integers must lie within 0..5 and are returned unchanged. Anything else is
  # stringified, downcased and looked up in VALID_LOG_LEVELS, then mapped to
  # the Logger constant of the same name.
  #
  # @param level [Symbol, String, Integer] the log level to normalize
  # @return [Integer] the severity
  # @raise [ArgumentError] if the level is out of range or not a known name
  def normalize_log_level(level)
    if level.is_a?(Integer)
      raise ArgumentError, "invalid log level: #{level}" unless level.between?(0, 5)

      level
    else
      sym = level.to_s.downcase.to_sym
      raise ArgumentError, "invalid log level: #{level}" unless VALID_LOG_LEVELS.include?(sym)

      Logger.const_get(sym.upcase)
    end
  end
end
|
|
176
|
+
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
# Finds article container candidates in generic HTML by grouping anchors
# that share the same index-free XPath.
class ListCandidates
  ##
  # Removes every positional predicate such as "[3]" from an XPath.
  #
  # @param xpath [String] XPath that may contain positional indexes
  # @return [String] the XPath with all "[n]" predicates dropped
  def self.simplify_xpath(xpath)
    xpath.gsub(/\[\d+\]/, '')
  end

  # @param parsed_body [Nokogiri::HTML::Document] parsed document
  # @param minimum_selector_frequency [Integer] lowest anchor count a path needs to be considered
  # @param use_top_selectors [Integer] how many of the most frequent paths are inspected
  def initialize(parsed_body, minimum_selector_frequency:, use_top_selectors:)
    @parsed_body = parsed_body
    @minimum_selector_frequency = minimum_selector_frequency
    @use_top_selectors = use_top_selectors
  end

  ##
  # Yields each candidate container together with the anchor it was found from.
  # Without a block an Enumerator over the same pairs is returned.
  #
  # @param anchor_filter [#call] scraper-specific predicate deciding which anchors count
  # @param boundary_condition [#call] predicate handed to HtmlNavigator.parent_until_condition
  # @yieldparam article_tag [Nokogiri::XML::Node] candidate article container
  # @yieldparam selected_anchor [Nokogiri::XML::Node] anchor the container was resolved from
  # @return [Enumerator] when no block is given
  def each_article_tag(anchor_filter:, boundary_condition:)
    unless block_given?
      return enum_for(:each_article_tag, anchor_filter:, boundary_condition:)
    end

    article_tags(anchor_filter:, boundary_condition:).each do |entry|
      yield entry[:article_tag], entry[:selected_anchor]
    end
  end

  private

  attr_reader :parsed_body, :minimum_selector_frequency, :use_top_selectors

  # Collects the container/anchor pairs of every selected path into one flat list.
  def article_tags(anchor_filter:, boundary_condition:)
    selectors(anchor_filter:).flat_map { |path| article_tags_for_selector(path, boundary_condition) }
  end

  # Resolves the container for each node matching +selector+, skipping nodes
  # on ignored paths and nodes for which no container is found.
  # NOTE(review): the anchor_filter is not applied again here - confirm that is intended.
  def article_tags_for_selector(selector, boundary_condition)
    parsed_body.xpath(selector).each_with_object([]) do |anchor, pairs|
      next if HtmlExtractor.ignored_container_path?(anchor)

      container = HtmlNavigator.parent_until_condition(anchor, boundary_condition)
      pairs << { article_tag: container, selected_anchor: anchor } if container
    end
  end

  # Picks the most frequent anchor paths that reach the minimum frequency.
  def selectors(anchor_filter:)
    frequent = anchor_counts(anchor_filter:).select do |_path, count|
      count >= minimum_selector_frequency
    end

    frequent.max_by(use_top_selectors) { |_path, count| count }.map { |path, _count| path }
  end

  # Tallies relevant anchors by simplified XPath, leaving out ignored paths.
  def anchor_counts(anchor_filter:)
    tally = Hash.new(0)

    each_anchor(anchor_filter:) do |node|
      path = self.class.simplify_xpath(node.path)
      next if HtmlExtractor.ignored_container_path?(path)

      tally[path] += 1
    end

    tally
  end

  # Walks the traversal root and yields every relevant anchor.
  def each_anchor(anchor_filter:)
    return enum_for(:each_anchor, anchor_filter:) unless block_given?

    root = traversal_root
    return unless root

    root.traverse do |node|
      yield node if relevant_anchor?(node, anchor_filter:)
    end
  end

  # An anchor is relevant when it is an element, matches the main anchor
  # selector and passes the caller-supplied filter.
  def relevant_anchor?(node, anchor_filter:)
    return false unless node.element?
    return false unless node.matches?(HtmlExtractor::MAIN_ANCHOR_SELECTOR)

    anchor_filter.call(node)
  end

  # First node matching "body, html", or the document root as fallback.
  def traversal_root
    parsed_body.at_css('body, html') || parsed_body.root
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
|
|
6
|
+
# Builds ranked anchor facts for one semantic content container.
|
|
7
|
+
class SemanticAnchorCandidates
|
|
8
|
+
# Immutable record of one anchor candidate: its identity, its scoring
# signals and the resulting score used by semantic anchor ranking.
AnchorFacts = Data.define(
  :anchor,
  :text,
  :url,
  :destination,
  :segments,
  :meaningful_text,
  :content_like_destination,
  :heading_anchor,
  :heading_text_match,
  :score
) do
  # Builds the record from a candidate's identity and signal attribute
  # hashes plus the score computed for that candidate.
  #
  # @param candidate [Candidate] eligible semantic anchor candidate
  # @return [AnchorFacts] facts used for ranking and dedupe
  def self.from_candidate(candidate)
    identity = candidate.anchor_identity_attributes
    signals = candidate.anchor_signal_attributes
    attributes = identity.merge(signals)

    new(**attributes, score: Score.new(candidate).value)
  end
end
|
|
31
|
+
|
|
32
|
+
# Shared context for all anchors in one semantic container.
#
# Caches the container's heading lookup and forwards destination/text
# questions to the injected link heuristics.
class Context
  # Ancestor tags that usually indicate navigation/utility regions.
  UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze

  # @param container [Nokogiri::XML::Node] semantic container
  # @param link_heuristics [Html2rss::AutoSource::Scraper::LinkHeuristics] destination/text heuristics
  def initialize(container, link_heuristics:)
    @container = container
    @link_heuristics = link_heuristics
  end

  # @return [Nokogiri::XML::Node, nil] heading used to identify title anchors
  def heading
    # `||=` would not cache a nil result, so a container without a heading
    # would repeat the CSS query on every call. `defined?` caches nil as well.
    return @heading if defined?(@heading)

    @heading = @container.at_css(HtmlExtractor::HEADING_TAGS.join(','))
  end

  # @return [String] visible heading text, '' when there is no heading
  def heading_text
    @heading_text ||= visible_text(heading)
  end

  # @param node [Nokogiri::XML::Node, nil] node to extract text from
  # @return [String] stripped visible text for the node, '' for nil
  def visible_text(node)
    return '' unless node

    HtmlExtractor.extract_visible_text(node).to_s.strip
  end

  # @param anchor [Nokogiri::XML::Node] anchor candidate
  # @return [Html2rss::AutoSource::Scraper::LinkHeuristics::DestinationFacts, nil] destination facts
  def destination_facts(anchor)
    @link_heuristics.destination_facts(anchor)
  end

  # @param text [String] visible anchor text
  # @return [Boolean] true when text is utility chrome
  def utility_text?(text)
    @link_heuristics.utility_text?(text)
  end

  # @param ancestors [Array<Nokogiri::XML::Node>]
  # @return [Boolean] true when any ancestor is one of UTILITY_LANDMARK_TAGS
  def utility_landmark?(ancestors)
    ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
  end
end
|
|
80
|
+
|
|
81
|
+
# One anchor plus the facts needed to decide whether it represents content.
#
# The signal predicates are public because Score and AnchorFacts read them.
# The eligibility checks behind #facts are private.
class Candidate
  # @return [Nokogiri::XML::Node] the wrapped anchor
  attr_reader :anchor

  # @param anchor [Nokogiri::XML::Node] anchor candidate
  # @param context [Context] semantic container context
  def initialize(anchor, context)
    @anchor = anchor
    @context = context
  end

  # Returns nil when the anchor has no destination facts, is suppressed as
  # utility text, is ineligible, or carries no content signal at all.
  #
  # @return [AnchorFacts, nil] ranked anchor facts when the anchor is eligible
  def facts
    return unless destination_facts
    return if utility_text_suppressed? || ineligible_anchor?
    return unless representative_content_anchor?

    AnchorFacts.from_candidate(self)
  end

  # @return [Html2rss::AutoSource::Scraper::LinkHeuristics::DestinationFacts, nil] destination facts
  def destination_facts
    @destination_facts ||= @context.destination_facts(@anchor)
  end

  # @return [String] visible anchor text
  def text
    @text ||= @context.visible_text(@anchor)
  end

  # @return [Hash] anchor identity attributes used to build AnchorFacts
  def anchor_identity_attributes
    {
      anchor:,
      text:,
      url: destination_facts.url,
      destination: destination_facts.destination,
      segments: destination_facts.segments
    }
  end

  # @return [Hash] anchor signal attributes used to build AnchorFacts
  def anchor_signal_attributes
    {
      meaningful_text: meaningful_text?,
      content_like_destination: content_like_destination?,
      heading_anchor: heading_anchor?,
      heading_text_match: heading_text_match?
    }
  end

  # @return [Boolean] true when visible anchor text has at least one alphanumeric run
  def meaningful_text?
    text.scan(/\p{Alnum}+/).any?
  end

  # @return [Boolean] true when the destination route has content signals
  def content_like_destination?
    destination_facts.content_path
  end

  # @return [Boolean] true when the anchor is inside the selected heading
  def heading_anchor?
    heading = @context.heading
    # Return a strict false, not nil, when the container has no heading, so
    # the documented Boolean is what ends up in AnchorFacts.
    return false unless heading

    @anchor.ancestors.include?(heading)
  end

  # @return [Boolean] true when anchor text exactly matches heading text
  def heading_text_match?
    heading_text = @context.heading_text

    meaningful_text? &&
      heading_text.scan(/\p{Alnum}+/).any? &&
      heading_text == text
  end

  private

  # At least one content signal must hold for the anchor to be kept.
  def representative_content_anchor?
    heading_anchor? || meaningful_text? || content_like_destination?
  end

  # Utility-looking text only suppresses the anchor when the destination is
  # not content-like and is either a high-confidence utility destination or
  # a weak, non-heading link.
  def utility_text_suppressed?
    !content_like_destination? &&
      @context.utility_text?(text) &&
      (destination_facts.high_confidence_utility_destination || non_heading_weak_post?)
  end

  # True for anchors outside the heading whose destination lacks a strong post suffix.
  def non_heading_weak_post?
    !heading_anchor? && !destination_facts.strong_post_suffix
  end

  # Hard exclusions: utility destinations, icon-only anchors, and anchors
  # nested in a utility landmark.
  def ineligible_anchor?
    destination_facts.high_confidence_utility_destination ||
      icon_only_anchor? ||
      @context.utility_landmark?(@anchor.ancestors.to_a)
  end

  # Truthy when the anchor has no meaningful text but contains an img or svg.
  def icon_only_anchor?
    !meaningful_text? && @anchor.at_css('img, svg')
  end
end
|
|
184
|
+
|
|
185
|
+
# Turns the signals of an eligible semantic anchor candidate into one rank value.
class Score
  # Points awarded for each candidate predicate that holds.
  RULES = {
    heading_anchor?: 100,
    heading_text_match?: 20,
    meaningful_text?: 10,
    content_like_destination?: 10
  }.freeze

  # @param candidate [Candidate] eligible semantic anchor candidate
  def initialize(candidate)
    @candidate = candidate
  end

  # @return [Integer] total weight of all predicates the candidate satisfies
  def value
    satisfied = RULES.select { |predicate, _weight| @candidate.public_send(predicate) }

    satisfied.values.sum
  end
end
|
|
205
|
+
|
|
206
|
+
# Tracks, per destination, the highest-scoring anchor fact seen so far.
# On equal scores the fact that was added first is kept.
class DestinationWinners
  def initialize
    @winners = {}
  end

  # @return [Array<AnchorFacts>] strongest candidate per destination
  def to_a
    @winners.values
  end

  # Records +facts+ unless an equally strong or stronger fact already holds
  # the same destination.
  #
  # @param facts [AnchorFacts] candidate anchor facts
  # @return [void]
  def add(facts)
    key = facts.destination
    incumbent = @winners[key]
    keep_incumbent = incumbent && incumbent.score >= facts.score

    @winners[key] = keep_incumbent ? incumbent : facts
  end
end
|
|
232
|
+
|
|
233
|
+
# Builds the shared Context once so every anchor of the container reuses it.
#
# @param container [Nokogiri::XML::Node] semantic container
# @param link_heuristics [Html2rss::AutoSource::Scraper::LinkHeuristics] destination/text heuristics
def initialize(container, link_heuristics:)
  @context = Context.new(container, link_heuristics:)
  @container = container
end
|
|
239
|
+
|
|
240
|
+
# Feeds every main anchor of the container through #add_anchor and returns
# the surviving winners.
#
# @return [Array<AnchorFacts>] strongest candidate per destination
def to_a
  winners = DestinationWinners.new

  @container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each do |anchor|
    add_anchor(anchor, winners)
  end

  winners.to_a
end
|
|
246
|
+
|
|
247
|
+
private
|
|
248
|
+
|
|
249
|
+
# Registers the anchor's facts with +winners+, skipping anchors on ignored
# container paths and anchors that yield no facts.
def add_anchor(anchor, winners)
  return if HtmlExtractor.ignored_container_path?(anchor)

  Candidate.new(anchor, @context).facts&.then { |facts| winners.add(facts) }
end
|
|
255
|
+
end
|
|
256
|
+
end
|
|
257
|
+
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
# Gathers semantic content containers from a parsed HTML document, without
# duplicates and sorted by their position in the document traversal.
class SemanticContainers
  # CSS selectors for innermost article/section/li/tr/div elements, i.e.
  # elements that do not contain another element of the same tag.
  SELECTORS = [
    'article:not(:has(article))',
    'section:not(:has(section))',
    'li:not(:has(li))',
    'tr:not(:has(tr))',
    'div:not(:has(div))'
  ].freeze

  # Convenience entry point: builds an instance and runs it.
  #
  # @param parsed_body [Nokogiri::HTML::Document] parsed document
  # @return [Array<Nokogiri::XML::Node>] candidate semantic containers
  def self.call(parsed_body)
    new(parsed_body).call
  end

  # @param parsed_body [Nokogiri::HTML::Document] parsed document
  def initialize(parsed_body)
    @parsed_body = parsed_body
  end

  # @return [Array<Nokogiri::XML::Node>] candidate semantic containers
  def call
    SELECTORS
      .each_with_object([]) { |selector, found| collect_selector_containers(selector, found) }
      .sort_by { |container| document_order.fetch(container) }
  end

  private

  # Maps every element node to the index at which #traverse yields it.
  # NOTE(review): Nokogiri's traverse is believed to yield children before
  # their parent, so nested containers would sort before their ancestors -
  # confirm this ordering is intended.
  def document_order
    @document_order ||= begin
      elements = @parsed_body.enum_for(:traverse).select(&:element?)

      elements.each_with_index.to_h.compare_by_identity
    end
  end

  # Appends the containers matching +selector+ that are neither on an
  # ignored path nor already collected.
  def collect_selector_containers(selector, containers)
    @parsed_body.css(selector).each do |container|
      next if HtmlExtractor.ignored_container_path?(container) || seen[container]

      seen[container] = true
      containers << container
    end
  end

  # Identity-keyed registry of containers that were already collected.
  def seen
    @seen ||= {}.compare_by_identity
  end
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -7,6 +7,8 @@ module Html2rss
|
|
|
7
7
|
class HtmlExtractor
|
|
8
8
|
# Tags ignored when extracting visible text content from article containers.
|
|
9
9
|
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
10
|
+
# Element path pattern ignored when traversing candidate article containers.
|
|
11
|
+
IGNORED_CONTAINER_PATH = /(nav|footer|header|svg|script|style)/i
|
|
10
12
|
# Heading tags used to prioritize title extraction.
|
|
11
13
|
HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
|
|
12
14
|
# Selector used to derive non-headline description nodes.
|
|
@@ -87,6 +89,15 @@ module Html2rss
|
|
|
87
89
|
|
|
88
90
|
article_tag.at_css(MAIN_ANCHOR_SELECTOR)
|
|
89
91
|
end
|
|
92
|
+
|
|
93
|
+
##
# Tests a node's path, or a path given directly, against
# IGNORED_CONTAINER_PATH. Anything without a #path method is stringified.
#
# @param node [Nokogiri::XML::Node, String] node or path to test
# @return [Boolean] true when the path matches the ignored pattern
def ignored_container_path?(node)
  candidate_path = if node.respond_to?(:path)
                     node.path
                   else
                     node.to_s
                   end

  candidate_path.match?(IGNORED_CONTAINER_PATH)
end
|
|
90
101
|
end
|
|
91
102
|
|
|
92
103
|
def extract_url
|
|
@@ -41,13 +41,16 @@ module Html2rss
|
|
|
41
41
|
|
|
42
42
|
# Resolves the feed TTL in minutes. An explicit override wins, then the
# max-age of the cache-control header (seconds, rounded up to whole minutes),
# then DEFAULT_TTL_IN_MINUTES. The result is raised to the globally
# configured minimum TTL when one is set.
#
# @return [Integer] cache time-to-live in minutes
def ttl
  override = overrides[:ttl]

  minutes =
    if override
      override.to_i
    else
      seconds = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
      seconds ? seconds.to_i.fdiv(60).ceil : DEFAULT_TTL_IN_MINUTES
    end

  floor = Html2rss.configuration.min_ttl
  floor ? [minutes, floor].max : minutes
end
|
|
52
55
|
|
|
53
56
|
# @return [String, nil] ISO-like language code when available
|
data/lib/html2rss/url.rb
CHANGED