html2rss 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html/class_clustering.rb +185 -0
- data/lib/html2rss/auto_source/scraper/html.rb +14 -2
- data/lib/html2rss/auto_source/scraper/json_state.rb +1 -1
- data/lib/html2rss/auto_source/scraper/microdata.rb +2 -2
- data/lib/html2rss/auto_source.rb +3 -1
- data/lib/html2rss/cli.rb +61 -10
- data/lib/html2rss/config/class_methods.rb +3 -3
- data/lib/html2rss/config/validator.rb +1 -0
- data/lib/html2rss/config.rb +2 -2
- data/lib/html2rss/html_extractor/heading_extractor.rb +50 -0
- data/lib/html2rss/html_extractor/id_generator.rb +67 -0
- data/lib/html2rss/html_extractor/text_extractor.rb +77 -0
- data/lib/html2rss/html_extractor.rb +50 -52
- data/lib/html2rss/rendering.rb +2 -2
- data/lib/html2rss/request_service/local_file_strategy.rb +29 -0
- data/lib/html2rss/request_service/puppet_commander.rb +4 -0
- data/lib/html2rss/request_service.rb +2 -1
- data/lib/html2rss/selectors/extractors/attribute.rb +1 -1
- data/lib/html2rss/selectors/extractors/href.rb +1 -1
- data/lib/html2rss/selectors/extractors/html.rb +1 -1
- data/lib/html2rss/selectors/extractors/static.rb +1 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -1
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +8 -1
- data/lib/html2rss/selectors.rb +1 -1
- data/lib/html2rss/url.rb +26 -13
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +28 -6
- data/schema/html2rss-config.schema.json +12 -1
- metadata +7 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9ff7cdc4e25f3abc6da000e4f672d6832fb6027e49885aecc0cad38329c5e6ae
|
|
4
|
+
data.tar.gz: e42c216e328bb2c56971dd58871f023f15e672563e27741c56c6cf7fe4cb322a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e52812f947561b9a52537f1b28c530f63e116194642d13ff526fac1ad32f02d7ea6ff8ca9b5ee16e2d7f686e1babec1908ca51399ba42ef9461ed3dbe0d02117
|
|
7
|
+
data.tar.gz: 4754495a5947aca6de71846d1c88128d9fc1826e1014fd00806f58e7cd1e1575dc3bd2380ced3aeecd0cd6d51e0366ddee7a1f7db0220750e13f66645de2edde
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class Html
|
|
7
|
+
##
|
|
8
|
+
# ClassClustering clusters DOM elements on anchorless pages by class lists and scores
|
|
9
|
+
# candidate groups to find the best list of content cards/articles.
|
|
10
|
+
# rubocop:disable Metrics/ClassLength
|
|
11
|
+
class ClassClustering
|
|
12
|
+
# Node tags considered layout containers
|
|
13
|
+
LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
|
|
14
|
+
# HTML/layout tags excluded from candidate nodes
|
|
15
|
+
EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
|
|
16
|
+
|
|
17
|
+
class << self
|
|
18
|
+
##
|
|
19
|
+
# Clusters elements in parsed_body and returns the best set of content card nodes.
|
|
20
|
+
#
|
|
21
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
22
|
+
# @param minimum_selector_frequency [Integer] minimum frequency for class groups
|
|
23
|
+
# @return [Array<Nokogiri::XML::Node>] candidate nodes of the top-scoring class group
|
|
24
|
+
def call(parsed_body, minimum_selector_frequency:)
|
|
25
|
+
new(parsed_body, minimum_selector_frequency:).call
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# @param parsed_body [Nokogiri::HTML::Document]
|
|
30
|
+
# @param minimum_selector_frequency [Integer]
|
|
31
|
+
def initialize(parsed_body, minimum_selector_frequency:)
|
|
32
|
+
@parsed_body = parsed_body
|
|
33
|
+
@minimum_frequency = minimum_selector_frequency
|
|
34
|
+
@text_words = {}.compare_by_identity
|
|
35
|
+
@has_date = {}.compare_by_identity
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# @return [Array<Nokogiri::XML::Node>]
|
|
39
|
+
def call
|
|
40
|
+
candidate_groups = collect_candidate_groups
|
|
41
|
+
return [] if candidate_groups.empty?
|
|
42
|
+
|
|
43
|
+
non_containers = filter_containers(candidate_groups)
|
|
44
|
+
final_groups = filter_1_to_1_overlap(non_containers)
|
|
45
|
+
|
|
46
|
+
select_best_group(final_groups)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
def collect_candidate_groups
|
|
52
|
+
class_groups = Hash.new { |h, k| h[k] = [] }
|
|
53
|
+
cache = {}.compare_by_identity
|
|
54
|
+
|
|
55
|
+
@parsed_body.css('[class]').each { |node| add_node_to_groups(node, class_groups, cache) }
|
|
56
|
+
|
|
57
|
+
class_groups.select { |_, nodes| nodes.size >= @minimum_frequency }
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def add_node_to_groups(node, class_groups, cache)
|
|
61
|
+
return if EXCLUDED_TAGS.include?(node.name) || HtmlExtractor.ignored_container_path?(node, cache)
|
|
62
|
+
|
|
63
|
+
cls = normalize_class(node['class'])
|
|
64
|
+
class_groups[cls] << node unless cls.empty?
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def normalize_class(class_attr)
|
|
68
|
+
class_str = class_attr.to_s.strip
|
|
69
|
+
return '' if class_str.empty?
|
|
70
|
+
|
|
71
|
+
# Bypass split/sort/join allocation for single-class lists
|
|
72
|
+
if class_str.include?(' ')
|
|
73
|
+
class_str.split(/\s+/).sort.join(' ')
|
|
74
|
+
else
|
|
75
|
+
class_str
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Discard group A if any node of A contains > 1 node of another group B
|
|
80
|
+
def filter_containers(groups)
|
|
81
|
+
groups.reject do |cls_a, nodes_a|
|
|
82
|
+
groups.any? { |cls_b, nodes_b| cls_a != cls_b && container_of?(nodes_a, nodes_b) }
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def container_of?(nodes_a, nodes_b)
|
|
87
|
+
return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
|
|
88
|
+
|
|
89
|
+
nodes_a.any? do |node_a|
|
|
90
|
+
nodes_b.count { |node_b| node_a != node_b && node_b.ancestors.include?(node_a) } > 1
|
|
91
|
+
end
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# If group A contains group B, and they have the same size:
|
|
95
|
+
# - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
|
|
96
|
+
# B is the actual content card. Discard A.
|
|
97
|
+
# - Otherwise, B is a sub-element (header, metadata line, button). Discard B.
|
|
98
|
+
def filter_1_to_1_overlap(groups)
|
|
99
|
+
discarded = Set.new
|
|
100
|
+
groups.each_key do |cls_a|
|
|
101
|
+
groups.each_key do |cls_b|
|
|
102
|
+
next if cls_a == cls_b || discarded.include?(cls_a) || discarded.include?(cls_b)
|
|
103
|
+
|
|
104
|
+
resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
groups.except(*discarded)
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
|
|
112
|
+
nodes_a = groups[cls_a]
|
|
113
|
+
nodes_b = groups[cls_b]
|
|
114
|
+
return if nodes_a.size != nodes_b.size
|
|
115
|
+
return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && b.ancestors.include?(a) }
|
|
116
|
+
|
|
117
|
+
discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
|
|
118
|
+
end
|
|
119
|
+
|
|
120
|
+
def keep_descendant?(nodes_a, nodes_b)
|
|
121
|
+
avg_words(nodes_b) >= 0.8 * avg_words(nodes_a) &&
|
|
122
|
+
LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
def select_best_group(groups)
|
|
126
|
+
best_nodes = []
|
|
127
|
+
best_score = -1
|
|
128
|
+
|
|
129
|
+
groups.each_value do |nodes|
|
|
130
|
+
score = score_group(nodes)
|
|
131
|
+
next if score.negative?
|
|
132
|
+
|
|
133
|
+
(best_nodes = nodes) && (best_score = score) if score > best_score
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
best_nodes
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def score_group(nodes)
|
|
140
|
+
avg_w = avg_words(nodes)
|
|
141
|
+
return -1 if avg_w < 5
|
|
142
|
+
|
|
143
|
+
score = nodes.size + (avg_w / 5.0)
|
|
144
|
+
score += 20 if nodes_heading?(nodes)
|
|
145
|
+
score += 20 if nodes_time?(nodes)
|
|
146
|
+
score += 40 if nodes_date?(nodes)
|
|
147
|
+
score
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
def nodes_heading?(nodes)
|
|
151
|
+
nodes.any? do |n|
|
|
152
|
+
n.at_css(HtmlExtractor::HEADING_TAGS.join(',')) ||
|
|
153
|
+
n.at_css('.font-bold, .font-semibold')
|
|
154
|
+
end
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def nodes_time?(nodes)
|
|
158
|
+
nodes.any? { |n| n.at_css('time, [datetime]') }
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
def nodes_date?(nodes)
|
|
162
|
+
nodes.any? { |n| date?(n) }
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def avg_words(nodes)
|
|
166
|
+
nodes.sum { |n| text_words(n) } / nodes.size.to_f
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
def text_words(node)
|
|
170
|
+
@text_words[node] ||= HtmlExtractor.extract_visible_text(node).to_s.scan(/\p{Alnum}+/).size
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def date?(node)
|
|
174
|
+
@has_date[node] ||= begin
|
|
175
|
+
text = HtmlExtractor.extract_visible_text(node).to_s
|
|
176
|
+
text.match?(%r{\b\d{4}[-/]\d{2}[-/]\d{2}\b}) ||
|
|
177
|
+
text.match?(/\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b/i)
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
# rubocop:enable Metrics/ClassLength
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
@@ -62,6 +62,7 @@ module Html2rss
|
|
|
62
62
|
@url = url
|
|
63
63
|
@extractor = extractor
|
|
64
64
|
@opts = opts
|
|
65
|
+
@fallback_anchorless = opts.fetch(:fallback_anchorless, false)
|
|
65
66
|
@link_heuristics = LinkHeuristics.new(url)
|
|
66
67
|
@ignored_cache = {}.compare_by_identity
|
|
67
68
|
end
|
|
@@ -105,8 +106,19 @@ module Html2rss
|
|
|
105
106
|
private
|
|
106
107
|
|
|
107
108
|
def articles
|
|
108
|
-
@articles ||=
|
|
109
|
-
|
|
109
|
+
@articles ||= begin
|
|
110
|
+
extracted = each_article_tag.filter_map do |article_tag, selected_anchor|
|
|
111
|
+
extract_article(article_tag, selected_anchor:)
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
extracted += find_anchorless_articles if @fallback_anchorless
|
|
115
|
+
extracted
|
|
116
|
+
end
|
|
117
|
+
end
|
|
118
|
+
|
|
119
|
+
def find_anchorless_articles
|
|
120
|
+
ClassClustering.call(parsed_body, minimum_selector_frequency:).map do |node|
|
|
121
|
+
@extractor.new(node, base_url: @url, selected_anchor: nil, fallback_anchorless: true).call
|
|
110
122
|
end
|
|
111
123
|
end
|
|
112
124
|
|
|
@@ -305,7 +305,7 @@ module Html2rss
|
|
|
305
305
|
# rubocop:disable Metrics/MethodLength
|
|
306
306
|
# @param entry [Hash] raw article entry candidate
|
|
307
307
|
# @param base_url [String, Html2rss::Url] base URL for relative link resolution
|
|
308
|
-
# @return [Hash{Symbol => Object
|
|
308
|
+
# @return [Hash{Symbol => Object, nil}] normalized article hash for downstream extraction
|
|
309
309
|
def normalise(entry, base_url:)
|
|
310
310
|
return unless entry.is_a?(Hash)
|
|
311
311
|
|
|
@@ -92,7 +92,7 @@ module Html2rss
|
|
|
92
92
|
attr_reader :parsed_body, :url
|
|
93
93
|
|
|
94
94
|
# @param root [Nokogiri::XML::Element] supported Microdata root node
|
|
95
|
-
# @return [Hash{Symbol => Object
|
|
95
|
+
# @return [Hash{Symbol => Object, nil}] normalized article hash
|
|
96
96
|
def article_from(root)
|
|
97
97
|
schema_object = SchemaObjectBuilder.call(root)
|
|
98
98
|
return unless schema_object
|
|
@@ -378,7 +378,7 @@ module Html2rss
|
|
|
378
378
|
extend ValueNormalizer
|
|
379
379
|
|
|
380
380
|
# @param root [Nokogiri::XML::Element] supported microdata root node
|
|
381
|
-
# @return [Hash{Symbol => Object
|
|
381
|
+
# @return [Hash{Symbol => Object, nil}] compact schema-like object
|
|
382
382
|
def call(root)
|
|
383
383
|
type = Microdata.supported_type_name(root)
|
|
384
384
|
return unless type
|
data/lib/html2rss/auto_source.rb
CHANGED
|
@@ -37,7 +37,8 @@ module Html2rss
|
|
|
37
37
|
html: {
|
|
38
38
|
enabled: true,
|
|
39
39
|
minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
|
|
40
|
-
use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
|
|
40
|
+
use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS,
|
|
41
|
+
fallback_anchorless: true
|
|
41
42
|
}
|
|
42
43
|
},
|
|
43
44
|
cleanup: Cleanup::DEFAULT_CONFIG
|
|
@@ -63,6 +64,7 @@ module Html2rss
|
|
|
63
64
|
optional(:enabled).filled(:bool)
|
|
64
65
|
optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
|
|
65
66
|
optional(:use_top_selectors).filled(:integer, gt?: 0)
|
|
67
|
+
optional(:fallback_anchorless).filled(:bool)
|
|
66
68
|
end
|
|
67
69
|
end.freeze
|
|
68
70
|
private_constant :SCRAPER_CONFIG
|
data/lib/html2rss/cli.rb
CHANGED
|
@@ -48,6 +48,9 @@ module Html2rss
|
|
|
48
48
|
method_option :max_requests,
|
|
49
49
|
type: :numeric,
|
|
50
50
|
desc: 'Maximum requests to allow for this feed build'
|
|
51
|
+
method_option :input,
|
|
52
|
+
type: :string,
|
|
53
|
+
desc: 'Local HTML file path to read input from'
|
|
51
54
|
# @param yaml_file [String] path to YAML config
|
|
52
55
|
# @param feed_name [String, nil] optional named feed in multi-feed config
|
|
53
56
|
# @return [void]
|
|
@@ -55,6 +58,7 @@ module Html2rss
|
|
|
55
58
|
config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
|
|
56
59
|
config[:params] = options[:params] || {}
|
|
57
60
|
apply_runtime_request_overrides!(config)
|
|
61
|
+
apply_local_file_input!(config, options[:input]) if options[:input]
|
|
58
62
|
|
|
59
63
|
puts(execute_feed { Html2rss.feed(config) })
|
|
60
64
|
end
|
|
@@ -76,20 +80,17 @@ module Html2rss
|
|
|
76
80
|
method_option :max_requests,
|
|
77
81
|
type: :numeric,
|
|
78
82
|
desc: 'Maximum requests to allow for this feed build'
|
|
79
|
-
|
|
83
|
+
method_option :input,
|
|
84
|
+
type: :string,
|
|
85
|
+
desc: 'Local HTML file path to read input from'
|
|
86
|
+
# @param url [String, nil] source page URL for auto discovery
|
|
80
87
|
# @return [void]
|
|
81
|
-
def auto(url
|
|
88
|
+
def auto(url = nil)
|
|
82
89
|
format = options.fetch(:format, 'rss')
|
|
83
|
-
|
|
90
|
+
strategy, local_file_path, url = prepare_auto_inputs(url, options[:input])
|
|
84
91
|
|
|
85
92
|
result = execute_feed do
|
|
86
|
-
|
|
87
|
-
url,
|
|
88
|
-
strategy: current_strategy,
|
|
89
|
-
items_selector: options[:items_selector],
|
|
90
|
-
max_redirects: options[:max_redirects],
|
|
91
|
-
max_requests: options[:max_requests]
|
|
92
|
-
)
|
|
93
|
+
source_call(url, strategy, local_file_path, format == 'jsonfeed')
|
|
93
94
|
end
|
|
94
95
|
|
|
95
96
|
puts(format == 'jsonfeed' ? JSON.pretty_generate(result) : result)
|
|
@@ -159,6 +160,33 @@ module Html2rss
|
|
|
159
160
|
config.delete(:request) if request_config.empty?
|
|
160
161
|
end
|
|
161
162
|
|
|
163
|
+
def apply_local_file_input!(config, input_path)
|
|
164
|
+
file_path = check_file_exists!(input_path)
|
|
165
|
+
config[:strategy] = :local_file
|
|
166
|
+
config[:request] = (config[:request] || {}).merge(local_file_path: file_path)
|
|
167
|
+
|
|
168
|
+
return unless config.dig(:channel, :url).to_s.empty?
|
|
169
|
+
|
|
170
|
+
config[:channel] = (config[:channel] || {}).merge(
|
|
171
|
+
url: detect_base_url!(file_path, 'Please specify a channel.url in the config.')
|
|
172
|
+
)
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
def prepare_auto_inputs(url, input_option)
|
|
176
|
+
if input_option.nil?
|
|
177
|
+
raise Thor::Error, 'A URL is required unless --input is specified' unless url
|
|
178
|
+
|
|
179
|
+
return [current_strategy, nil, url]
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
file_path = check_file_exists!(input_option)
|
|
183
|
+
detected_url = url || detect_base_url!(
|
|
184
|
+
file_path, 'Please specify a URL: html2rss auto [URL] --input <file>'
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
[:local_file, file_path, detected_url]
|
|
188
|
+
end
|
|
189
|
+
|
|
162
190
|
def request_controls
|
|
163
191
|
Html2rss::RequestControls.new(
|
|
164
192
|
strategy: options[:strategy]&.to_sym,
|
|
@@ -213,5 +241,28 @@ module Html2rss
|
|
|
213
241
|
Html2rss::NoFeedItemsExtracted => error
|
|
214
242
|
raise Thor::Error, error.message
|
|
215
243
|
end
|
|
244
|
+
|
|
245
|
+
def source_call(url, strategy, local_file_path, is_json)
|
|
246
|
+
method = is_json ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
|
|
247
|
+
method.call(
|
|
248
|
+
url,
|
|
249
|
+
strategy:,
|
|
250
|
+
items_selector: options[:items_selector],
|
|
251
|
+
max_redirects: options[:max_redirects],
|
|
252
|
+
max_requests: options[:max_requests],
|
|
253
|
+
local_file_path:
|
|
254
|
+
)
|
|
255
|
+
end
|
|
256
|
+
|
|
257
|
+
def check_file_exists!(path)
|
|
258
|
+
File.expand_path(path).tap do |file_path|
|
|
259
|
+
raise Thor::Error, "Input file does not exist: #{path}" unless File.exist?(file_path)
|
|
260
|
+
end
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
def detect_base_url!(file_path, error_hint)
|
|
264
|
+
Html2rss::Url.extract_from_html(File.read(file_path))&.to_s ||
|
|
265
|
+
raise(Thor::Error, "Could not auto-detect a base URL from HTML metadata. #{error_hint}")
|
|
266
|
+
end
|
|
216
267
|
end
|
|
217
268
|
end
|
|
@@ -29,7 +29,7 @@ module Html2rss
|
|
|
29
29
|
# Validates a configuration hash with the runtime validator.
|
|
30
30
|
#
|
|
31
31
|
# @param config [Hash{Symbol => Object}] the configuration hash
|
|
32
|
-
# @param params [Hash{Symbol => Object
|
|
32
|
+
# @param params [Hash{Symbol => Object, Hash{String => Object, nil}}] dynamic parameters for string formatting
|
|
33
33
|
# @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
|
|
34
34
|
def validate(config, params: UNSET)
|
|
35
35
|
prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
|
|
@@ -56,7 +56,7 @@ module Html2rss
|
|
|
56
56
|
# @param file [String] the YAML file to load
|
|
57
57
|
# @param feed_name [String, nil] optional feed name for multi-feed files
|
|
58
58
|
# @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
|
|
59
|
-
# @param params [Hash{Symbol => Object
|
|
59
|
+
# @param params [Hash{Symbol => Object, Hash{String => Object, nil}}] dynamic parameters for string formatting
|
|
60
60
|
# @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
|
|
61
61
|
def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
|
|
62
62
|
validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
|
|
@@ -99,7 +99,7 @@ module Html2rss
|
|
|
99
99
|
# and returns a new configuration object.
|
|
100
100
|
#
|
|
101
101
|
# @param config [Hash{Symbol => Object}] the configuration hash.
|
|
102
|
-
# @param params [Hash{Symbol => Object
|
|
102
|
+
# @param params [Hash{Symbol => Object, Hash{String => Object, nil}}] dynamic parameters for string formatting.
|
|
103
103
|
# @return [Html2rss::Config] the configuration object.
|
|
104
104
|
def from_hash(config, params: UNSET)
|
|
105
105
|
new(resolve_effective_config(config, params:))
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -69,9 +69,9 @@ module Html2rss
|
|
|
69
69
|
# @return [Hash{Symbol => Object}] request envelope configuration
|
|
70
70
|
def request = config[:request]
|
|
71
71
|
|
|
72
|
-
# @return [Hash{Symbol => Object
|
|
72
|
+
# @return [Hash{Symbol => Object, nil}] selectors configuration
|
|
73
73
|
def selectors = config[:selectors]
|
|
74
|
-
# @return [Hash{Symbol => Object
|
|
74
|
+
# @return [Hash{Symbol => Object, nil}] auto-source configuration
|
|
75
75
|
def auto_source = config[:auto_source]
|
|
76
76
|
|
|
77
77
|
private
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
|
|
6
|
+
# HeadingExtractor identifies and returns the best heading element within a container.
|
|
7
|
+
class HeadingExtractor
|
|
8
|
+
# Heading tags used to prioritize title extraction.
|
|
9
|
+
HEADING_TAGS = HtmlExtractor::HEADING_TAGS
|
|
10
|
+
|
|
11
|
+
class << self
|
|
12
|
+
##
|
|
13
|
+
# @param article_tag [Nokogiri::XML::Element] container node
|
|
14
|
+
# @param fallback_anchorless [Boolean] whether to use fallback search
|
|
15
|
+
# @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
|
|
16
|
+
# @return [Nokogiri::XML::Node, nil] the heading node, if found
|
|
17
|
+
def call(article_tag, fallback_anchorless:, selected_anchor:)
|
|
18
|
+
tags = article_tag.css(HEADING_TAGS.join(','))
|
|
19
|
+
if tags.any?
|
|
20
|
+
select_best_heading(tags)
|
|
21
|
+
elsif fallback_anchorless && selected_anchor.nil?
|
|
22
|
+
fallback_heading(article_tag)
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
private
|
|
27
|
+
|
|
28
|
+
def select_best_heading(tags)
|
|
29
|
+
min_tag_name = tags.map(&:name).min
|
|
30
|
+
best_tag = nil
|
|
31
|
+
max_size = -1
|
|
32
|
+
|
|
33
|
+
tags.each do |tag|
|
|
34
|
+
next if tag.name != min_tag_name
|
|
35
|
+
|
|
36
|
+
size = TextExtractor.call(tag)&.size.to_i
|
|
37
|
+
(best_tag = tag) && (max_size = size) if size > max_size
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
best_tag
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def fallback_heading(article_tag)
|
|
44
|
+
fallback_tags = article_tag.css('strong, b, [class*="title"], [class*="font-bold"], [class*="font-semibold"]')
|
|
45
|
+
fallback_tags.find { |t| !TextExtractor.call(t).to_s.strip.empty? }
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'zlib'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class HtmlExtractor
|
|
7
|
+
##
|
|
8
|
+
# IdGenerator determines the unique ID for an article container node.
|
|
9
|
+
class IdGenerator
|
|
10
|
+
class << self
|
|
11
|
+
##
|
|
12
|
+
# @param article_tag [Nokogiri::XML::Element] container node
|
|
13
|
+
# @param heading [Nokogiri::XML::Node, nil] heading node
|
|
14
|
+
# @param url [Html2rss::Url, nil] absolute article URL
|
|
15
|
+
# @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
|
|
16
|
+
# @param fallback_anchorless [Boolean] whether to use fallback hashing
|
|
17
|
+
# @return [String, nil] the generated ID, if any
|
|
18
|
+
def call(article_tag, heading:, url:, selected_anchor:, fallback_anchorless:)
|
|
19
|
+
id_from_dom = parse_id_from_dom(article_tag, url, selected_anchor)
|
|
20
|
+
return id_from_dom if id_from_dom
|
|
21
|
+
|
|
22
|
+
heading_text = resolve_heading_text(article_tag, heading, fallback_anchorless)
|
|
23
|
+
if heading_text && !heading_text.strip.empty?
|
|
24
|
+
generate_slug(heading_text)
|
|
25
|
+
elsif fallback_anchorless
|
|
26
|
+
generate_content_hash(article_tag)
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def parse_id_from_dom(article_tag, url, selected_anchor)
|
|
33
|
+
candidates = [article_tag['id'], article_tag.at_css('[id]')&.attr('id')]
|
|
34
|
+
candidates += [url&.path, url&.query] if selected_anchor
|
|
35
|
+
candidates.compact.reject(&:empty?).first
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def resolve_heading_text(article_tag, heading, fallback_anchorless)
|
|
39
|
+
text = heading ? TextExtractor.call(heading) : nil
|
|
40
|
+
if text.nil? || text.strip.empty?
|
|
41
|
+
fallback_text_node_content(article_tag, fallback_anchorless)
|
|
42
|
+
else
|
|
43
|
+
text
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def fallback_text_node_content(article_tag, fallback_anchorless)
|
|
48
|
+
return unless fallback_anchorless
|
|
49
|
+
|
|
50
|
+
article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }&.text&.strip
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def generate_slug(text)
|
|
54
|
+
slug = text.downcase.gsub(/[^a-z0-9]+/, '-')
|
|
55
|
+
slug = slug[1..] if slug.start_with?('-')
|
|
56
|
+
slug = slug[0..-2] if slug.end_with?('-')
|
|
57
|
+
slug unless slug.empty?
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def generate_content_hash(article_tag)
|
|
61
|
+
text = TextExtractor.call(article_tag).to_s.strip
|
|
62
|
+
Zlib.crc32(text).to_s(36) unless text.empty?
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class HtmlExtractor
|
|
5
|
+
##
|
|
6
|
+
# TextExtractor extracts visible text from DOM elements, preserving lists
|
|
7
|
+
# and block spacing while sanitizing white spaces.
|
|
8
|
+
class TextExtractor
|
|
9
|
+
# HTML block elements that trigger line breaks or special formatting.
|
|
10
|
+
BLOCK_TAGS = %w[p div li ul ol h1 h2 h3 h4 h5 h6 tr br].to_set.freeze
|
|
11
|
+
# Tags ignored when extracting visible text content.
|
|
12
|
+
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
13
|
+
|
|
14
|
+
class << self
|
|
15
|
+
##
|
|
16
|
+
# @param tag [Nokogiri::XML::Node] the node from which to extract visible text
|
|
17
|
+
# @param separator [String] separator used to join text fragments (default is a space)
|
|
18
|
+
# @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
|
|
19
|
+
# @return [String, nil] the concatenated visible text, or nil if none is found
|
|
20
|
+
def call(tag, separator: ' ', exclude_nodes: nil)
|
|
21
|
+
return tag.text.gsub(/\s+/, ' ').strip if tag.respond_to?(:text?) && tag.text?
|
|
22
|
+
|
|
23
|
+
parts = iterate_children(tag, separator, exclude_nodes)
|
|
24
|
+
return if parts.empty?
|
|
25
|
+
|
|
26
|
+
parts.join.squeeze(' ').strip
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
private
|
|
30
|
+
|
|
31
|
+
def iterate_children(tag, separator, exclude_nodes)
|
|
32
|
+
last = false
|
|
33
|
+
tag.children.each_with_object([]) do |c, p|
|
|
34
|
+
next if exclude_nodes&.include?(c) || !visible_child?(c)
|
|
35
|
+
|
|
36
|
+
text, block = process_child_node(c, separator, exclude_nodes)
|
|
37
|
+
next if text.empty?
|
|
38
|
+
|
|
39
|
+
append_separator!(p, separator, block, last)
|
|
40
|
+
(p << text) && (last = block)
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def process_child_node(child, separator, exclude_nodes)
|
|
45
|
+
child_text = get_child_text(child, separator, exclude_nodes)
|
|
46
|
+
return ['', false] if child_text.empty?
|
|
47
|
+
|
|
48
|
+
child_text = "- #{child_text}" if child.name == 'li'
|
|
49
|
+
[child_text, BLOCK_TAGS.include?(child.name)]
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
def get_child_text(child, separator, exclude_nodes)
|
|
53
|
+
if child.children.empty?
|
|
54
|
+
child.text.to_s.gsub(/\s+/, ' ').strip
|
|
55
|
+
else
|
|
56
|
+
call(child, separator:, exclude_nodes:).to_s.strip
|
|
57
|
+
end
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
def append_separator!(parts, separator, is_block, last_was_block)
|
|
61
|
+
return if parts.empty?
|
|
62
|
+
|
|
63
|
+
parts << if is_block || last_was_block
|
|
64
|
+
(separator == ' ' ? "\n" : separator)
|
|
65
|
+
else
|
|
66
|
+
' '
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def visible_child?(node)
|
|
71
|
+
!INVISIBLE_CONTENT_TAGS.include?(node.name) &&
|
|
72
|
+
!(node.name == 'a' && node['href']&.start_with?('#'))
|
|
73
|
+
end
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
@@ -4,13 +4,11 @@ module Html2rss
|
|
|
4
4
|
##
|
|
5
5
|
# HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
|
|
6
6
|
# from an article_tag.
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
7
|
+
# rubocop:disable Metrics/ClassLength
|
|
8
|
+
class HtmlExtractor
|
|
10
9
|
# Heading tags used to prioritize title extraction.
|
|
11
10
|
HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
|
|
12
|
-
|
|
13
|
-
NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
|
|
11
|
+
|
|
14
12
|
# Element tags that indicate ignored DOM chrome when found in a container path.
|
|
15
13
|
IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
|
|
16
14
|
|
|
@@ -26,20 +24,14 @@ module Html2rss
|
|
|
26
24
|
class << self
|
|
27
25
|
##
|
|
28
26
|
# Extracts visible text from a given node and its children.
|
|
27
|
+
# Delegates to TextExtractor.
|
|
29
28
|
#
|
|
30
29
|
# @param tag [Nokogiri::XML::Node] the node from which to extract visible text
|
|
31
30
|
# @param separator [String] separator used to join text fragments (default is a space)
|
|
31
|
+
# @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
|
|
32
32
|
# @return [String, nil] the concatenated visible text, or nil if none is found
|
|
33
|
-
def extract_visible_text(tag, separator: ' ')
|
|
34
|
-
|
|
35
|
-
next unless visible_child?(child)
|
|
36
|
-
|
|
37
|
-
raw_text = child.children.empty? ? child.text : extract_visible_text(child)
|
|
38
|
-
text = raw_text&.strip
|
|
39
|
-
text unless text.to_s.empty?
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
parts.join(separator).squeeze(' ').strip unless parts.empty?
|
|
33
|
+
def extract_visible_text(tag, separator: ' ', exclude_nodes: nil)
|
|
34
|
+
TextExtractor.call(tag, separator:, exclude_nodes:)
|
|
43
35
|
end
|
|
44
36
|
|
|
45
37
|
##
|
|
@@ -74,23 +66,20 @@ module Html2rss
|
|
|
74
66
|
end
|
|
75
67
|
false
|
|
76
68
|
end
|
|
77
|
-
|
|
78
|
-
def visible_child?(node)
|
|
79
|
-
!INVISIBLE_CONTENT_TAGS.include?(node.name) &&
|
|
80
|
-
!(node.name == 'a' && node['href']&.start_with?('#'))
|
|
81
|
-
end
|
|
82
69
|
end
|
|
83
70
|
|
|
84
71
|
##
|
|
85
72
|
# @param article_tag [Nokogiri::XML::Node] article-like container to extract from
|
|
86
73
|
# @param base_url [String, Html2rss::Url] base url used to resolve relative links
|
|
87
74
|
# @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
|
|
88
|
-
|
|
75
|
+
# @param fallback_anchorless [Boolean] whether to fall back to anchorless extraction
|
|
76
|
+
def initialize(article_tag, base_url:, selected_anchor:, fallback_anchorless: false)
|
|
89
77
|
raise ArgumentError, 'article_tag is required' unless article_tag
|
|
90
78
|
|
|
91
79
|
@article_tag = article_tag
|
|
92
80
|
@base_url = base_url
|
|
93
81
|
@selected_anchor = selected_anchor
|
|
82
|
+
@fallback_anchorless = fallback_anchorless
|
|
94
83
|
end
|
|
95
84
|
|
|
96
85
|
# @return [Hash{Symbol => Object}] extracted article attributes
|
|
@@ -115,54 +104,62 @@ module Html2rss
|
|
|
115
104
|
@extract_url ||= begin
|
|
116
105
|
href = selected_anchor&.[]('href').to_s
|
|
117
106
|
|
|
118
|
-
|
|
107
|
+
if href.empty?
|
|
108
|
+
anchorless_url_fallback
|
|
109
|
+
else
|
|
110
|
+
Url.from_relative(href.split('#').first.strip, base_url)
|
|
111
|
+
end
|
|
119
112
|
end
|
|
120
113
|
end
|
|
121
114
|
|
|
122
|
-
def
|
|
123
|
-
|
|
124
|
-
|
|
115
|
+
def anchorless_url_fallback
|
|
116
|
+
return unless @fallback_anchorless
|
|
117
|
+
|
|
118
|
+
id = generate_id
|
|
119
|
+
Url.from_relative("##{id}", base_url) if id
|
|
125
120
|
end
|
|
126
121
|
|
|
127
|
-
def
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
122
|
+
def extract_title
|
|
123
|
+
title_source = heading || selected_anchor
|
|
124
|
+
if title_source
|
|
125
|
+
self.class.extract_visible_text(title_source)
|
|
126
|
+
else
|
|
127
|
+
fallback_anchorless_title
|
|
131
128
|
end
|
|
132
129
|
end
|
|
133
130
|
|
|
134
|
-
def
|
|
135
|
-
|
|
136
|
-
best_tag = nil
|
|
137
|
-
max_size = -1
|
|
138
|
-
|
|
139
|
-
tags.each do |tag|
|
|
140
|
-
next if tag.name != min_tag_name
|
|
131
|
+
def fallback_anchorless_title
|
|
132
|
+
return unless @fallback_anchorless && selected_anchor.nil?
|
|
141
133
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
134
|
+
text_node = article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }
|
|
135
|
+
text_node&.text&.strip
|
|
136
|
+
end
|
|
145
137
|
|
|
146
|
-
|
|
138
|
+
def heading
|
|
139
|
+
@heading ||= HeadingExtractor.call(
|
|
140
|
+
article_tag,
|
|
141
|
+
fallback_anchorless: @fallback_anchorless,
|
|
142
|
+
selected_anchor:
|
|
143
|
+
)
|
|
147
144
|
end
|
|
148
145
|
|
|
149
146
|
def extract_description
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
description = self.class.extract_visible_text(article_tag)
|
|
154
|
-
return nil if description.nil? || description.strip.empty?
|
|
147
|
+
exclude = [heading, selected_anchor].compact.to_set
|
|
148
|
+
description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
|
|
149
|
+
return if description.nil?
|
|
155
150
|
|
|
156
|
-
description.strip
|
|
151
|
+
desc = description.strip
|
|
152
|
+
desc.empty? ? nil : desc
|
|
157
153
|
end
|
|
158
154
|
|
|
159
155
|
def generate_id
|
|
160
|
-
|
|
161
|
-
article_tag
|
|
162
|
-
|
|
163
|
-
extract_url
|
|
164
|
-
|
|
165
|
-
|
|
156
|
+
@generate_id ||= IdGenerator.call(
|
|
157
|
+
article_tag,
|
|
158
|
+
heading:,
|
|
159
|
+
url: (selected_anchor ? extract_url : nil),
|
|
160
|
+
selected_anchor:,
|
|
161
|
+
fallback_anchorless: @fallback_anchorless
|
|
162
|
+
)
|
|
166
163
|
end
|
|
167
164
|
|
|
168
165
|
def extract_image = ImageExtractor.call(article_tag, base_url:)
|
|
@@ -170,4 +167,5 @@ module Html2rss
|
|
|
170
167
|
def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
|
|
171
168
|
def extract_categories = CategoryExtractor.call(article_tag)
|
|
172
169
|
end
|
|
170
|
+
# rubocop:enable Metrics/ClassLength
|
|
173
171
|
end
|
data/lib/html2rss/rendering.rb
CHANGED
|
@@ -4,6 +4,8 @@ module Html2rss
|
|
|
4
4
|
# Namespace for HTML rendering logic, used to generate rich content such as
|
|
5
5
|
# images, audio, video, or embedded documents for feed descriptions.
|
|
6
6
|
#
|
|
7
|
+
# @see Html2rss::Rendering::DescriptionBuilder
|
|
8
|
+
#
|
|
7
9
|
# @example
|
|
8
10
|
# Html2rss::Rendering::ImageRenderer.new(
|
|
9
11
|
# url: "https://example.com/image.jpg",
|
|
@@ -16,8 +18,6 @@ module Html2rss
|
|
|
16
18
|
# image: "https://example.com/image.jpg",
|
|
17
19
|
# title: "Example"
|
|
18
20
|
# )
|
|
19
|
-
#
|
|
20
|
-
# @see Html2rss::Rendering::DescriptionBuilder
|
|
21
21
|
module Rendering
|
|
22
22
|
end
|
|
23
23
|
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestService
|
|
5
|
+
##
|
|
6
|
+
# Strategy to read a local HTML file.
|
|
7
|
+
class LocalFileStrategy < Strategy
|
|
8
|
+
##
|
|
9
|
+
# Executes the local file read.
|
|
10
|
+
#
|
|
11
|
+
# @return [Response] the mock response wrapped around the file contents
|
|
12
|
+
# @raise [ArgumentError] if the local file path is missing
|
|
13
|
+
# @raise [Errno::ENOENT] if the file does not exist
|
|
14
|
+
def execute
|
|
15
|
+
file_path = ctx.request[:local_file_path]
|
|
16
|
+
raise ArgumentError, 'Local file path is required for local_file strategy' unless file_path
|
|
17
|
+
raise Errno::ENOENT, "File not found: #{file_path}" unless File.exist?(file_path)
|
|
18
|
+
|
|
19
|
+
body = File.read(file_path)
|
|
20
|
+
Response.new(
|
|
21
|
+
body:,
|
|
22
|
+
headers: { 'content-type' => 'text/html; charset=utf-8' },
|
|
23
|
+
url: ctx.url,
|
|
24
|
+
status: 200
|
|
25
|
+
)
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -97,6 +97,10 @@ module Html2rss
|
|
|
97
97
|
|
|
98
98
|
attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
|
|
99
99
|
|
|
100
|
+
##
|
|
101
|
+
# Re-raises a deferred navigation error when one was captured.
|
|
102
|
+
#
|
|
103
|
+
# @raise [Html2rss::Error] when a navigation request or response validation failed
|
|
100
104
|
def raise_navigation_error_if_any
|
|
101
105
|
raise @navigation_error if @navigation_error
|
|
102
106
|
end
|
|
@@ -25,7 +25,7 @@ module Html2rss
|
|
|
25
25
|
# during post processing with {PostProcessors::ParseTime}.
|
|
26
26
|
class Attribute
|
|
27
27
|
# The available options for the attribute extractor.
|
|
28
|
-
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
|
|
28
|
+
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
|
|
29
29
|
|
|
30
30
|
##
|
|
31
31
|
# Initializes the Attribute extractor.
|
|
@@ -25,7 +25,7 @@ module Html2rss
|
|
|
25
25
|
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
|
26
26
|
class Href
|
|
27
27
|
# The available options for the href (attribute) extractor.
|
|
28
|
-
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
|
|
28
|
+
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
|
|
29
29
|
|
|
30
30
|
##
|
|
31
31
|
# Initializes the Href extractor.
|
|
@@ -24,7 +24,7 @@ module Html2rss
|
|
|
24
24
|
# {PostProcessors::SanitizeHtml}.
|
|
25
25
|
class Html
|
|
26
26
|
# The available options for the html extractor.
|
|
27
|
-
Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
|
|
27
|
+
Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
|
|
28
28
|
|
|
29
29
|
##
|
|
30
30
|
# Initializes the Html extractor.
|
|
@@ -17,7 +17,7 @@ module Html2rss
|
|
|
17
17
|
# 'Foobar'
|
|
18
18
|
class Static
|
|
19
19
|
# The available option for the static extractor.
|
|
20
|
-
Options = Struct.new('StaticOptions', :static, keyword_init: true)
|
|
20
|
+
Options = Struct.new('StaticOptions', :static, keyword_init: true)
|
|
21
21
|
|
|
22
22
|
##
|
|
23
23
|
# Initializes the Static extractor.
|
|
@@ -22,7 +22,7 @@ module Html2rss
|
|
|
22
22
|
# 'Lorem ipsum dolor ...'
|
|
23
23
|
class Text
|
|
24
24
|
# The available options for the text extractor.
|
|
25
|
-
Options = Struct.new('TextOptions', :selector, keyword_init: true)
|
|
25
|
+
Options = Struct.new('TextOptions', :selector, keyword_init: true)
|
|
26
26
|
|
|
27
27
|
##
|
|
28
28
|
# Initializes the Text extractor.
|
|
@@ -128,8 +128,15 @@ module Html2rss
|
|
|
128
128
|
##
|
|
129
129
|
# @return [String, nil]
|
|
130
130
|
def get
|
|
131
|
-
|
|
131
|
+
# Temporarily replace newlines with a placeholder to preserve them during space collapsing
|
|
132
|
+
temp_value = value.to_s.gsub("\n", ' __NEWLINE_PLACEHOLDER__ ')
|
|
133
|
+
sanitized_html = Sanitize.fragment(temp_value, self.class.sanitize_config(channel_url)).to_s
|
|
132
134
|
sanitized_html.gsub!(/\s+/, ' ')
|
|
135
|
+
|
|
136
|
+
# Restore newlines and clean up surrounding whitespace
|
|
137
|
+
sanitized_html.gsub!(/[ \t\r]*__NEWLINE_PLACEHOLDER__[ \t\r]*/, "\n")
|
|
138
|
+
sanitized_html.gsub!(/\n{3,}/, "\n\n")
|
|
139
|
+
|
|
133
140
|
sanitized_html.strip!
|
|
134
141
|
sanitized_html.empty? ? nil : sanitized_html
|
|
135
142
|
end
|
data/lib/html2rss/selectors.rb
CHANGED
|
@@ -18,7 +18,7 @@ module Html2rss
|
|
|
18
18
|
include Enumerable
|
|
19
19
|
|
|
20
20
|
# A context instance passed to item extractors and post-processors.
|
|
21
|
-
Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true)
|
|
21
|
+
Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true)
|
|
22
22
|
|
|
23
23
|
# Default selectors options merged into user configuration.
|
|
24
24
|
DEFAULT_CONFIG = { items: { enhance: true } }.freeze
|
data/lib/html2rss/url.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'addressable/uri'
|
|
4
4
|
require 'cgi'
|
|
5
|
+
require 'nokogiri'
|
|
5
6
|
|
|
6
7
|
module Html2rss
|
|
7
8
|
##
|
|
@@ -55,10 +56,9 @@ module Html2rss
|
|
|
55
56
|
# @return [Url, nil] the sanitized URL, or nil if no valid URL found
|
|
56
57
|
def self.sanitize(raw_url)
|
|
57
58
|
match = raw_url.to_s.match(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
|
|
58
|
-
|
|
59
|
-
return nil if url.empty?
|
|
59
|
+
return unless match
|
|
60
60
|
|
|
61
|
-
new(Addressable::URI.parse(
|
|
61
|
+
new(Addressable::URI.parse(match[0].strip).normalize)
|
|
62
62
|
end
|
|
63
63
|
|
|
64
64
|
##
|
|
@@ -70,10 +70,9 @@ module Html2rss
|
|
|
70
70
|
def self.from_absolute(url_string)
|
|
71
71
|
return url_string if url_string.is_a?(self)
|
|
72
72
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
url
|
|
73
|
+
new(Addressable::URI.parse(url_string.to_s.strip).normalize).tap do |url|
|
|
74
|
+
raise ArgumentError, 'URL must be absolute' unless url.absolute?
|
|
75
|
+
end
|
|
77
76
|
rescue Addressable::URI::InvalidURIError
|
|
78
77
|
raise ArgumentError, 'URL must be absolute'
|
|
79
78
|
end
|
|
@@ -92,14 +91,28 @@ module Html2rss
|
|
|
92
91
|
# Url.for_channel('/relative/path')
|
|
93
92
|
# # => raises ArgumentError: "URL must be absolute"
|
|
94
93
|
def self.for_channel(url_string)
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
stripped = url_string.strip
|
|
94
|
+
stripped = url_string.to_s.strip
|
|
98
95
|
return nil if stripped.empty?
|
|
99
96
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
97
|
+
from_absolute(stripped).tap { validate_channel_url(_1) }
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
##
|
|
101
|
+
# Extracts a base URL from HTML metadata tags.
|
|
102
|
+
#
|
|
103
|
+
# @param html [String] raw HTML content
|
|
104
|
+
# @return [Url, nil] the extracted absolute URL, or nil if none is found
|
|
105
|
+
def self.extract_from_html(html)
|
|
106
|
+
doc = Nokogiri::HTML(html)
|
|
107
|
+
tags = { 'link[rel="canonical"]' => 'href', 'meta[property="og:url"]' => 'content',
|
|
108
|
+
'meta[name="twitter:url"]' => 'content', 'base[href]' => 'href' }
|
|
109
|
+
tags.each do |sel, attr|
|
|
110
|
+
val = doc.at_css(sel)&.[](attr).to_s.strip
|
|
111
|
+
return from_absolute(val) unless val.empty?
|
|
112
|
+
rescue ArgumentError
|
|
113
|
+
next
|
|
114
|
+
end
|
|
115
|
+
nil
|
|
103
116
|
end
|
|
104
117
|
|
|
105
118
|
##
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
|
@@ -52,6 +52,8 @@ module Html2rss
|
|
|
52
52
|
FeedPipeline.new(raw_config).to_json_feed
|
|
53
53
|
end
|
|
54
54
|
|
|
55
|
+
# rubocop:disable Metrics/ParameterLists
|
|
56
|
+
|
|
55
57
|
##
|
|
56
58
|
# Scrapes the provided URL and returns an RSS object.
|
|
57
59
|
#
|
|
@@ -60,9 +62,15 @@ module Html2rss
|
|
|
60
62
|
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
61
63
|
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
62
64
|
# @param max_requests [Integer, nil] optional request budget override
|
|
65
|
+
# @param local_file_path [String, nil] optional local HTML file path
|
|
63
66
|
# @return [RSS::Rss] generated RSS feed
|
|
64
|
-
def self.auto_source(url,
|
|
65
|
-
|
|
67
|
+
def self.auto_source(url,
|
|
68
|
+
strategy: :auto,
|
|
69
|
+
items_selector: nil,
|
|
70
|
+
max_redirects: nil,
|
|
71
|
+
max_requests: nil,
|
|
72
|
+
local_file_path: nil)
|
|
73
|
+
feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path:))
|
|
66
74
|
end
|
|
67
75
|
|
|
68
76
|
##
|
|
@@ -73,11 +81,20 @@ module Html2rss
|
|
|
73
81
|
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
74
82
|
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
75
83
|
# @param max_requests [Integer, nil] optional request budget override
|
|
84
|
+
# @param local_file_path [String, nil] optional local HTML file path
|
|
76
85
|
# @return [Hash] JSONFeed-compliant hash
|
|
77
|
-
def self.auto_json_feed(url,
|
|
78
|
-
|
|
86
|
+
def self.auto_json_feed(url,
|
|
87
|
+
strategy: :auto,
|
|
88
|
+
items_selector: nil,
|
|
89
|
+
max_redirects: nil,
|
|
90
|
+
max_requests: nil,
|
|
91
|
+
local_file_path: nil)
|
|
92
|
+
json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:,
|
|
93
|
+
local_file_path:))
|
|
79
94
|
end
|
|
80
95
|
|
|
96
|
+
# rubocop:enable Metrics/ParameterLists
|
|
97
|
+
|
|
81
98
|
# rubocop:disable ThreadSafety/ClassInstanceVariable
|
|
82
99
|
class << self
|
|
83
100
|
##
|
|
@@ -125,12 +142,17 @@ module Html2rss
|
|
|
125
142
|
class << self
|
|
126
143
|
private
|
|
127
144
|
|
|
128
|
-
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
|
|
129
|
-
Config.auto_source_config(
|
|
145
|
+
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path: nil) # rubocop:disable Metrics/ParameterLists
|
|
146
|
+
config = Config.auto_source_config(
|
|
130
147
|
url:,
|
|
131
148
|
items_selector:,
|
|
132
149
|
request_controls: shortcut_request_controls(strategy:, max_redirects:, max_requests:)
|
|
133
150
|
)
|
|
151
|
+
if local_file_path
|
|
152
|
+
config[:request] ||= {}
|
|
153
|
+
config[:request][:local_file_path] = local_file_path
|
|
154
|
+
end
|
|
155
|
+
config
|
|
134
156
|
end
|
|
135
157
|
|
|
136
158
|
def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
|
|
@@ -179,6 +179,12 @@
|
|
|
179
179
|
"type": "null"
|
|
180
180
|
},
|
|
181
181
|
"exclusiveMinimum": 0
|
|
182
|
+
},
|
|
183
|
+
"fallback_anchorless": {
|
|
184
|
+
"type": "boolean",
|
|
185
|
+
"not": {
|
|
186
|
+
"type": "null"
|
|
187
|
+
}
|
|
182
188
|
}
|
|
183
189
|
},
|
|
184
190
|
"required": []
|
|
@@ -227,7 +233,8 @@
|
|
|
227
233
|
"html": {
|
|
228
234
|
"enabled": true,
|
|
229
235
|
"minimum_selector_frequency": 2,
|
|
230
|
-
"use_top_selectors": 5
|
|
236
|
+
"use_top_selectors": 5,
|
|
237
|
+
"fallback_anchorless": true
|
|
231
238
|
}
|
|
232
239
|
},
|
|
233
240
|
"cleanup": {
|
|
@@ -535,6 +542,10 @@
|
|
|
535
542
|
}
|
|
536
543
|
},
|
|
537
544
|
"required": []
|
|
545
|
+
},
|
|
546
|
+
"local_file_path": {
|
|
547
|
+
"type": "string",
|
|
548
|
+
"minLength": 1
|
|
538
549
|
}
|
|
539
550
|
},
|
|
540
551
|
"required": []
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.21.0
|
|
4
|
+
version: 0.22.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -278,6 +278,7 @@ files:
|
|
|
278
278
|
- lib/html2rss/auto_source/cleanup.rb
|
|
279
279
|
- lib/html2rss/auto_source/scraper.rb
|
|
280
280
|
- lib/html2rss/auto_source/scraper/html.rb
|
|
281
|
+
- lib/html2rss/auto_source/scraper/html/class_clustering.rb
|
|
281
282
|
- lib/html2rss/auto_source/scraper/json_state.rb
|
|
282
283
|
- lib/html2rss/auto_source/scraper/link_heuristics.rb
|
|
283
284
|
- lib/html2rss/auto_source/scraper/microdata.rb
|
|
@@ -310,10 +311,13 @@ files:
|
|
|
310
311
|
- lib/html2rss/html_extractor.rb
|
|
311
312
|
- lib/html2rss/html_extractor/date_extractor.rb
|
|
312
313
|
- lib/html2rss/html_extractor/enclosure_extractor.rb
|
|
314
|
+
- lib/html2rss/html_extractor/heading_extractor.rb
|
|
315
|
+
- lib/html2rss/html_extractor/id_generator.rb
|
|
313
316
|
- lib/html2rss/html_extractor/image_extractor.rb
|
|
314
317
|
- lib/html2rss/html_extractor/list_candidates.rb
|
|
315
318
|
- lib/html2rss/html_extractor/semantic_anchor_candidates.rb
|
|
316
319
|
- lib/html2rss/html_extractor/semantic_containers.rb
|
|
320
|
+
- lib/html2rss/html_extractor/text_extractor.rb
|
|
317
321
|
- lib/html2rss/html_navigator.rb
|
|
318
322
|
- lib/html2rss/json_feed_builder.rb
|
|
319
323
|
- lib/html2rss/json_feed_builder/item.rb
|
|
@@ -332,6 +336,7 @@ files:
|
|
|
332
336
|
- lib/html2rss/request_service/budget.rb
|
|
333
337
|
- lib/html2rss/request_service/context.rb
|
|
334
338
|
- lib/html2rss/request_service/faraday_strategy.rb
|
|
339
|
+
- lib/html2rss/request_service/local_file_strategy.rb
|
|
335
340
|
- lib/html2rss/request_service/policy.rb
|
|
336
341
|
- lib/html2rss/request_service/puppet_commander.rb
|
|
337
342
|
- lib/html2rss/request_service/response.rb
|
|
@@ -376,7 +381,7 @@ licenses:
|
|
|
376
381
|
- MIT
|
|
377
382
|
metadata:
|
|
378
383
|
allowed_push_host: https://rubygems.org
|
|
379
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
|
|
384
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
|
|
380
385
|
rubygems_mfa_required: 'true'
|
|
381
386
|
rdoc_options: []
|
|
382
387
|
require_paths:
|