html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'nokogiri'

module Html2rss
  ##
  # This scraper is designed to scrape articles from a given HTML page using CSS
  # selectors defined in the feed config.
  #
  # It supports the traditional feed configs that html2rss originally provided,
  # ensuring compatibility with existing setups.
  #
  # Additionally, it uniquely offers the capability to convert JSON into XML,
  # extending its versatility for diverse data processing workflows.
  class Selectors # rubocop:disable Metrics/ClassLength
    class InvalidSelectorName < Html2rss::Error; end

    include Enumerable

    # A context instance passed to item extractors and post-processors.
    Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

    # Default selectors options merged into user configuration.
    DEFAULT_CONFIG = { items: { enhance: true } }.freeze

    # Selector key that points to the root list of article nodes.
    ITEMS_SELECTOR_KEY = :items
    # Supported RSS item attributes extractable through selectors.
    ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
    # Item attributes that require dedicated extraction logic.
    SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze

    # Mapping of new attribute names to their legacy names for backward compatibility.
    RENAMED_ATTRIBUTES = { published_at: %i[updated pubDate] }.freeze

    ##
    # Initializes a new Selectors instance.
    #
    # @param response [RequestService::Response] The response object.
    # @param selectors [Hash] A hash of CSS selectors.
    # @param time_zone [String] Time zone string used for date parsing.
    def initialize(response, selectors:, time_zone:)
      @response = response
      @url = response.url
      @selectors = selectors
      @time_zone = time_zone

      prepare_selectors!
      @rss_item_attributes = @selectors.keys & Html2rss::RssBuilder::Article::PROVIDED_KEYS
    end

    ##
    # Returns articles extracted from the response.
    # Reverses order if config specifies reverse ordering.
    #
    # @return [Array<Html2rss::RssBuilder::Article>]
    def articles
      @articles ||= @selectors.dig(ITEMS_SELECTOR_KEY, :order) == 'reverse' ? to_a.tap(&:reverse!) : to_a
    end

    ##
    # Iterates over each scraped article.
    #
    # @yield [article] Gives each article as an Html2rss::RssBuilder::Article.
    # @return [Enumerator] An enumerator if no block is given.
    def each(&)
      return enum_for(:each) unless block_given?

      enhance = enhance?

      parsed_body.css(items_selector).each do |item|
        article_hash = extract_article(item, response)

        enhance_article_hash(article_hash, item, response.url) if enhance

        yield Html2rss::RssBuilder::Article.new(**article_hash, scraper: self.class)
      end
    end

    ##
    # Returns the CSS selector for the items.
    # @return [String] the CSS selector for the items
    def items_selector = @selectors.dig(ITEMS_SELECTOR_KEY, :selector)

    ## @return [Boolean] whether to enhance the article hash with auto_source's semantic HTML extraction.
    def enhance? = !!@selectors.dig(ITEMS_SELECTOR_KEY, :enhance)

    ##
    # Extracts an article hash for a given item element.
    #
    # @param item [Nokogiri::XML::Element] The element to extract from.
    # @param page_response [RequestService::Response] response used for selector extraction context
    # @return [Hash] Hash of attributes for the article.
    def extract_article(item, page_response = response)
      @rss_item_attributes.to_h { |key| [key, select(key, item, base_url: page_response.url)] }.compact
    end

    ##
    # Enhances the article hash using semantic HTML extraction.
    # Only adds keys that are missing from the original hash.
    #
    # @param article_hash [Hash] The original article hash.
    # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
    # @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
    # @return [Hash] The enhanced article hash.
    def enhance_article_hash(article_hash, article_tag, base_url = @url)
      selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
      return article_hash unless selected_anchor

      extracted = HtmlExtractor.new(article_tag, base_url:, selected_anchor:).call
      return article_hash unless extracted

      # Only fill keys that are absent or falsy in the selector-extracted hash.
      extracted.each_with_object(article_hash) do |(key, value), hash|
        next if value.nil? || (hash.key?(key) && hash[key])

        hash[key] = value
      end
    end

    ##
    # Selects the value for a given attribute from an HTML element.
    #
    # @param name [Symbol, String] Name of the attribute.
    # @param item [Nokogiri::XML::Element] The HTML element to process.
    # @param base_url [String, Html2rss::Url] base URL for relative extraction values
    # @return [Object, Array<Object>] The selected value(s).
    # @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
    def select(name, item, base_url: @url)
      name = name.to_sym

      raise InvalidSelectorName, "Attribute selector '#{name}' is reserved for items." if name == ITEMS_SELECTOR_KEY

      selector_key, config = selector_config_for(name)

      if SPECIAL_ATTRIBUTES.member?(selector_key)
        select_special(selector_key, item:, config:, base_url:)
      else
        select_regular(selector_key, item:, config:, base_url:)
      end
    end

    private

    attr_reader :response

    # Normalizes the user-supplied selectors hash before extraction begins.
    def prepare_selectors!
      validate_url_and_link_exclusivity!
      fix_url_and_link!
      handle_renamed_attributes!
    end

    # Raises when both :url and :link selectors are present (they are aliases).
    def validate_url_and_link_exclusivity!
      return unless @selectors.key?(:url) && @selectors.key?(:link)

      # BUGFIX: message previously read 'use "url" or "link" your selectors' (missing "in").
      raise InvalidSelectorName, 'You must either use "url" or "link" in your selectors. Using both is not supported.'
    end

    # Copies the legacy :link selector to :url when only :link is configured.
    def fix_url_and_link!
      return if @selectors[:url] || !@selectors.key?(:link)

      @selectors = @selectors.dup
      @selectors[:url] = @selectors[:link]
    end

    # Migrates deprecated selector names (see RENAMED_ATTRIBUTES) with a warning.
    def handle_renamed_attributes!
      RENAMED_ATTRIBUTES.each_pair do |new_name, old_names|
        old_names.each do |old_name|
          next unless @selectors.key?(old_name)

          Html2rss::Log.warn("Selector '#{old_name}' is deprecated. Please rename to '#{new_name}'.")
          @selectors[new_name] ||= @selectors.delete(old_name)
        end
      end
    end

    # Parsed body of the primary response.
    def parsed_body
      parsed_body_for(response)
    end

    # Memoizes one parsed body per response URL; JSON responses are converted
    # to an XML fragment first so CSS selectors can be applied uniformly.
    def parsed_body_for(page_response)
      @parsed_bodies ||= {}
      @parsed_bodies[page_response.url] ||= if page_response.json_response?
                                              fragment = ObjectToXmlConverter.new(page_response.parsed_body).call
                                              Nokogiri::HTML5.fragment(fragment)
                                            else
                                              page_response.parsed_body
                                            end
    end

    # Dispatches extraction for attributes in SPECIAL_ATTRIBUTES.
    def select_special(name, item:, config:, base_url:)
      case name
      when :enclosure
        enclosure(item:, config:, base_url:)
      when :guid
        Array(config).map { |selector_name| select(selector_name, item, base_url:) }
      when :categories
        select_categories(category_selectors: config, item:, base_url:)
      end
    end

    # Extracts a plain attribute value and applies configured post-processing.
    def select_regular(_name, item:, config:, base_url:)
      value = Extractors.get(config.merge(channel: channel_context(base_url)), item)

      if value && (post_process_steps = config[:post_process])
        steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
        value = post_process(item, value, steps, base_url:)
      end

      value
    end

    # Runs each post-processing step in order, threading the value through.
    def post_process(item, value, post_process_steps, base_url:)
      post_process_steps.each do |options|
        context = Context.new(config: { channel: { url: base_url, time_zone: @time_zone } },
                              item:, scraper: self, options:)

        value = PostProcessors.get(options[:name], value, context)
      end

      value
    end

    # Collects category values across all configured category selectors.
    def select_categories(category_selectors:, item:, base_url:)
      Array(category_selectors).flat_map do |selector_name|
        extract_category_values(selector_name, item:, base_url:)
      end
    end

    # Extracts values for a single category selector; multi-node matches are
    # handled per-node, otherwise the regular extraction path is used.
    def extract_category_values(selector_name, item:, base_url:)
      selector_key, config = selector_config_for(selector_name, allow_nil: true)
      return [] unless config

      nodes = extract_nodes(item:, config:)
      unless node_set_with_multiple_elements?(nodes)
        return Array(select_regular(selector_key, item:, config:, base_url:))
      end

      Array(nodes).flat_map { |node| extract_categories_from_node(node, item:, config:, base_url:) }
    end

    # Extracts, post-processes, and normalizes category texts from one node.
    def extract_categories_from_node(node, item:, config:, base_url:)
      values = Extractors.get(category_node_options(config, base_url:), node)
      values = apply_post_process_steps(item:, value: values, post_process_steps: config[:post_process], base_url:)

      Array(values).filter_map { |category| extract_category_text(category) }
    end

    # Converts a category value (node or object) to stripped text; nil if blank.
    def extract_category_text(category)
      text = case category
             when Nokogiri::XML::Node, Nokogiri::XML::NodeSet
               HtmlExtractor.extract_visible_text(category)
             else
               category&.to_s
             end

      stripped = text&.strip
      stripped unless stripped.nil? || stripped.empty?
    end

    # @return [Boolean] true when nodes is a NodeSet containing more than one element.
    def node_set_with_multiple_elements?(nodes)
      nodes.is_a?(Nokogiri::XML::NodeSet) && nodes.length > 1
    end

    # Options for extracting from an already-selected node (selector cleared).
    def category_node_options(selector_config, base_url:)
      selector_config.merge(channel: channel_context(base_url), selector: nil)
    end

    # Applies post-processing when both a value and steps are present.
    def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
      return value unless value && post_process_steps

      steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
      post_process(item, value, steps, base_url:)
    end

    # Resolves a selector name to its [key, config] pair.
    # @raise [InvalidSelectorName] when undefined and allow_nil is false.
    def selector_config_for(name, allow_nil: false)
      selector_key = name.to_sym

      return [selector_key, @selectors[selector_key]] if @selectors.key?(selector_key)
      return [selector_key, nil] if allow_nil

      raise InvalidSelectorName, "Selector for '#{selector_key}' is not defined."
    end

    # Selects the node(s) matched by the config's CSS selector, if any.
    def extract_nodes(item:, config:)
      return unless config.respond_to?(:[]) && config[:selector]

      Extractors.element(item, config[:selector])
    end

    # Channel context hash passed into extractors.
    def channel_context(base_url)
      { url: base_url, time_zone: @time_zone }
    end

    # @return [Hash] enclosure details.
    def enclosure(item:, config:, base_url:)
      url = Url.from_relative(select_regular(:enclosure, item:, config:, base_url:), base_url)

      { url:, type: config[:content_type] }
    end
  end
end
data/lib/html2rss/url.rb
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
# frozen_string_literal: true

require 'addressable/uri'
require 'cgi'

module Html2rss
  ##
  # A value object representing a resolved, absolute URL with built-in operations.
  # Provides URL resolution, sanitization, and titleization capabilities.
  #
  # Instances are immutable: the wrapped Addressable::URI and the wrapper itself
  # are frozen in the constructor.
  #
  # @example Creating a URL from a relative path
  #   url = Url.from_relative('/path/to/article', 'https://example.com')
  #   url.to_s # => "https://example.com/path/to/article"
  #
  # @example Sanitizing a raw URL string
  #   url = Url.sanitize('https://example.com/ ')
  #   url.to_s # => "https://example.com/"
  #
  # @example Getting titleized versions
  #   url = Url.from_relative('/foo-bar/baz.txt', 'https://example.com')
  #   url.titleized # => "Foo Bar Baz"
  #   url.channel_titleized # => "example.com: Foo Bar Baz"
  class Url
    include Comparable

    # Regular expression for basic URI format validation
    URI_REGEXP = Addressable::URI::URIREGEX
    # Schemes accepted by channel URL validation.
    SUPPORTED_SCHEMES = %w[http https].to_set.freeze

    ##
    # Creates a URL from a relative path and base URL.
    # If the given URL is already absolute, the base URL is ignored.
    #
    # @param relative_url [String, Html2rss::Url] the relative URL to resolve
    # @param base_url [String, Html2rss::Url] the base URL to resolve against
    # @return [Url] the resolved absolute URL
    # @raise [ArgumentError] if the URL cannot be parsed
    def self.from_relative(relative_url, base_url)
      url = Addressable::URI.parse(relative_url.to_s.strip)
      return new(url) if url.absolute?

      base_uri = Addressable::URI.parse(base_url.to_s)
      # A base without a path cannot be joined against; default to root.
      base_uri.path = '/' if base_uri.path.empty?

      new(base_uri.join(url).normalize)
    end

    ##
    # Creates a URL by sanitizing a raw URL string.
    # Removes spaces and extracts the first valid URL from the string.
    # Accepts http(s)/ftp URLs as well as mailto: links.
    #
    # @param raw_url [String] the raw URL string to sanitize
    # @return [Url, nil] the sanitized URL, or nil if no valid URL found
    def self.sanitize(raw_url)
      matched_urls = raw_url.to_s.scan(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
      url = matched_urls.first.to_s.strip
      return nil if url.empty?

      new(Addressable::URI.parse(url).normalize)
    end

    ##
    # Creates a URL from an already-absolute URL string.
    #
    # @param url_string [String, Html2rss::Url] the absolute URL to parse
    # @return [Url] the parsed and normalized URL
    # @raise [ArgumentError] if the URL is not absolute or cannot be parsed
    def self.from_absolute(url_string)
      # Already-wrapped instances pass through unchanged.
      return url_string if url_string.is_a?(self)

      url = new(Addressable::URI.parse(url_string.to_s.strip).normalize)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?

      url
    rescue Addressable::URI::InvalidURIError
      # Unparseable input is reported the same way as a relative URL.
      raise ArgumentError, 'URL must be absolute'
    end

    ##
    # Creates a URL for channel use with validation.
    # Validates that the URL meets channel requirements (absolute, no @, supported schemes).
    #
    # @param url_string [String, nil] the URL string to validate and parse
    # @return [Url, nil] the validated and parsed URL, or nil for nil/blank input
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    # @example Creating a channel URL
    #   Url.for_channel('https://example.com')
    #   # => #<Html2rss::Url:... @uri=#<Addressable::URI:... URI:https://example.com>>
    # @example Invalid channel URL
    #   Url.for_channel('/relative/path')
    #   # => raises ArgumentError: "URL must be absolute"
    def self.for_channel(url_string)
      return nil if url_string.nil? || url_string.empty?

      stripped = url_string.strip
      return nil if stripped.empty?

      url = from_absolute(stripped)
      validate_channel_url(url)
      url
    end

    ##
    # Validates that a URL meets channel requirements.
    # Rejects userinfo credentials and any '@' inside query or fragment.
    #
    # @param url [Url] the URL to validate
    # @raise [ArgumentError] if the URL doesn't meet channel requirements
    def self.validate_channel_url(url)
      raise ArgumentError, 'URL must be absolute' unless url.absolute?

      uri = Addressable::URI.parse(url.to_s)
      has_forbidden_at = uri.user || uri.password
      has_forbidden_at ||= [uri.query, uri.fragment].compact.any? { |value| value.include?('@') }
      raise ArgumentError, 'URL must not contain an @ character' if has_forbidden_at

      scheme = url.scheme
      raise ArgumentError, "URL scheme '#{scheme}' is not supported" unless SUPPORTED_SCHEMES.include?(scheme)
    end

    private_class_method :validate_channel_url

    ##
    # @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
    def initialize(uri)
      # Freeze both the wrapped URI and this wrapper: instances are value objects.
      @uri = uri.freeze
      freeze
    end

    # @return [String] normalized URL string
    def to_s = @uri.to_s

    # @return [String, nil] URI scheme, for example `http` or `https`
    def scheme = @uri.scheme

    # @return [String, nil] URI host component
    def host = @uri.host

    # @return [Integer, nil] URI port component
    def port = @uri.port

    # @return [String, nil] URI path component
    def path = @uri.path

    # @return [String, nil] URI query string without leading `?`
    def query = @uri.query

    # @return [String, nil] URI fragment without leading `#`
    def fragment = @uri.fragment

    # @return [Boolean] whether the URL includes scheme and host
    def absolute? = @uri.absolute?

    ##
    # Returns the URL query string as a hash of string keys and values.
    #
    # @return [Hash{String => String}] normalized query parameters
    def query_values = @uri.query_values(Hash) || {}

    ##
    # Returns the URL path split into non-empty segments.
    #
    # @return [Array<String>] normalized path segments
    def path_segments = @uri.path.to_s.split('/').reject(&:empty?)

    ##
    # Returns a copy of the URL with the provided path.
    #
    # @param path [String] normalized absolute path
    # @return [Url] a new URL with the updated path
    def with_path(path)
      uri = @uri.dup
      uri.path = path
      self.class.from_absolute(uri.normalize.to_s)
    end

    ##
    # Returns a copy of the URL with the provided query values.
    #
    # @param values [Hash{String, Symbol => #to_s}] query parameters to assign
    # @return [Url] a new URL with the updated query string
    def with_query_values(values)
      uri = @uri.dup
      uri.query_values = values.transform_keys(&:to_s).transform_values(&:to_s)
      self.class.from_absolute(uri.normalize.to_s)
    end

    ##
    # Returns a titleized representation of the URL path.
    # Converts the path to a human-readable title by cleaning and capitalizing words.
    # Removes file extensions and special characters, then capitalizes each word.
    #
    # @return [String] the titleized path, or empty string if path is empty
    # @example Basic titleization
    #   url = Url.from_absolute('https://example.com/foo-bar/baz.txt')
    #   url.titleized # => "Foo Bar Baz"
    # @example With URL encoding
    #   url = Url.from_absolute('https://example.com/hello%20world/article.html')
    #   url.titleized # => "Hello World Article"
    def titleized
      path = @uri.path
      return '' if path.empty?

      # Decode percent-escapes, then split each segment into words,
      # replacing any non-alphanumeric character (except dots) with spaces.
      nicer_path = CGI.unescapeURIComponent(path)
                      .split('/')
                      .flat_map do |part|
                        part.gsub(/[^a-zA-Z0-9.]/, ' ').gsub(/\s+/, ' ').split
                      end

      nicer_path.map!(&:capitalize)
      # File.basename with '.*' strips a trailing file extension from the joined title.
      File.basename(nicer_path.join(' '), '.*')
    end

    ##
    # Returns a titleized representation of the URL with prefixed host.
    # Creates a channel title by combining host and path information.
    # Useful for RSS channel titles that need to identify the source.
    #
    # @return [String] the titleized channel URL
    # @example With path
    #   url = Url.from_absolute('https://example.com/foo-bar/baz')
    #   url.channel_titleized # => "example.com: Foo Bar Baz"
    # @example Without path (root URL)
    #   url = Url.from_absolute('https://example.com')
    #   url.channel_titleized # => "example.com"
    def channel_titleized
      nicer_path = CGI.unescapeURIComponent(@uri.path).split('/').reject(&:empty?)
      host = @uri.host

      nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
    end

    ##
    # Compares this URL with another URL for equality.
    # URLs are considered equal if their string representations are the same.
    #
    # NOTE(review): compares via `other.to_s`, so any object responding to
    # to_s can be ordered against a Url — presumably intentional; confirm.
    #
    # @param other [Url] the other URL to compare with
    # @return [Integer] -1, 0, or 1 for less than, equal, or greater than
    def <=>(other) = to_s <=> other.to_s

    ##
    # Returns true if this URL is equal to another URL.
    # Unlike `<=>`, equality requires the other object to be a Url.
    #
    # @param other [Object] the other object to compare with
    # @return [Boolean] true if the URLs are equal
    def ==(other) = other.is_a?(Url) && to_s == other.to_s

    ##
    # Supports hash-based comparisons by ensuring equality semantics match `hash`.
    #
    # @param other [Object] the other object to compare with
    # @return [Boolean] true if the URLs are considered equal
    def eql?(other) = other.is_a?(Url) && to_s == other.to_s

    ##
    # Returns the hash code for this URL.
    # Consistent with `eql?`: equal URLs produce equal hash codes.
    #
    # @return [Integer] the hash code
    def hash = to_s.hash

    ##
    # Returns a string representation of the URL for debugging.
    #
    # @return [String] the debug representation
    def inspect = "#<#{self.class}:#{object_id} @uri=#{@uri.inspect}>"
  end
end