html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ ##
7
+ # Returns a defined part of a String.
8
+ #
9
+ # Both parameters must be an Integer and they can be negative.
10
+ # The +end+ parameter can be omitted, in that case it will not cut the
11
+ # String at the end.
12
+ #
13
+ # A Regexp or a MatchString is not supported.
14
+ #
15
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
16
+ # documentation for more information.
17
+ #
18
+ # Imagine this HTML:
19
+ # <h1>Foo bar and baz</h1>
20
+ #
21
+ # YAML usage example:
22
+ # selectors:
23
+ # title:
24
+ # selector: h1
25
+ # post_process:
26
+ # name: substring
27
+ # start: 4
28
+ # end: 6
29
+ #
30
+ # Would return:
31
+ # 'bar'
32
+ class Substring < Base
33
##
# Validates the value and the `start`/`end` options of the post processor.
#
# Every `assert_type` call is parenthesized on purpose: a trailing `context:`
# shorthand on a parenthesis-less call makes Ruby take the expression on the
# following line as the keyword's value, so the wrong object would be passed
# as context.
#
# @param value [String] the value to cut
# @param context [#[]] post processor context; `context[:options]` holds `:start` and the optional `:end`
# @return [void]
def self.validate_args!(value, context)
  assert_type(value, String, :value, context:)

  options = context[:options]
  assert_type(options[:start], Integer, :start, context:)

  # `end` is optional and only type-checked when present.
  end_index = options[:end]
  assert_type(end_index, Integer, :end, context:) if end_index
end
42
+
43
##
# Cuts the configured range out of the value.
#
# @return [String, nil] the selected part; `String#slice` yields nil when the range starts beyond the string
def get
  value.slice(range)
end
50
+
51
##
# Builds the range used to cut the value.
#
# Without an `end` option the range is endless, i.e. it reaches to the end of the String.
#
# @return [Range] inclusive range from `start` to `end`, or an endless range from `start`
# @raise [ArgumentError] if `start` and `end` are equal
def range
  first = start_index
  return (first..) unless end_index?

  last = end_index
  raise ArgumentError, 'The `start` value must be unequal to the `end` value.' if first == last

  (first..last)
end
65
+
66
+ private
67
+
68
# @return [Boolean] whether an `end` option is configured (nil and blank strings count as absent)
def end_index?
  context[:options][:end].to_s != ''
end

# @return [Integer] the `end` option coerced with `to_i`
def end_index
  context[:options][:end].to_i
end

# @return [Integer] the `start` option coerced with `to_i`
def start_index
  context[:options][:start].to_i
end
71
+ end
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
6
+ ##
7
+ # Returns a formatted String according to the string pattern.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
9
+ #
10
+ # It supports the format pattern `%<key>s` and `%{key}`, where `key` is the key of the selector.
11
+ # If `%{self}` is used, the selector's extracted value will be used.
12
+ #
13
+ # Imagine this HTML:
14
+ #
15
+ # <li>
16
+ # <h1>Product</h1>
17
+ # <span class="price">23,42€</span>
18
+ # </li>
19
+ #
20
+ #
21
+ # YAML usage example:
22
+ #
23
+ # selectors:
24
+ # items:
25
+ # selector: 'li'
26
+ # price:
27
+ # selector: '.price'
28
+ # title:
29
+ # selector: h1
30
+ # post_process:
31
+ # name: template
32
+ # string: '%{self} (%{price})'
33
+ #
34
+ # Would return:
35
+ # 'Product (23,42€)'
36
+ class Template < Base
37
##
# Validates the value and ensures a `string` template is configured.
#
# The `assert_type` call is parenthesized on purpose: a trailing `context:`
# shorthand on a parenthesis-less call makes Ruby take the expression on the
# following line as the keyword's value, so the template string would be
# passed as context.
#
# @param value [String] the extracted value
# @param context [#[]] post processor context; `context[:options][:string]` holds the template
# @return [void]
# @raise [InvalidType] if the `string` option is missing or empty
def self.validate_args!(value, context)
  assert_type(value, String, :value, context:)

  string = context[:options]&.dig(:string).to_s
  raise InvalidType, 'The `string` template is absent.' if string.empty?
end
43
+
44
##
# Stores the parts of the context the template needs.
#
# @param value [String] the value extracted by the selector
# @param context [Selectors::Context] supplies `:options`, `:scraper` and `:item`
def initialize(value, context)
  super

  @scraper = context[:scraper]
  @item = context[:item]
  @options = context[:options] || {}
  @string = @options[:string].to_s
end
55
+
56
##
# Fills the template string, resolving each placeholder through {#item_value}.
# Placeholders that cannot be resolved are replaced with an empty string.
#
# @return [String]
def get
  resolver = method(:item_value)

  Html2rss::Config::DynamicParams.call(@string, {}, getter: resolver, replace_missing_with: '')
end
61
+
62
+ private
63
+
64
# Resolves a single template placeholder.
#
# @param key [String, Symbol] placeholder name; `self` refers to this selector's own value
# @return [Object] the own value for `self`, otherwise whatever the scraper selects for the key
def item_value(key)
  name = key.to_sym
  return value if name == :self

  @scraper.select(name, @item)
end
70
+ end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ ##
6
+ # Provides a namespace for attribute post processors.
7
+ module PostProcessors
8
+ ##
9
+ # Error raised when an unknown post processor name is requested.
10
+ class UnknownPostProcessorName < Html2rss::Error; end
11
+
12
+ ##
13
+ # Error raised when a required option is missing.
14
+ class MissingOption < Html2rss::Error; end
15
+
16
+ ##
17
+ # Error raised when an invalid type is provided.
18
+ class InvalidType < Html2rss::Error; end
19
+
20
+ ##
21
+ # Maps the post processor name to the class implementing the post processor.
22
+ #
23
+ # The key is the name to use in the feed config.
24
+ NAME_TO_CLASS = {
25
+ gsub: Gsub,
26
+ html_to_markdown: HtmlToMarkdown,
27
+ markdown_to_html: MarkdownToHtml,
28
+ parse_time: ParseTime,
29
+ parse_uri: ParseUri,
30
+ sanitize_html: SanitizeHtml,
31
+ substring: Substring,
32
+ template: Template
33
+ }.freeze
34
+
35
##
# Shorthand method to instantiate the post processor and call `#get` on it.
#
# @param name [String, Symbol] post processor name as used in the feed config
# @param value [Object] the value to process
# @param context [Object] context handed to the post processor
# @return [Object] the result of the post processor's `#get`
# @raise [UnknownPostProcessorName] if the name is missing, not symbolizable or not registered
def self.get(name, value, context)
  # A step without a usable name must fail with the documented error, not a NoMethodError from `to_sym`.
  key = name.to_sym if name.respond_to?(:to_sym)
  klass = NAME_TO_CLASS[key] || raise(UnknownPostProcessorName, "Unknown name '#{name}'")

  klass.new(value, context).get
end
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,294 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+
5
+ module Html2rss
6
+ ##
7
+ # This scraper is designed to scrape articles from a given HTML page using CSS
8
+ # selectors defined in the feed config.
9
+ #
10
+ # It supports the traditional feed configs that html2rss originally provided,
11
+ # ensuring compatibility with existing setups.
12
+ #
13
+ # Additionally, it uniquely offers the capability to convert JSON into XML,
14
+ # extending its versatility for diverse data processing workflows.
15
+ class Selectors # rubocop:disable Metrics/ClassLength
16
+ class InvalidSelectorName < Html2rss::Error; end
17
+
18
+ include Enumerable
19
+
20
+ # A context instance passed to item extractors and post-processors.
21
+ Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
22
+
23
+ DEFAULT_CONFIG = { items: { enhance: true } }.freeze
24
+
25
+ ITEMS_SELECTOR_KEY = :items
26
+ ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
27
+ SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze
28
+
29
+ # Mapping of new attribute names to their legacy names for backward compatibility.
30
+ RENAMED_ATTRIBUTES = { published_at: %i[updated pubDate] }.freeze
31
+
32
##
# Sets up a scraper for a single response.
#
# @param response [RequestService::Response] The response to scrape; its URL becomes the default base URL.
# @param selectors [Hash] Selector configuration keyed by attribute name.
# @param time_zone [String] Time zone string passed on to extractors and post processors.
def initialize(response, selectors:, time_zone:)
  @selectors = selectors
  @time_zone = time_zone
  @response = response
  @url = response.url

  # Selector names are validated and normalized before the item attributes are derived from them.
  prepare_selectors!
  @rss_item_attributes = @selectors.keys & Html2rss::RssBuilder::Article::PROVIDED_KEYS
end
47
+
48
##
# Returns the scraped articles (memoized).
# The list is reversed when the items selector sets `order: 'reverse'`.
#
# @return [Array<Html2rss::RssBuilder::Article>]
def articles
  return @articles if @articles

  reversed = @selectors.dig(ITEMS_SELECTOR_KEY, :order) == 'reverse'
  scraped = to_a

  @articles = reversed ? scraped.reverse! : scraped
end
56
+
57
##
# Yields one article per element matched by the items selector.
#
# @yield [article] Gives each article as an Html2rss::RssBuilder::Article.
# @return [Enumerator] An enumerator if no block is given.
def each(&)
  return enum_for(:each) unless block_given?

  # Looked up once; the setting does not change between items.
  should_enhance = enhance?

  parsed_body.css(items_selector).each do |node|
    attributes = extract_article(node, response)
    enhance_article_hash(attributes, node, response.url) if should_enhance

    yield Html2rss::RssBuilder::Article.new(**attributes, scraper: self.class)
  end
end
75
+
76
##
# Returns the CSS selector for the items.
# @return [String, nil] the configured `selector` of the items entry
def items_selector
  @selectors.dig(ITEMS_SELECTOR_KEY, :selector)
end

##
# @return [Boolean] whether the items entry enables `enhance`, i.e. article hashes get enhanced while iterating.
def enhance?
  @selectors.dig(ITEMS_SELECTOR_KEY, :enhance) ? true : false
end
83
+
84
##
# Builds the attribute hash of one article by selecting every configured item attribute.
# Attributes whose selection yields nil are left out.
#
# @param item [Nokogiri::XML::Element] The element to extract from.
# @param page_response [#url] Response providing the base URL; defaults to the scraper's response.
# @return [Hash] Hash of attributes for the article.
def extract_article(item, page_response = response)
  pairs = @rss_item_attributes.map do |attribute|
    [attribute, select(attribute, item, base_url: page_response.url)]
  end

  pairs.to_h.reject { |_attribute, selected| selected.nil? }
end
92
+
93
##
# Fills gaps in the article hash with values found by HtmlExtractor.
# Existing truthy values are kept; nil results are ignored. The given hash is modified in place.
#
# @param article_hash [Hash] The original article hash.
# @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
# @param base_url [Object] Base URL handed to HtmlExtractor; defaults to the scraper's URL.
# @return [Hash] The enhanced article hash (same object as `article_hash`).
def enhance_article_hash(article_hash, article_tag, base_url = @url)
  anchor = HtmlExtractor.main_anchor_for(article_tag)
  found = anchor && HtmlExtractor.new(article_tag, base_url:, selected_anchor: anchor).call
  return article_hash unless found

  found.each do |key, value|
    next if value.nil?

    # Only fills keys that are missing or currently falsy.
    article_hash[key] ||= value
  end

  article_hash
end
113
+
114
##
# Selects the value of one attribute from an item element.
# Special attributes (see SPECIAL_ATTRIBUTES) and regular attributes take different paths.
#
# @param name [Symbol, String] Name of the attribute.
# @param item [Nokogiri::XML::Element] The HTML element to process.
# @param base_url [Object] Base URL passed on to the selection; defaults to the scraper's URL.
# @return [Object, Array<Object>] The selected value(s).
# @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
def select(name, item, base_url: @url)
  attribute = name.to_sym

  if attribute == ITEMS_SELECTOR_KEY
    raise InvalidSelectorName, "Attribute selector '#{attribute}' is reserved for items."
  end

  selector_key, config = selector_config_for(attribute)
  return select_special(selector_key, item:, config:, base_url:) if SPECIAL_ATTRIBUTES.member?(selector_key)

  select_regular(selector_key, item:, config:, base_url:)
end
134
+
135
+ private
136
+
137
+ attr_reader :response
138
+
139
# Validates and normalizes the configured selectors.
#
# The hash given to the constructor is never modified: every step that changes
# selectors works on a copy first, so frozen or shared configuration hashes are safe.
#
# @raise [InvalidSelectorName] if both `url` and `link` are configured
def prepare_selectors!
  validate_url_and_link_exclusivity!
  fix_url_and_link!
  handle_renamed_attributes!
end

# Rejects configurations that define both `url` and `link`.
#
# @raise [InvalidSelectorName] if both keys are present
def validate_url_and_link_exclusivity!
  return unless @selectors.key?(:url) && @selectors.key?(:link)

  raise InvalidSelectorName,
        'You must either use "url" or "link" in your selectors. Using both is not supported.'
end

# Makes the `link` selector available under `url` when no `url` selector is set.
def fix_url_and_link!
  return if @selectors[:url] || !@selectors.key?(:link)

  @selectors = @selectors.dup
  @selectors[:url] = @selectors[:link]
end

# Moves selectors configured under a legacy name (see RENAMED_ATTRIBUTES) to the
# current name and logs a deprecation warning. A selector already configured
# under the current name takes precedence.
def handle_renamed_attributes!
  RENAMED_ATTRIBUTES.each_pair do |new_name, old_names|
    old_names.each do |old_name|
      next unless @selectors.key?(old_name)

      Html2rss::Log.warn("Selector '#{old_name}' is deprecated. Please rename to '#{new_name}'.")

      # Copy before mutating, consistent with #fix_url_and_link!; the input may be frozen or shared.
      @selectors = @selectors.dup
      @selectors[new_name] ||= @selectors.delete(old_name)
    end
  end
end
168
+
169
# @return [Object] the parsed body of the scraper's own response
def parsed_body = parsed_body_for(response)

# Returns the parsed body for a response, memoized per response URL.
# JSON responses are converted to XML markup and parsed as an HTML5 fragment;
# all other responses use their parsed body as is.
#
# @param page_response [#url, #json_response?, #parsed_body]
# @return [Object] the (possibly cached) parsed body
def parsed_body_for(page_response)
  @parsed_bodies ||= {}
  cache_key = page_response.url

  cached = @parsed_bodies[cache_key]
  return cached if cached

  return @parsed_bodies[cache_key] = page_response.parsed_body unless page_response.json_response?

  markup = ObjectToXmlConverter.new(page_response.parsed_body).call
  @parsed_bodies[cache_key] = Nokogiri::HTML5.fragment(markup)
end
182
+
183
# Selects the value of a special attribute.
#
# @param name [Symbol] `:enclosure`, `:guid` or `:categories`
# @param item [Object] the item element
# @param config [Object] the selector configuration; for `:guid` and `:categories` the referenced selector name(s)
# @param base_url [Object] base URL passed on to the selection
# @return [Hash, Array, nil] enclosure hash, selected guid values, categories, or nil for any other name
def select_special(name, item:, config:, base_url:)
  return enclosure(item:, config:, base_url:) if name == :enclosure
  return select_categories(category_selectors: config, item:, base_url:) if name == :categories
  return unless name == :guid

  # A guid is made of the values of the referenced selectors.
  Array(config).map { |selector_name| select(selector_name, item, base_url:) }
end
193
+
194
# Extracts a regular attribute and runs its configured post processors.
#
# @param _name [Symbol] attribute name (unused)
# @param item [Object] the item element
# @param config [Hash] selector configuration, optionally with `:post_process`
# @param base_url [Object] base URL exposed to extractors and post processors
# @return [Object, nil] the extracted (and post-processed) value
def select_regular(_name, item:, config:, base_url:)
  extractor_options = config.merge(channel: channel_context(base_url))
  extracted = Extractors.get(extractor_options, item)

  apply_post_process_steps(item:, value: extracted, post_process_steps: config[:post_process], base_url:)
end
204
+
205
# Pipes a value through the given post processor steps, in order.
#
# @param item [Object] the item element the value was extracted from
# @param value [Object] the extracted value
# @param post_process_steps [Array<Hash>] step options; each step's `:name` picks the post processor
# @param base_url [Object] base URL exposed to the post processors as channel URL
# @return [Object] the result of the last step, or the unchanged value without steps
def post_process(item, value, post_process_steps, base_url:)
  post_process_steps.reduce(value) do |current, options|
    # Each step gets its own context (and its own config hash).
    config = { channel: { url: base_url, time_zone: @time_zone } }
    context = Context.new(config:, item:, scraper: self, options:)

    PostProcessors.get(options[:name], current, context)
  end
end
215
+
216
# Collects the category values of all referenced selectors.
#
# @param category_selectors [Array<Symbol, String>, Symbol, String, nil] names of the selectors providing categories
# @return [Array] all category values, in selector order
def select_categories(category_selectors:, item:, base_url:)
  Array(category_selectors).each_with_object([]) do |selector_name, categories|
    categories.concat(extract_category_values(selector_name, item:, base_url:))
  end
end

# Extracts the category values of one selector.
# Selectors that are not defined contribute nothing. When the selector matches
# several nodes, every node yields its own categories; otherwise the selector
# is evaluated like a regular attribute.
#
# @return [Array]
def extract_category_values(selector_name, item:, base_url:)
  selector_key, config = selector_config_for(selector_name, allow_nil: true)
  return [] unless config

  nodes = extract_nodes(item:, config:)

  if node_set_with_multiple_elements?(nodes)
    Array(nodes).flat_map { |node| extract_categories_from_node(node, item:, config:, base_url:) }
  else
    Array(select_regular(selector_key, item:, config:, base_url:))
  end
end

# Extracts, post-processes and cleans the categories of a single matched node.
#
# @return [Array<String>] non-blank category texts
def extract_categories_from_node(node, item:, config:, base_url:)
  extracted = Extractors.get(category_node_options(config, base_url:), node)
  processed = apply_post_process_steps(item:, value: extracted,
                                       post_process_steps: config[:post_process], base_url:)

  Array(processed).filter_map { |category| extract_category_text(category) }
end
240
+
241
# Turns a category value into stripped text.
# Nokogiri nodes and node sets are reduced to their visible text; any other value is converted with `to_s`.
#
# @param category [Object, nil]
# @return [String, nil] the stripped text, or nil when nothing but whitespace remains
def extract_category_text(category)
  markup = category.is_a?(Nokogiri::XML::Node) || category.is_a?(Nokogiri::XML::NodeSet)
  raw = markup ? HtmlExtractor.extract_visible_text(category) : category&.to_s
  cleaned = raw&.strip

  cleaned if cleaned && !cleaned.empty?
end
252
+
253
# @param nodes [Object]
# @return [Boolean] true only for a Nokogiri node set holding more than one node
def node_set_with_multiple_elements?(nodes)
  return false unless nodes.is_a?(Nokogiri::XML::NodeSet)

  nodes.length > 1
end

# Extractor options for a node that has already been selected:
# the channel context is added and the selector is cleared.
#
# @return [Hash]
def category_node_options(selector_config, base_url:)
  channel = channel_context(base_url)

  selector_config.merge(channel:, selector: nil)
end
260
+
261
# Runs the configured post processor step(s) on a value, if there is anything to do.
#
# @param value [Object, nil] returned as is when falsy
# @param post_process_steps [Hash, Array<Hash>, nil] a single step or a list of steps
# @return [Object, nil] the processed value
def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
  return value if !value || !post_process_steps

  steps = post_process_steps
  steps = [steps] unless steps.is_a?(Array)

  post_process(item, value, steps, base_url:)
end
267
+
268
# Looks up the configuration of a selector.
#
# @param name [Symbol, String] selector name
# @param allow_nil [Boolean] return a nil config instead of raising when the selector is not defined
# @return [Array(Symbol, Object)] the symbolized name and its configuration
# @raise [InvalidSelectorName] if the selector is not defined and `allow_nil` is false
def selector_config_for(name, allow_nil: false)
  selector_key = name.to_sym
  known = @selectors.key?(selector_key)

  raise InvalidSelectorName, "Selector for '#{selector_key}' is not defined." unless known || allow_nil

  [selector_key, known ? @selectors[selector_key] : nil]
end
276
+
277
# Returns the elements matched by the config's `:selector` within the item.
#
# @return [Object, nil] nil when the config cannot be indexed or has no selector
def extract_nodes(item:, config:)
  selector = config[:selector] if config.respond_to?(:[])

  Extractors.element(item, selector) if selector
end

# @return [Hash] channel data (`:url`, `:time_zone`) handed to the extractors
def channel_context(base_url) = { url: base_url, time_zone: @time_zone }
286
+
287
# Builds the enclosure details for an item.
# The selected value is resolved against the base URL; the type is taken from the config's `:content_type`.
#
# @return [Hash] enclosure details (`:url`, `:type`).
def enclosure(item:, config:, base_url:)
  selected = select_regular(:enclosure, item:, config:, base_url:)

  { url: Url.from_relative(selected, base_url), type: config[:content_type] }
end
293
+ end
294
+ end