html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb DELETED
@@ -1,136 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'set'
4
-
5
- module Html2rss
6
- class AutoSource
7
- module Scraper
8
- class SemanticHtml
9
- ##
10
- # ArticleExtractor is responsible for extracting the details of an article.
11
- # It focuses on finding a headline first, and from it traverse as much as possible,
12
- # to find the DOM upwards to find the other details.
13
- class Extractor
14
- INVISIBLE_CONTENT_TAG_SELECTORS = %w[svg script noscript style template].to_set.freeze
15
- HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
16
- NOT_HEADLINE_SELECTOR = (HEADING_TAGS.map { |selector| ":not(#{selector})" } +
17
- INVISIBLE_CONTENT_TAG_SELECTORS.to_a).freeze
18
-
19
- def self.visible_text_from_tag(tag, separator: ' ')
20
- text = if (children = tag.children).empty?
21
- tag.text.strip
22
- else
23
- children.filter_map do |child|
24
- next if INVISIBLE_CONTENT_TAG_SELECTORS.include?(child.name)
25
-
26
- visible_text_from_tag(child)
27
- end.join(separator)
28
- end
29
-
30
- return if (sanitized_text = text.gsub(/\s+/, ' ').strip).empty?
31
-
32
- sanitized_text
33
- end
34
-
35
- def initialize(article_tag, url:)
36
- @article_tag = article_tag
37
- @url = url
38
- end
39
-
40
- # @return [Hash, nil] The scraped article or nil.
41
- def call
42
- @heading = find_heading || closest_anchor || return
43
-
44
- @extract_url = find_url
45
-
46
- {
47
- title: extract_title,
48
- url: extract_url,
49
- image: extract_image,
50
- description: extract_description,
51
- id: generate_id,
52
- published_at: extract_published_at
53
- }
54
- end
55
-
56
- private
57
-
58
- attr_reader :article_tag, :url, :heading, :extract_url
59
-
60
- ##
61
- # Find the heading of the article.
62
- # @return [Nokogiri::XML::Node, nil]
63
- def find_heading
64
- heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
65
-
66
- return if heading_tags.empty?
67
-
68
- smallest_heading = heading_tags.keys.min
69
- heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
70
- end
71
-
72
- def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
73
-
74
- def closest_anchor
75
- SemanticHtml.find_closest_selector(heading || article_tag,
76
- selector: 'a[href]:not([href=""])')
77
- end
78
-
79
- def find_url
80
- href = closest_anchor&.[]('href')
81
-
82
- return if (parts = href.to_s.split('#')).empty?
83
-
84
- Utils.build_absolute_url_from_relative(parts.first.strip, url)
85
- end
86
-
87
- def extract_title
88
- if heading && (heading.children.empty? || heading.text)
89
- visible_text_from_tag(heading)
90
- else
91
- visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
92
- .max_by { |tag| tag.text.size })
93
-
94
- end
95
- end
96
-
97
- def extract_image
98
- Image.call(article_tag, url:)
99
- end
100
-
101
- def extract_description
102
- text = visible_text_from_tag(article_tag.css(NOT_HEADLINE_SELECTOR), separator: '<br>')
103
- return text if text
104
-
105
- description = visible_text_from_tag(article_tag)
106
- return nil unless description
107
-
108
- description.strip!
109
- description.empty? ? nil : description
110
- end
111
-
112
- def generate_id
113
- [
114
- article_tag['id'],
115
- article_tag.at_css('[id]')&.attr('id'),
116
- extract_url&.path,
117
- extract_url&.query
118
- ].compact.reject(&:empty?).first
119
- end
120
-
121
- # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
122
- def extract_published_at
123
- times = article_tag.css('time[datetime]')
124
- .filter_map do |tag|
125
- DateTime.parse(tag['datetime'])
126
- rescue ArgumentError, TypeError
127
- nil
128
- end
129
-
130
- times.min
131
- end
132
- end
133
- end
134
- end
135
- end
136
- end
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb DELETED
@@ -1,54 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- class AutoSource
5
- module Scraper
6
- class SemanticHtml
7
- ##
8
- # Image is responsible for extracting image URLs the article_tag.
9
- class Image
10
- def self.call(article_tag, url:)
11
- img_src = from_source(article_tag) ||
12
- from_img(article_tag) ||
13
- from_style(article_tag)
14
-
15
- Utils.build_absolute_url_from_relative(img_src, url) if img_src
16
- end
17
-
18
- def self.from_img(article_tag)
19
- article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
20
- end
21
-
22
- ##
23
- # Extracts the largest image source from the srcset attribute
24
- # of an img tag or a source tag inside a picture tag.
25
- #
26
- # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
27
- # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
28
- # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
29
- def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
30
- hash = article_tag.css('img[srcset], picture > source[srcset]')
31
- .flat_map { |source| source['srcset'].to_s.split(',') }
32
- .filter_map do |line|
33
- width, url = line.split.reverse
34
- next if url.nil? || url.start_with?('data:')
35
-
36
- width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
37
-
38
- [width_value, url.strip]
39
- end.to_h
40
-
41
- hash[hash.keys.max]
42
- end
43
-
44
- def self.from_style(article_tag)
45
- article_tag.css('[style*="url"]')
46
- .map { |tag| tag['style'][/url\(['"]?(.*?)['"]?\)/, 1] }
47
- .reject { |src| !src || src.start_with?('data:') }
48
- .max_by(&:size)
49
- end
50
- end
51
- end
52
- end
53
- end
54
- end
data/lib/html2rss/config/channel.rb DELETED
@@ -1,125 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'addressable'
4
-
5
- module Html2rss
6
- class Config
7
- ##
8
- # Holds the configuration for the feed's channel options.
9
- # This contains:
10
- #
11
- # 1. the RSS channel attributes
12
- # 2. html2rss options like json or custom HTTP-headers for the request
13
- class Channel
14
- ##
15
- # @param config [Hash<Symbol, Object>]
16
- # @return [Set<String>] the required parameter names
17
- def self.required_params_for_config(config)
18
- config.each_with_object(Set.new) do |(_, value), required_params|
19
- required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String)
20
- end
21
- end
22
-
23
- ##
24
- # @param channel [Hash<Symbol, Object>]
25
- # @param params [Hash]
26
- def initialize(channel, params: {})
27
- raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
28
-
29
- url = channel[:url]
30
- raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI)
31
-
32
- @config = process_params(channel, params.transform_keys(&:to_sym))
33
- end
34
-
35
- ##
36
- # The HTTP headers to use for the request.
37
- #
38
- # @return [Hash<Symbol, String>]
39
- def headers
40
- config.fetch(:headers, {})
41
- end
42
-
43
- ##
44
- # @return [String]
45
- def author
46
- config.fetch(:author, 'html2rss')
47
- end
48
-
49
- ##
50
- # @return [Integer]
51
- def ttl
52
- config.fetch(:ttl, 360)
53
- end
54
-
55
- ##
56
- # @return [String]
57
- def title
58
- config.fetch(:title) { Utils.titleized_channel_url(url) }
59
- end
60
-
61
- ##
62
- # @return [String] language code
63
- def language
64
- config.fetch(:language, 'en')
65
- end
66
-
67
- ##
68
- # @return [String]
69
- def description
70
- config.fetch(:description) { "Latest items from #{url}." }
71
- end
72
-
73
- ##
74
- # @return [Addressable::URI]
75
- def url
76
- Addressable::URI.parse(config[:url]).normalize
77
- end
78
-
79
- ##
80
- # @return [String] time_zone name
81
- def time_zone
82
- config.fetch(:time_zone, 'UTC')
83
- end
84
-
85
- ##
86
- # @return [true, false]
87
- def json?
88
- config.fetch(:json, false)
89
- end
90
-
91
- ##
92
- # @return [Symbol]
93
- def strategy
94
- config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
95
- end
96
-
97
- private
98
-
99
- # @return [Hash<Symbol, Object>]
100
- attr_reader :config
101
-
102
- ##
103
- # @param config [Hash<Symbol, Object>]
104
- # @param params [Hash<Symbol, String>]
105
- # @return [nil]
106
- def assert_required_params_presence(config, params)
107
- missing_params = self.class.required_params_for_config(config) - params.keys.map(&:to_s)
108
- raise ParamsMissing, missing_params.to_a.join(', ') unless missing_params.empty?
109
- end
110
-
111
- ##
112
- # Sets the variables used in the feed config's channel.
113
- #
114
- # @param config [Hash<Symbol, Object>]
115
- # @param params [Hash<Symbol, Object>]
116
- # @return [Hash<Symbol, Object>]
117
- def process_params(config, params)
118
- assert_required_params_presence(config, params)
119
- config.transform_values do |value|
120
- value.is_a?(String) ? format(value, params) : value
121
- end
122
- end
123
- end
124
- end
125
- end
data/lib/html2rss/config/selectors.rb DELETED
@@ -1,103 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- class Config
5
- ##
6
- # Holds the configurations of the selectors.
7
- class Selectors
8
- ITEMS_SELECTOR_NAME = :items
9
-
10
- # Struct to represent a selector with associated attributes for extraction and processing.
11
- Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
12
- keyword_init: true)
13
-
14
- # raised when an invalid selector name is used
15
- class InvalidSelectorName < Html2rss::Error; end
16
-
17
- ##
18
- # @param config [Hash<Symbol, Object>]
19
- def initialize(config)
20
- validate_config(config)
21
- @config = config
22
- end
23
-
24
- ##
25
- # @param name [Symbol]
26
- # @return [true, false]
27
- def selector?(name)
28
- name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
29
- end
30
-
31
- ##
32
- # @param name [Symbol]
33
- # @return [Selector]
34
- def selector(name)
35
- raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
36
-
37
- keywords = config[name].slice(*available_keys)
38
-
39
- if (additional_keys = keywords.keys - available_keys).any?
40
- Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
41
- end
42
-
43
- Selector.new(keywords)
44
- end
45
-
46
- ##
47
- # @return [Set<Symbol>]
48
- def category_selector_names
49
- selector_keys_for(:categories)
50
- end
51
-
52
- ##
53
- # @return [Set<Symbol>]
54
- def guid_selector_names
55
- selector_keys_for(:guid, default: :title_or_description)
56
- end
57
-
58
- ##
59
- # Returns the CSS/XPath selector.
60
- #
61
- # @param name [Symbol]
62
- # @return [String]
63
- def selector_string(name)
64
- Selector.new(config[name]).selector
65
- end
66
-
67
- ##
68
- # @return [Set<Symbol>]
69
- def item_selector_names
70
- @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
71
- end
72
-
73
- ##
74
- # @return [Symbol, nil]
75
- def items_order
76
- config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
77
- end
78
-
79
- private
80
-
81
- attr_reader :config
82
-
83
- def validate_config(config)
84
- raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
85
- end
86
-
87
- ##
88
- # Returns the selector keys for the selector named `name`. If none, returns [default].
89
- #
90
- # @param name [Symbol]
91
- # @param default [String, Symbol]
92
- # @return [Set<Symbol>]
93
- def selector_keys_for(name, default: nil)
94
- config.fetch(name) { Array(default) }.tap do |array|
95
- array.reject! { |entry| entry.to_s == '' }
96
- array.map!(&:to_sym)
97
- end.to_set
98
- end
99
-
100
- def available_keys = @available_keys ||= Selector.members
101
- end
102
- end
103
- end
data/lib/html2rss/item.rb DELETED
@@ -1,186 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
-
5
- module Html2rss
6
- ##
7
- # Takes the selected Nokogiri::HTML and responds to accessor names
8
- # defined in the feed config.
9
- #
10
- # Instances can only be created via `.from_url` and
11
- # each represents an internally used "RSS item".
12
- # Such an item provides dynamically defined attributes as methods.
13
- class Item
14
- # A context instance is passed to Item Extractors.
15
- Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
16
- # Class to keep an Item's <enclosure>.
17
- Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
18
-
19
- ##
20
- # Fetches items from a given URL using configuration settings.
21
- #
22
- # @param url [Addressable::URI] URL to fetch items from.
23
- # @param config [Html2rss::Config] Configuration object.
24
- # @return [Array<Html2rss::Item>] list of items fetched.
25
- def self.from_url(url, config)
26
- ctx = RequestService::Context.new(url:, headers: config.headers)
27
-
28
- body = RequestService.execute(ctx, strategy: config.strategy).body
29
- body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
30
-
31
- Nokogiri.HTML(body)
32
- .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
33
- .map { |xml| new(xml, config) }
34
- .select(&:valid?)
35
- end
36
-
37
- ##
38
- # @param xml [Nokogiri::XML::Element]
39
- # @param config [Html2rss::Config]
40
- def initialize(xml, config)
41
- @xml = xml
42
- @config = config
43
- end
44
-
45
- private_class_method :new
46
-
47
- ##
48
- # Checks if the object responds to a method dynamically based on the configuration.
49
- #
50
- # @param method_name [Symbol]
51
- # @param _include_private [true, false]
52
- # @return [true, false]
53
- # :reek:BooleanParameter { enabled: false }
54
- def respond_to_missing?(method_name, _include_private = false)
55
- config.selector?(method_name) || super
56
- end
57
-
58
- ##
59
- # Dynamically extracts data based on the method name.
60
- #
61
- # @param method_name [Symbol]
62
- # @param _args [Array]
63
- # @return [String] extracted value for the selector.
64
- def method_missing(method_name, *_args)
65
- return super unless respond_to_missing?(method_name)
66
-
67
- extract(method_name)
68
- end
69
-
70
- ##
71
- # Selects and processes data according to the selector name.
72
- #
73
- # @param tag [Symbol]
74
- # @return [String] the extracted value for the selector.
75
- def extract(tag)
76
- attribute_options = config.selector_attributes_with_channel(tag.to_sym)
77
-
78
- post_process(
79
- ItemExtractors.item_extractor_factory(attribute_options, xml).get,
80
- attribute_options.fetch(:post_process, false)
81
- )
82
- end
83
-
84
- ##
85
- # Checks if the item is valid accordin to RSS 2.0 spec,
86
- # by ensuring it has at least a title or a description.
87
- #
88
- # @return [true, false]
89
- def valid?
90
- title_or_description.to_s != ''
91
- end
92
-
93
- ##
94
- # Returns either the title or the description, preferring title if available.
95
- #
96
- # @return [String, nil]
97
- def title_or_description
98
- return title if config.selector?(:title)
99
-
100
- description if config.selector?(:description)
101
- end
102
-
103
- ##
104
- #
105
- # @return [String] SHA1 hashed GUID.
106
- def guid
107
- content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
108
-
109
- Digest::SHA1.hexdigest(content)
110
- end
111
-
112
- ##
113
- # Retrieves categories for the item based on configured category selectors.
114
- #
115
- # @return [Array<String>] list of categories.
116
- def categories
117
- config.category_selector_names
118
- .filter_map do |method_name|
119
- category = public_send(method_name)
120
- category.strip unless category.to_s.empty?
121
- end.uniq
122
- end
123
-
124
- ##
125
- # Checks if the item has an enclosure based on configuration.
126
- #
127
- # @return [true, false]
128
- def enclosure?
129
- config.selector?(:enclosure)
130
- end
131
-
132
- ##
133
- # Retrieves enclosure details for the item.
134
- #
135
- # @return [Enclosure] enclosure details.
136
- def enclosure
137
- url = enclosure_url
138
-
139
- raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
140
-
141
- type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
142
- Html2rss::Utils.guess_content_type_from_url(url)
143
-
144
- Enclosure.new(
145
- type:,
146
- bits_length: 0,
147
- url: url.to_s
148
- )
149
- end
150
-
151
- private
152
-
153
- # @return [Nokogiri::XML::Element] XML element representing the item.
154
- attr_reader :xml
155
- # @return [Html2rss::Config] Configuration object for the item.
156
- attr_reader :config
157
-
158
- ##
159
- # Processes the extracted value according to post-processing options.
160
- #
161
- # @param value [String] extracted value.
162
- # @param post_process_options [Hash<Symbol, Object>] post-processing options.
163
- # @return [String] processed value.
164
- def post_process(value, post_process_options)
165
- return value unless post_process_options
166
-
167
- [post_process_options].flatten.each do |options|
168
- value = AttributePostProcessors.get_processor(options[:name])
169
- .new(value, Context.new(options:, item: self, config:))
170
- .get
171
- end
172
-
173
- value
174
- end
175
-
176
- ##
177
- # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
178
- #
179
- # @return [Addressable::URI, nil] absolute URL of the enclosure.
180
- def enclosure_url
181
- enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
182
-
183
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
184
- end
185
- end
186
- end
data/lib/html2rss/item_extractors/attribute.rb DELETED
@@ -1,50 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns the value of the attribute.
7
- #
8
- # Imagine this +time+ HTML tag with a +datetime+ attribute:
9
- #
10
- # <time datetime="2019-07-01">...</time>
11
- #
12
- # YAML usage example:
13
- #
14
- # selectors:
15
- # link:
16
- # selector: time
17
- # extractor: attribute
18
- # attribute: datetime
19
- #
20
- # Would return:
21
- # '2019-07-01'
22
- #
23
- # In case you're extracting a date or a time, consider parsing it
24
- # during post processing with {AttributePostProcessors::ParseTime}.
25
- class Attribute
26
- # The available options for the attribute extractor.
27
- Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
28
-
29
- ##
30
- # Initializes the Attribute extractor.
31
- #
32
- # @param xml [Nokogiri::XML::Element]
33
- # @param options [Options]
34
- def initialize(xml, options)
35
- @options = options
36
- @element = ItemExtractors.element(xml, options.selector)
37
- end
38
-
39
- ##
40
- # Retrieves and returns the attribute's value as a string.
41
- #
42
- # @return [String] The value of the attribute.
43
- def get
44
- @element.attr(@options.attribute).to_s.freeze
45
- rescue NoMethodError => error
46
- raise "Failed to extract attribute: #{error.message}"
47
- end
48
- end
49
- end
50
- end
data/lib/html2rss/item_extractors/href.rb DELETED
@@ -1,52 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns the value of the +href+ attribute.
7
- # It always returns absolute URLs. If the extracted +href+ value is a
8
- # relative URL, it prepends the channel's URL.
9
- #
10
- # Imagine this +a+ HTML element with a +href+ attribute:
11
- #
12
- # <a href="/posts/latest-findings">...</a>
13
- #
14
- # YAML usage example:
15
- # channel:
16
- # url: http://blog-without-a-feed.example.com
17
- # ...
18
- # selectors:
19
- # link:
20
- # selector: a
21
- # extractor: href
22
- #
23
- # Would return:
24
- # 'http://blog-without-a-feed.example.com/posts/latest-findings'
25
- class Href
26
- # The available options for the href (attribute) extractor.
27
- Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
28
-
29
- ##
30
- # Initializes the Href extractor.
31
- #
32
- # @param xml [Nokogiri::XML::Element]
33
- # @param options [Options]
34
- def initialize(xml, options)
35
- @options = options
36
- @element = ItemExtractors.element(xml, options.selector)
37
- @href = @element.attr('href').to_s
38
- end
39
-
40
- ##
41
- # Retrieves and returns the normalized absolute URL.
42
- #
43
- # @return [String] The absolute URL.
44
- def get
45
- return nil unless @href
46
-
47
- sanitized_href = Html2rss::Utils.sanitize_url(@href)
48
- Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
49
- end
50
- end
51
- end
52
- end