html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'mime/types'
4
+
5
module Html2rss
  class RssBuilder
    ##
    # Represents an enclosure for an RSS item.
    class Enclosure
      ##
      # Guesses the content type based on the file extension of the URL's path.
      #
      # @param url [Html2rss::Url, nil] object responding to `#path`; nil yields `default`
      # @param default [String] content type returned whenever no guess is possible
      # @return [String] guessed content type, or `default`
      def self.guess_content_type_from_url(url, default: 'application/octet-stream')
        return default unless url

        # An empty path splits into an empty array (`first` => nil), hence the `to_s` guards.
        path = url.path.to_s.split('?').first.to_s
        extension = File.extname(path).delete('.')
        return default if extension.empty?

        MIME::Types.type_for(extension).first&.to_s || default
      end

      ##
      # Copies the enclosure's url, type and length onto the maker's enclosure.
      #
      # @param enclosure [Html2rss::RssBuilder::Enclosure, nil] nothing happens when nil
      # @param maker [Object] object exposing `#enclosure` (an RSS item maker)
      # @return [Object, nil] the maker's enclosure object, or nil when no enclosure was given
      def self.add(enclosure, maker)
        return unless enclosure

        maker.enclosure.tap do |enclosure_maker|
          enclosure_maker.url = enclosure.url.to_s
          enclosure_maker.type = enclosure.type
          enclosure_maker.length = enclosure.bits_length
        end
      end

      ##
      # @param url [Html2rss::Url] must be present and answer true to `#absolute?`
      # @param type [String, nil] explicit content type; guessed from the URL when nil
      # @param bits_length [Integer] value written to the enclosure's `length`
      # @raise [ArgumentError] when the URL is missing or not absolute
      def initialize(url:, type: nil, bits_length: 0)
        raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?

        @url = url
        @type = type
        @bits_length = bits_length
      end

      # @return [String] the explicit content type, or a guess derived from the URL
      def type = @type || self.class.guess_content_type_from_url(url)

      attr_reader :bits_length, :url
    end
  end
end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Html2rss
4
- module RssBuilder
4
+ class RssBuilder
5
5
  ##
6
6
  # Represents a stylesheet.
7
7
  class Stylesheet
@@ -10,7 +10,7 @@ module Html2rss
10
10
  # Adds the stylesheet XML tags to the RSS.
11
11
  #
12
12
  # @param maker [RSS::Maker::RSS20] RSS maker object.
13
- # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
13
+ # @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
14
14
  # @return [nil]
15
15
  def add(maker, stylesheets)
16
16
  stylesheets.each do |stylesheet|
@@ -24,7 +24,7 @@ module Html2rss
24
24
  # Adds a single Stylesheet to the RSS.
25
25
  #
26
26
  # @param maker [RSS::Maker::RSS20] RSS maker object.
27
- # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
27
+ # @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
28
28
  # @return [nil]
29
29
  def add_stylesheet(maker, stylesheet)
30
30
  maker.xml_stylesheets.new_xml_stylesheet do |xss|
@@ -35,7 +35,7 @@ module Html2rss
35
35
  end
36
36
  end
37
37
 
38
- TYPES = ['text/css', 'text/xsl'].freeze
38
+ TYPES = ['text/css', 'text/xsl'].to_set.freeze
39
39
 
40
40
  def initialize(href:, type:, media: 'all')
41
41
  raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
@@ -4,93 +4,94 @@ require 'rss'
4
4
 
5
5
  module Html2rss
6
6
  ##
7
- # Builds the RSS 2.0 feed, which consists of the '<channel>' and the '<item>'s
8
- # tags in the RSS.
9
- module RssBuilder
10
- # Possible tags inside a RSS 2.0 <channel> tag.
11
- CHANNEL_TAGS = %i[language author title description link ttl].freeze
12
- # Possible tags inside a RSS 2.0 <item> tag.
13
- ITEM_TAGS = %i[title link description author comments updated].freeze
7
+ # Builds an RSS Feed by providing channel, articles and stylesheets.
8
+ class RssBuilder
9
+ class << self
10
# Fills +item_maker+ from +article+: string values, categories,
# the enclosure and finally the guid, in that order.
#
# @param article [Html2rss::RssBuilder::Article]
# @param item_maker [Object] RSS item maker
def add_item(article, item_maker)
  add_item_string_values(article, item_maker)
  add_item_categories(article, item_maker)

  attachment = article.enclosure
  Enclosure.add(attachment, item_maker)

  add_item_guid(article, item_maker)
end
14
16
 
15
- ##
16
- # Builds an RSS 2.0 feed based on the provided configuration.
17
- #
18
- # @param config [Html2rss::Config] Configuration object containing feed details.
19
- # @return [RSS::Rss] RSS feed object.
20
- def self.build(config)
21
- RSS::Maker.make('2.0') do |maker|
22
- add_stylesheets(maker, config.stylesheets)
23
- add_channel(maker, config)
24
- add_items(maker, config)
17
+ private
18
+
19
# Copies title, description and author onto the item maker, skipping
# values that are nil/false or empty, then sets link and pubDate.
#
# @param article [Html2rss::RssBuilder::Article]
# @param item_maker [Object] RSS item maker
def add_item_string_values(article, item_maker)
  %i[title description author].each do |field|
    text = article.send(field)
    item_maker.send(:"#{field}=", text) if text && !text.empty?
  end

  item_maker.link = article.url.to_s if article.url
  # pubDate is assigned unconditionally; it becomes nil without a publish date.
  item_maker.pubDate = article.published_at&.rfc2822
end
26
- end
27
30
 
28
- ##
29
- # Adds stylesheets to the RSS maker.
30
- #
31
- # @param maker [RSS::Maker] RSS maker instance.
32
- # @param stylesheets [Array<String>] Array of stylesheets to add.
33
- def self.add_stylesheets(maker, stylesheets)
34
- Stylesheet.add(maker, stylesheets)
35
- end
31
# Creates one category entry on the item maker per article category.
#
# @param article [Html2rss::RssBuilder::Article]
# @param item_maker [Object] RSS item maker
def add_item_categories(article, item_maker)
  article.categories.each do |label|
    item_maker.categories.new_category.content = label
  end
end
36
34
 
37
- ##
38
- # Adds channel information to the RSS maker.
39
- #
40
- # @param maker [RSS::Maker] RSS maker instance.
41
- # @param config [Html2rss::Config] Configuration object containing feed details.
42
- def self.add_channel(maker, config)
43
- channel = maker.channel
44
- CHANNEL_TAGS.each do |tag|
45
- Channel.add(channel, config, [tag])
35
# Writes the article's guid onto the item maker and marks it as
# not being a permalink.
#
# @param article [Html2rss::RssBuilder::Article]
# @param item_maker [Object] RSS item maker
# @return [Object] the item maker's guid object
def add_item_guid(article, item_maker)
  guid_maker = item_maker.guid
  guid_maker.content = article.guid
  guid_maker.isPermaLink = false
  guid_maker
end
47
41
  end
48
42
 
49
43
  ##
50
- # Adds items to the RSS maker based on configuration.
51
- #
52
- # @param maker [RSS::Maker] RSS maker instance.
53
- # @param config [Html2rss::Config] Configuration object containing feed details.
54
- def self.add_items(maker, config)
55
- item_attributes = extract_item_attributes(config)
56
- items = fetch_items(config)
57
- items.reverse! if config.items_order == :reverse
58
-
59
- items.each do |item|
60
- add_item(maker, item, item_attributes)
44
+ # @param channel [Html2rss::RssBuilder::Channel] The channel information for the RSS feed.
45
+ # @param articles [Array<Html2rss::RssBuilder::Article>] The list of articles to include in the RSS feed.
46
+ # @param stylesheets [Array<Hash>] An optional array of stylesheet configurations.
47
def initialize(channel:, articles:, stylesheets: [])
  # Plain storage; the stylesheet hashes are turned into objects lazily elsewhere.
  @channel, @articles, @stylesheets = channel, articles, stylesheets
end
52
+
53
# Builds the RSS 2.0 feed: stylesheets first, then the channel,
# then one item per article.
#
# @return [Object] whatever `RSS::Maker.make` returns for the populated maker
def call
  RSS::Maker.make('2.0') do |feed_maker|
    Stylesheet.add(feed_maker, stylesheets)

    channel_maker = feed_maker.channel
    make_channel(channel_maker)
    make_items(feed_maker)
  end
end
63
61
 
64
- ##
65
- # Adds a single item to the RSS maker.
66
- #
67
- # @param maker [RSS::Maker] RSS maker instance.
68
- # @param item [Html2rss::Item] Item to add.
69
- # @param item_attributes [Array<Symbol>] Array of item attributes.
70
- # @return [nil]
71
- def self.add_item(maker, item, item_attributes)
72
- new_item = maker.items.new_item
73
- Item.add(new_item, item, item_attributes)
62
+ private
63
+
64
+ attr_reader :channel, :articles
65
+
66
# Turns each configured stylesheet hash into a Stylesheet object.
#
# @return [Array<Stylesheet>]
def stylesheets
  @stylesheets.map do |attributes|
    Stylesheet.new(**attributes)
  end
end
75
69
 
76
- ##
77
- # Extracts item attributes from configuration.
78
- #
79
- # @param config [Html2rss::Config] Configuration object containing feed details.
80
- # @return [Array<Symbol>] Array of item attributes.
81
- def self.extract_item_attributes(config)
82
- config.item_selector_names & ITEM_TAGS
70
# Copies the channel attributes onto the given channel maker and sets
# link, generator and updated.
#
# @param maker [Object] the channel maker (receives `maker.channel` from #call)
def make_channel(maker)
  copied_attributes = %i[language title description ttl]
  copied_attributes.each do |name|
    maker.public_send(:"#{name}=", channel.public_send(name))
  end

  maker.link = channel.url.to_s
  maker.generator = generator
  maker.updated = channel.last_build_date
end
84
79
 
85
- ##
86
- # Fetches items from the URL specified in configuration.
87
- #
88
- # @param config [Html2rss::Config] Configuration object containing feed details.
89
- # @return [Array<Html2rss::Item>] Array of items.
90
- def self.fetch_items(config)
91
- Html2rss::Item.from_url(config.url, config)
80
# Adds one new item to the maker per article and lets the class-level
# `add_item` populate it.
#
# @param maker [Object] RSS maker exposing `items.new_item`
def make_items(maker)
  articles.each do |article|
    maker.items.new_item do |item_maker|
      self.class.add_item(article, item_maker)
    end
  end
end
93
85
 
94
- private_class_method :extract_item_attributes, :fetch_items, :add_item
86
# Builds the generator string: the html2rss version followed by how
# many articles each scraper contributed.
#
# @return [String]
def generator
  namespace_pattern = /(?<namespace>Html2rss|Scraper)::/

  tallies = articles.flat_map(&:scraper).tally
  summary = tallies.map do |scraper_class, amount|
    # Drop the `Html2rss::` / `Scraper::` namespace segments from the class name.
    short_name = scraper_class.to_s.gsub(namespace_pattern, '')
    "#{short_name} (#{amount})"
  end.join(', ')

  "html2rss V. #{Html2rss::VERSION} (scrapers: #{summary})"
end
95
96
  end
96
97
  end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry-validation'
4
+
5
module Html2rss
  class Selectors
    ##
    # Validates the configuration hash for :selectors.
    #
    # The user-defined selector names are dynamic keys, so the whole hash is
    # nested under NESTING_KEY and each entry is validated manually in a rule.
    class Config < Dry::Validation::Contract
      NESTING_KEY = :dynamic_keys_workaround

      ##
      # Validates the configuration of the :items selector
      class Items < Dry::Validation::Contract
        params do
          required(:selector).filled(:string)
          optional(:order).filled(included_in?: %w[reverse])
          optional(:enhance).filled(:bool?)
          optional(:pagination).hash do
            required(:max_pages).filled(:integer, gt?: 0)
          end
        end
      end

      ##
      # Validates the configuration of a single selector.
      class Selector < Dry::Validation::Contract
        params do
          optional(:selector)
          optional(:extractor).filled(:string)
          optional(:attribute).filled(:string)
          optional(:static).filled(:string)
          optional(:post_process).array(:hash)
        end

        rule(:selector) do
          key(:selector).failure('`selector` must be a string') if value && !value.is_a?(String)
        end

        rule(:extractor) do
          # dependent on the extractor, validate required fields, (i.e. static, attribute)
          case value
          when 'attribute'
            key(:attribute).failure('`attribute` must be a string') unless values[:attribute].is_a?(String)
          when 'static'
            key(:static).failure('`static` must be a string') unless values[:static].is_a?(String)
          end
        end

        rule(:post_process).each do
          # Each post processor is identified by its `name`; the remaining keys depend on it.
          case (name = value[:name])
          when 'gsub'
            key(:pattern).failure('`pattern` must be a string') unless value[:pattern].is_a?(String)
            key(:replacement).failure('`replacement` must be a string') unless value[:replacement].is_a?(String)
          when 'substring'
            key(:start).failure('`start` must be an integer') unless value[:start].is_a?(Integer)
            key(:end).failure('`end` must be an integer or omitted') if !value[:end].nil? && !value[:end].is_a?(Integer)
          when 'template'
            key(:string).failure('`string` must be a string') unless value[:string].is_a?(String)
          when 'html_to_markdown', 'markdown_to_html', 'parse_time', 'parse_uri', 'sanitize_html'
            # nothing to validate
          when nil
            key(:post_process).failure('Missing post_processor `name`')
          else
            key(:post_process).failure("Unknown post_processor `name`: #{name}")
          end
        end
      end

      ##
      # Validates the configuration of the :enclosure Selector
      class Enclosure < Selector
        params do
          # \A and \z anchor the whole string; ^ and $ would accept any
          # multi-line value that merely contains one well-formed line.
          # NOTE(review): `+` and `.` (e.g. `image/svg+xml`) are still rejected - confirm that is intended.
          optional(:content_type).filled(:string, format?: %r{\A[\w-]+/[\w-]+\z})
        end
      end

      params do
        required(NESTING_KEY).hash
      end

      rule(NESTING_KEY) do
        value.each_pair do |selector_key, selector|
          case selector_key.to_sym
          when Selectors::ITEMS_SELECTOR_KEY
            Items.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
          when :enclosure
            Enclosure.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
          when :guid, :categories
            # These two hold a list of names of other selectors instead of a selector config.
            unless selector.is_a?(Array)
              key(selector_key).failure("`#{selector_key}` must be an array")
              next
            end

            key(selector_key).failure("`#{selector_key}` must contain at least one element") if selector.empty?

            selector.each do |name|
              next if values[NESTING_KEY].key?(name.to_sym)

              key(selector_key).failure("`#{selector_key}` references unspecified `#{name}`")
            end
          else
            # From here on, the selector is found under its "dynamic" selector_key
            Selector.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
          end
        end
      end

      ##
      # Shortcut to validate the config.
      # @param config [Hash] the configuration hash to validate
      # @return [Dry::Validation::Result] the result of the validation
      def self.call(config)
        # dry-validation/schema does not support "Dynamic Keys" yet: https://github.com/dry-rb/dry-schema/issues/37
        # But :selectors contains mostly "dynamic" keys, as the user defines them to extract article attributes.
        # --> Validate the dynamic keys manually.
        # To be able to specify a `rule`, nest the config under NESTING_KEY and mark that as `required`.
        new.call(NESTING_KEY => config)
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    module Extractors
      ##
      # Extracts the value of a named attribute from the selected element.
      #
      # Given this +time+ tag carrying a +datetime+ attribute:
      #
      #    <time datetime="2019-07-01">...</time>
      #
      # and this YAML config:
      #
      #    selectors:
      #      link:
      #        selector: time
      #        extractor: attribute
      #        attribute: datetime
      #
      # the extractor returns:
      #    '2019-07-01'
      #
      # When extracting a date or a time, consider parsing it during
      # post processing with {PostProcessors::ParseTime}.
      class Attribute
        # The available options for the attribute extractor.
        Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

        ##
        # @param xml [Nokogiri::XML::Element] node to search in
        # @param options [Options] selector and attribute name
        def initialize(xml, options)
          @options = options
          selector = options.selector
          @element = Extractors.element(xml, selector)
        end

        ##
        # @return [String] the attribute's value converted with `to_s`
        def get
          attribute_name = @options.attribute
          @element.attr(attribute_name).to_s
        end
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    module Extractors
      ##
      # Returns the value of the +href+ attribute.
      # It always returns absolute URLs. If the extracted +href+ value is a
      # relative URL, it prepends the channel's URL.
      #
      # Imagine this +a+ HTML element with a +href+ attribute:
      #
      #     <a href="/posts/latest-findings">...</a>
      #
      # YAML usage example:
      #    channel:
      #      url: http://blog-without-a-feed.example.com
      #      ...
      #    selectors:
      #      link:
      #        selector: a
      #        extractor: href
      #
      # Would return:
      #    'http://blog-without-a-feed.example.com/posts/latest-findings'
      class Href
        # The available options for the href (attribute) extractor.
        Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

        ##
        # Initializes the Href extractor.
        #
        # @param xml [Nokogiri::XML::Element]
        # @param options [Options]
        def initialize(xml, options)
          @options = options
          @element = Extractors.element(xml, options.selector)
          # `to_s` makes this a String in every case; a missing href becomes ''.
          @href = @element.attr('href').to_s
        end

        ##
        # Resolves the extracted href against the channel's URL.
        #
        # @return [Object, nil] the result of `Url.from_relative` (presumably an
        #   Html2rss::Url - confirm), or nil when no href was found
        def get
          # @href is never nil (see #initialize), so test for emptiness instead.
          return nil if @href.empty?

          Url.from_relative(@href, @options.channel[:url])
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    module Extractors
      ##
      # Extracts the selected element serialized as HTML.
      #
      # Given this HTML:
      #
      #     <p>Lorem <b>ipsum</b> dolor ...</p>
      #
      # and this YAML config:
      #
      #    selectors:
      #      description:
      #        selector: p
      #        extractor: html
      #
      # the extractor returns:
      #    '<p>Lorem <b>ipsum</b> dolor ...</p>'
      #
      # Always sanitize the HTML during post-processing with
      # {PostProcessors::SanitizeHtml}.
      class Html
        # The available options for the html extractor.
        Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

        ##
        # @param xml [Nokogiri::XML::Element] node to search in
        # @param options [Options] carries the optional selector
        def initialize(xml, options)
          selector = options.selector
          @element = Extractors.element(xml, selector)
        end

        ##
        # @return [String] the element converted with `to_s`
        def get = @element.to_s
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    module Extractors
      ##
      # Hands back the fixed value configured under +static+.
      #
      # YAML config:
      #
      #    selectors:
      #      author:
      #        extractor: static
      #        static: Foobar
      #
      # Result:
      #    'Foobar'
      class Static
        # The available option for the static extractor.
        Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

        ##
        # @param _xml [nil, Nokogiri::XML::Element] ignored; accepted so all extractors share one signature
        # @param options [Options] carries the static value
        def initialize(_xml, options)
          @options = options
        end

        ##
        # @return [String, Symbol] the static value, exactly as given in the options
        def get = @options.static
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    module Extractors
      ##
      # Extracts the text content of the selected element. This is the
      # extractor used when none is given explicitly.
      #
      # Given this HTML:
      #
      #     <p>Lorem <b>ipsum</b> dolor ...</p>
      #
      # and this YAML config:
      #
      #    selectors:
      #      description:
      #        selector: p
      #        extractor: text
      #
      # the extractor returns:
      #    'Lorem ipsum dolor ...'
      class Text
        # The available options for the text extractor.
        Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

        ##
        # @param xml [Nokogiri::XML::Element] node to search in
        # @param options [Options] carries the optional selector
        def initialize(xml, options)
          selector = options.selector
          @element = Extractors.element(xml, selector)
        end

        ##
        # @return [String] the element's text, stripped, with every run of
        #   whitespace collapsed into a single space
        def get
          trimmed = @element.text.to_s.strip
          trimmed.gsub(/\s+/, ' ')
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class Selectors
    ##
    # Provides a namespace for item extractors.
    module Extractors
      ##
      # Maps the extractor name to the class implementing the extractor.
      #
      # The key is the name to use in the feed config.
      NAME_TO_CLASS = {
        attribute: Attribute,
        href: Href,
        html: Html,
        static: Static,
        text: Text
      }.freeze

      ##
      # Maps the extractor class to its corresponding options class.
      # Entries are looked up via `const_get(:Options)` on first access and cached.
      ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
        hash[klass] = klass.const_get(:Options)
      end

      DEFAULT_EXTRACTOR = :text

      class << self
        ##
        # Retrieves an element from Nokogiri XML based on the selector.
        #
        # @param xml [Nokogiri::XML::Document]
        # @param selector [String, nil]
        # @return [Object] the result of `xml.css(selector)`, or `xml` itself when no selector is given
        def element(xml, selector)
          selector ? xml.css(selector) : xml
        end

        ##
        # Runs the configured extractor against the XML and returns what it extracted.
        #
        # @param attribute_options [Hash<Symbol, Object>]
        #   Should contain at least `:extractor` (the name) and required options for that extractor.
        #   Without `:extractor`, DEFAULT_EXTRACTOR is used.
        # @param xml [Nokogiri::XML::Document]
        # @return [Object] the value returned by the extractor's `#get`
        # @raise [ArgumentError] when the extractor name is not a key of NAME_TO_CLASS
        def get(attribute_options, xml)
          name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR
          extractor_class = NAME_TO_CLASS.fetch(name) do
            # Without this, the nil lookup surfaces later as NoMethodError on `nil.const_get`.
            raise ArgumentError, "Unknown extractor `#{name}`, must be one of: #{NAME_TO_CLASS.keys.join(', ')}"
          end

          options_class = ITEM_OPTION_CLASSES[extractor_class]
          # Only the keys the Options struct knows are passed on; unknown keys would raise.
          options = options_class.new(attribute_options.slice(*options_class.members))

          extractor_class.new(xml, options).get
        end
      end
    end
  end
end