html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,123 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry-validation'
4
+
5
+ module Html2rss
6
+ class Selectors
##
# Validates the configuration hash for :selectors.
#
# Selector names are chosen by the user ("dynamic" keys), so the hash is
# wrapped under NESTING_KEY and every entry is validated manually by the
# rule defined on that key.
class Config < Dry::Validation::Contract
  # Required wrapper key used to validate dynamic selector names.
  NESTING_KEY = :dynamic_keys_workaround

  ##
  # Validates the configuration of the :items selector.
  class Items < Dry::Validation::Contract
    params do
      required(:selector).filled(:string)
      optional(:order).filled(included_in?: %w[reverse])
      optional(:enhance).filled(:bool?)
      optional(:pagination).hash do
        required(:max_pages).filled(:integer, gt?: 0)
      end
    end
  end

  ##
  # Validates the configuration of a single selector.
  class Selector < Dry::Validation::Contract
    params do
      optional(:selector)
      optional(:extractor).filled(:string)
      optional(:attribute).filled(:string)
      optional(:static).filled(:string)
      optional(:post_process).array(:hash)
    end

    rule(:selector) do
      key(:selector).failure('`selector` must be a string') if value && !value.is_a?(String)
    end

    rule(:extractor) do
      # Depending on the extractor, a companion option is required
      # (`attribute` for 'attribute', `static` for 'static').
      case value
      when 'attribute'
        key(:attribute).failure('`attribute` must be a string') unless values[:attribute].is_a?(String)
      when 'static'
        key(:static).failure('`static` must be a string') unless values[:static].is_a?(String)
      end
    end

    # Checks each post processor entry for the options its `name` requires.
    rule(:post_process).each do
      case (name = value[:name])
      when 'gsub'
        key(:pattern).failure('`pattern` must be a string') unless value[:pattern].is_a?(String)
        key(:replacement).failure('`replacement` must be a string') unless value[:replacement].is_a?(String)
      when 'substring'
        key(:start).failure('`start` must be an integer') unless value[:start].is_a?(Integer)
        key(:end).failure('`end` must be an integer or omitted') if !value[:end].nil? && !value[:end].is_a?(Integer)
      when 'template'
        key(:string).failure('`string` must be a string') unless value[:string].is_a?(String)
      when 'html_to_markdown', 'markdown_to_html', 'parse_time', 'parse_uri', 'sanitize_html'
        # nothing to validate
      when nil
        key(:post_process).failure('Missing post_processor `name`')
      else
        key(:post_process).failure("Unknown post_processor `name`: #{name}")
      end
    end
  end

  ##
  # Validates the configuration of the :enclosure Selector
  class Enclosure < Selector
    # Matches a complete `type/subtype` media type.
    #
    # `\A` and `\z` anchor the whole string; `^` and `$` would only anchor a
    # single line, letting multi-line values slip through. `.` and `+` are
    # allowed so that types such as `image/svg+xml` are accepted.
    CONTENT_TYPE_FORMAT = %r{\A[\w.+-]+/[\w.+-]+\z}

    params do
      optional(:content_type).filled(:string, format?: CONTENT_TYPE_FORMAT)
    end
  end

  params do
    required(NESTING_KEY).hash
  end

  # Dispatches every selector entry to the contract matching its key and
  # copies the resulting errors onto that key.
  rule(NESTING_KEY) do
    value.each_pair do |selector_key, selector|
      case selector_key.to_sym
      when Selectors::ITEMS_SELECTOR_KEY
        Items.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      when :enclosure
        Enclosure.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      when :guid, :categories
        # These entries are lists of names of other configured selectors.
        unless selector.is_a?(Array)
          key(selector_key).failure("`#{selector_key}` must be an array")
          next
        end

        key(selector_key).failure("`#{selector_key}` must contain at least one element") if selector.empty?

        selector.each do |name|
          next if values[NESTING_KEY].key?(name.to_sym)

          key(selector_key).failure("`#{selector_key}` references unspecified `#{name}`")
        end
      else
        # From here on, the selector is found under its "dynamic" selector_key
        Selector.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      end
    end
  end

  ##
  # Shortcut to validate the config.
  # @param config [Hash] the configuration hash to validate
  # @return [Dry::Validation::Result] the result of the validation
  def self.call(config)
    # dry-validation/schema does not support "Dynamic Keys" yet: https://github.com/dry-rb/dry-schema/issues/37
    # But :selectors contains mostly "dynamic" keys, as the user defines them to extract article attributes.
    # --> Validate the dynamic keys manually.
    # To be able to specify a `rule`, nest the config under NESTING_KEY and mark that as `required`.
    new.call(NESTING_KEY => config)
  end
end
122
+ end
123
+ end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module Extractors
##
# Extracts the value of one attribute of the selected element.
#
# Given this +time+ tag carrying a +datetime+ attribute:
#
#   <time datetime="2019-07-01">...</time>
#
# and this YAML configuration:
#
#   selectors:
#     published_at:
#       selector: time
#       extractor: attribute
#       attribute: datetime
#
# the extractor yields:
#   '2019-07-01'
#
# When the attribute holds a date or a time, consider parsing it
# during post processing with {PostProcessors::ParseTime}.
class Attribute
  # The available options for the attribute extractor.
  Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element once, at construction time.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  # @option options [String] :attribute attribute name to extract from the selected element
  def initialize(xml, options)
    @options = options
    @element = Extractors.element(xml, options.selector)
  end

  ##
  # Reads the configured attribute from the selected element.
  #
  # @return [String] the attribute's value; an empty String when +attr+ returns nil
  def get
    attribute_name = @options[:attribute]
    @element.attr(attribute_name).to_s
  end
end
50
+ end
51
+ end
52
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module Extractors
##
# Returns the value of the +href+ attribute, resolved against the
# channel's URL via +Url.from_relative+.
#
# Imagine this +a+ HTML element with a +href+ attribute:
#
#   <a href="/posts/latest-findings">...</a>
#
# YAML usage example:
#   channel:
#     url: http://blog-without-a-feed.example.com
#     ...
#   selectors:
#     link:
#       selector: a
#       extractor: href
#
# Would return:
#   'http://blog-without-a-feed.example.com/posts/latest-findings'
class Href
  # The available options for the href (attribute) extractor.
  Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Initializes the Href extractor.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the link element
  # @option options [Hash{Symbol => Object}] :channel channel configuration, including :url
  def initialize(xml, options)
    @options = options
    @element = Extractors.element(xml, options.selector)
    # Safe navigation keeps a missing attribute as nil. A plain `.to_s` would
    # turn it into '' and make the nil guard in #get unreachable.
    @href = @element.attr('href')&.to_s
  end

  ##
  # Resolves the extracted href against the channel URL.
  #
  # @return [Object, nil] the result of +Url.from_relative+, or nil when the
  #   selected element has no +href+ attribute
  def get
    return nil unless @href

    Url.from_relative(@href, @options.channel[:url])
  end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module Extractors
##
# Extracts the selected element serialized as HTML.
#
# Given this markup:
#
#   <p>Lorem <b>ipsum</b> dolor ...</p>
#
# and this YAML configuration:
#
#   selectors:
#     description:
#       selector: p
#       extractor: html
#
# the extractor yields:
#   '<p>Lorem <b>ipsum</b> dolor ...</p>'
#
# Always ensure to sanitize the HTML during post-processing with
# {PostProcessors::SanitizeHtml}.
class Html
  # The available options for the html extractor.
  Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element once, at construction time.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  def initialize(xml, options)
    css_selector = options.selector
    @element = Extractors.element(xml, css_selector)
  end

  ##
  # Serializes the selected element.
  #
  # @return [String] the element's +to_s+ representation
  def get
    @element.to_s
  end
end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module Extractors
##
# Yields a fixed value taken straight from the options; the document is
# not consulted.
#
# YAML usage example:
#
#   selectors:
#     byline:
#       extractor: static
#       static: Foobar
#
# Would return:
#   'Foobar'
class Static
  # The available option for the static extractor.
  Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Stores the options for later retrieval.
  #
  # @param _xml [nil, Nokogiri::XML::Element] ignored; accepted so all extractors share one signature
  # @param options [Options] options carrying the static value
  # @option options [String, Symbol] :static static value returned by this extractor
  def initialize(_xml, options)
    @options = options
  end

  ##
  # Hands back the configured value untouched.
  #
  # @return [String, Symbol] the static value provided in options
  def get
    @options[:static]
  end
end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module Extractors
##
# Extracts the text content of the selected element. This is the default
# extractor, used when no extractor is explicitly given.
#
# Given this markup:
#
#   <p>Lorem <b>ipsum</b> dolor ...</p>
#
# and this YAML configuration:
#
#   selectors:
#     description:
#       selector: p
#       extractor: text
#
# the extractor yields:
#   'Lorem ipsum dolor ...'
class Text
  # The available options for the text extractor.
  Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element once, at construction time.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  def initialize(xml, options)
    css_selector = options.selector
    @element = Extractors.element(xml, css_selector)
  end

  ##
  # Reads the element's text, trims it and collapses every run of
  # whitespace into a single space.
  #
  # @return [String] the normalized text content
  def get
    trimmed = @element.text.to_s.strip
    trimmed.gsub(/\s+/, ' ')
  end
end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
##
# Provides a namespace for item extractors.
module Extractors
  ##
  # Maps the extractor name to the class implementing the extractor.
  #
  # The key is the name to use in the feed config.
  NAME_TO_CLASS = {
    attribute: Attribute,
    href: Href,
    html: Html,
    static: Static,
    text: Text
  }.freeze

  ##
  # Maps the extractor class to its corresponding options class.
  # Entries are resolved lazily from the extractor's +Options+ constant and
  # cached in this hash on first access.
  ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
    hash[klass] = klass.const_get(:Options)
  end

  # Extractor used when none is explicitly configured.
  DEFAULT_EXTRACTOR = :text

  class << self
    ##
    # Selects from the given node using a CSS selector.
    #
    # @param xml [Nokogiri::XML::Document]
    # @param selector [String, nil]
    # @return [Object] the result of +xml.css(selector)+, or +xml+ itself
    #   when no selector is given
    def element(xml, selector)
      selector ? xml.css(selector) : xml
    end

    ##
    # Builds the configured extractor and returns the value it extracts.
    #
    # @param attribute_options [Hash{Symbol => Object}]
    #   Should contain at least `:extractor` (the name) and required options for that extractor.
    # @param xml [Nokogiri::XML::Document]
    # @return [Object] the value returned by the extractor's +get+
    # @raise [ArgumentError] if `:extractor` names no known extractor
    def get(attribute_options, xml)
      name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR

      # An unknown name used to surface as NoMethodError on nil further down;
      # fail early with a message that lists the valid names instead.
      extractor_class = NAME_TO_CLASS.fetch(name) do
        raise ArgumentError, "Unknown extractor `#{name}`. Use one of: #{NAME_TO_CLASS.keys.join(', ')}"
      end

      options_class = ITEM_OPTION_CLASSES[extractor_class]
      # Only pass the keys the options struct knows about.
      options = options_class.new(attribute_options.slice(*options_class.members))

      extractor_class.new(xml, options).get
    end
  end
end
52
+ end
53
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
+ module Html2rss
6
+ class Selectors
##
# A naive "Object to XML" implementation: renders a Ruby object as an XML
# String. Hashes and Arrays are wrapped in +<object>+ / +<array>+ tags and
# converted recursively; any other value is rendered as HTML-escaped text.
class ObjectToXmlConverter
  # Wrapper tags used for top-level collection conversion.
  OBJECT_TO_XML_TAGS = {
    hash: ['<object>', '</object>'],
    array: ['<array>', '</array>']
  }.freeze

  ##
  # @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
  def initialize(object)
    @object = object
  end

  ##
  # Runs the conversion and logs the size of the result at debug level.
  #
  # @return [String] representing the object in XML
  def call
    xml = object_to_xml(@object)
    Html2rss::Log.debug("#{self.class}: converted object to XML (#{xml.bytesize} bytes)")
    xml
  end

  private

  # Picks the conversion matching the object's type.
  def object_to_xml(object)
    return hash_to_xml(object) if object.is_a?(Hash)
    return array_to_xml(object) if object.is_a?(Array)

    CGI.escapeHTML(object.to_s)
  end

  # Renders each pair as +<key>value</key>+; keys are used verbatim as tag names.
  def hash_to_xml(object)
    opening, closing = OBJECT_TO_XML_TAGS[:hash]
    children = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }

    "#{opening}#{children.join}#{closing}"
  end

  # Renders the converted elements one after another, without item tags.
  def array_to_xml(object)
    opening, closing = OBJECT_TO_XML_TAGS[:array]
    children = object.map { |value| object_to_xml(value) }

    "#{opening}#{children.join}#{closing}"
  end
end
61
+ end
62
+ end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Selectors
5
+ module PostProcessors
##
# Common superclass of all post processors. Subclasses must implement
# `self.validate_args!` and `#get`.
class Base
  # Ensures every listed key is present in the context's options.
  #
  # @param keys [Array<Symbol>] the keys to check for presence
  # @param context [Selectors::Context] the context containing options
  # @return [void]
  # @raise [MissingOption] if any key is missing
  def self.expect_options(keys, context)
    keys.each do |key|
      options = context[:options]
      next if options.key?(key)

      raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [], cause: nil
    end
  end

  # Ensures the value is an instance of (one of) the expected type(s).
  #
  # @param value [Object] the value to check
  # @param types [Array<Class>, Class] the expected type(s)
  # @param name [String] the name of the option being checked
  # @param context [Selectors::Context] call-site context used for richer validation errors
  # @return [void]
  # @raise [InvalidType] if the value is not of the expected type(s)
  def self.assert_type(value, types = [], name, context:)
    expected = Array(types)
    return if expected.any? { |type| value.is_a?(type) }

    # Without usable options, report the calling file's name instead.
    details = if context.respond_to?(:options)
                context.options
              else
                { file: File.basename(caller(1, 1).first.split(':').first) }
              end

    raise InvalidType,
          "The type of `#{name}` must be #{expected.join(' or ')}, but is: #{value.class} in: #{details.inspect}",
          [],
          cause: nil
  end

  ##
  # Hook for validating the arguments handed to the post processor.
  # Must be overridden by subclasses.
  #
  # @param _value [Object] extracted selector value
  # @param _context [Selectors::Context] post-processor execution context
  # @return [void]
  def self.validate_args!(_value, _context)
    raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
  end

  attr_reader :value, :context

  # Checks the context's type, runs the subclass validation, then stores
  # both arguments.
  #
  # @param value [Object] the value to be processed
  # @param context [Selectors::Context] runtime selector context and options
  def initialize(value, context)
    self.class.assert_type(context, Selectors::Context, 'context', context: context)
    self.class.validate_args!(value, context)

    @value = value
    @context = context
  end

  # Produces the transformed value. Must be overridden by subclasses.
  #
  # @return [Object] transformed value
  # @raise [NotImplementedError] if not implemented in subclass
  def get
    raise NotImplementedError, 'You must implement the `get` method in the post processor'
  end
end
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'regexp_parser'
4
+
5
+ module Html2rss
6
+ class Selectors
7
+ module PostProcessors
##
# Imagine this HTML:
#    <h1>Foo bar and boo</h1>
#
# YAML usage example:
#    selectors:
#      title:
#        selector: h1
#        post_process:
#          - name: gsub
#            pattern: boo
#            replacement: baz
#
# Would return:
#    'Foo bar and baz'
#
# `pattern` can be a Regexp or a String. If it is a String, it will remove
# one pair of surrounding slashes ('/') to keep backwards compatibility
# and then parse it to build a Regexp.
#
# `replacement` can be a String or a Hash.
#
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
class Gsub < Base
  # Checks the value and the `pattern` / `replacement` options up front, so
  # a misconfiguration fails here and not later inside String#gsub.
  #
  # @param value [String] extracted selector value
  # @param context [Selectors::Context] post-processor context
  # @return [void]
  # @raise [InvalidType] if value, pattern or replacement has an unsupported type
  # @raise [MissingOption] if `replacement` or `pattern` is absent
  def self.validate_args!(value, context)
    assert_type value, String, :value, context: context
    expect_options(%i[replacement pattern], context)
    assert_type context.dig(:options, :pattern), [String, Regexp], :pattern, context: context
    assert_type context.dig(:options, :replacement), [String, Hash], :replacement, context: context
  end

  ##
  # @param value [String]
  # @param context [Selectors::Context]
  def initialize(value, context)
    super

    options = context[:options]

    @replacement = options[:replacement]
    @pattern = options[:pattern]
  end

  ##
  # Replaces every match of the pattern in the value.
  #
  # @return [String]
  def get
    value.to_s.gsub(pattern, replacement)
  end

  private

  attr_accessor :replacement

  ##
  # Returns the pattern as given, compiling a String pattern on first use
  # and keeping the compiled Regexp for later calls.
  #
  # @return [Regexp]
  def pattern
    @pattern = parse_regexp_string(@pattern) if @pattern.is_a?(String)
    @pattern
  end

  ##
  # Parses the given String and builds a Regexp out of it.
  #
  # It will remove one pair of surrounding slashes ('/') from the String
  # to maintain backwards compatibility before building the Regexp.
  #
  # @param string [String]
  # @return [Regexp]
  # @raise [ArgumentError] if the argument is not a String
  def parse_regexp_string(string)
    raise ArgumentError, 'must be a string!' unless string.is_a?(String)

    # Only remove surrounding slashes if the string has at least 3 characters
    # to avoid issues with single character strings like "/"
    string = string[1..-2] if string.length >= 3 && string.start_with?('/') && string.end_with?('/')
    Regexp::Parser.parse(string, options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE).to_re
  end
end
86
+ end
87
+ end
88
+ end