html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -1,103 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- class Config
5
- ##
6
- # Holds the configurations of the selectors.
7
- class Selectors
8
- ITEMS_SELECTOR_NAME = :items
9
-
10
- # Struct to represent a selector with associated attributes for extraction and processing.
11
- Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
12
- keyword_init: true)
13
-
14
- # raised when an invalid selector name is used
15
- class InvalidSelectorName < Html2rss::Error; end
16
-
17
- ##
18
- # @param config [Hash<Symbol, Object>]
19
- def initialize(config)
20
- validate_config(config)
21
- @config = config
22
- end
23
-
24
- ##
25
- # @param name [Symbol]
26
- # @return [true, false]
27
- def selector?(name)
28
- name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
29
- end
30
-
31
- ##
32
- # @param name [Symbol]
33
- # @return [Selector]
34
- def selector(name)
35
- raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
36
-
37
- keywords = config[name].slice(*available_keys)
38
-
39
- if (additional_keys = keywords.keys - available_keys).any?
40
- Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
41
- end
42
-
43
- Selector.new(keywords)
44
- end
45
-
46
- ##
47
- # @return [Set<Symbol>]
48
- def category_selector_names
49
- selector_keys_for(:categories)
50
- end
51
-
52
- ##
53
- # @return [Set<Symbol>]
54
- def guid_selector_names
55
- selector_keys_for(:guid, default: :title_or_description)
56
- end
57
-
58
- ##
59
- # Returns the CSS/XPath selector.
60
- #
61
- # @param name [Symbol]
62
- # @return [String]
63
- def selector_string(name)
64
- Selector.new(config[name]).selector
65
- end
66
-
67
- ##
68
- # @return [Set<Symbol>]
69
- def item_selector_names
70
- @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
71
- end
72
-
73
- ##
74
- # @return [Symbol, nil]
75
- def items_order
76
- config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
77
- end
78
-
79
- private
80
-
81
- attr_reader :config
82
-
83
- def validate_config(config)
84
- raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
85
- end
86
-
87
- ##
88
- # Returns the selector keys for the selector named `name`. If none, returns [default].
89
- #
90
- # @param name [Symbol]
91
- # @param default [String, Symbol]
92
- # @return [Set<Symbol>]
93
- def selector_keys_for(name, default: nil)
94
- config.fetch(name) { Array(default) }.tap do |array|
95
- array.reject! { |entry| entry.to_s == '' }
96
- array.map!(&:to_sym)
97
- end.to_set
98
- end
99
-
100
- def available_keys = @available_keys ||= Selector.members
101
- end
102
- end
103
- end
data/lib/html2rss/item.rb DELETED
@@ -1,186 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'nokogiri'
4
-
5
- module Html2rss
6
- ##
7
- # Takes the selected Nokogiri::HTML and responds to accessor names
8
- # defined in the feed config.
9
- #
10
- # Instances can only be created via `.from_url` and
11
- # each represents an internally used "RSS item".
12
- # Such an item provides dynamically defined attributes as methods.
13
- class Item
14
- # A context instance is passed to Item Extractors.
15
- Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
16
- # Class to keep an Item's <enclosure>.
17
- Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
18
-
19
- ##
20
- # Fetches items from a given URL using configuration settings.
21
- #
22
- # @param url [Addressable::URI] URL to fetch items from.
23
- # @param config [Html2rss::Config] Configuration object.
24
- # @return [Array<Html2rss::Item>] list of items fetched.
25
- def self.from_url(url, config)
26
- ctx = RequestService::Context.new(url:, headers: config.headers)
27
-
28
- body = RequestService.execute(ctx, strategy: config.strategy).body
29
- body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
30
-
31
- Nokogiri.HTML(body)
32
- .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
33
- .map { |xml| new(xml, config) }
34
- .select(&:valid?)
35
- end
36
-
37
- ##
38
- # @param xml [Nokogiri::XML::Element]
39
- # @param config [Html2rss::Config]
40
- def initialize(xml, config)
41
- @xml = xml
42
- @config = config
43
- end
44
-
45
- private_class_method :new
46
-
47
- ##
48
- # Checks if the object responds to a method dynamically based on the configuration.
49
- #
50
- # @param method_name [Symbol]
51
- # @param _include_private [true, false]
52
- # @return [true, false]
53
- # :reek:BooleanParameter { enabled: false }
54
- def respond_to_missing?(method_name, _include_private = false)
55
- config.selector?(method_name) || super
56
- end
57
-
58
- ##
59
- # Dynamically extracts data based on the method name.
60
- #
61
- # @param method_name [Symbol]
62
- # @param _args [Array]
63
- # @return [String] extracted value for the selector.
64
- def method_missing(method_name, *_args)
65
- return super unless respond_to_missing?(method_name)
66
-
67
- extract(method_name)
68
- end
69
-
70
- ##
71
- # Selects and processes data according to the selector name.
72
- #
73
- # @param tag [Symbol]
74
- # @return [String] the extracted value for the selector.
75
- def extract(tag)
76
- attribute_options = config.selector_attributes_with_channel(tag.to_sym)
77
-
78
- post_process(
79
- ItemExtractors.item_extractor_factory(attribute_options, xml).get,
80
- attribute_options.fetch(:post_process, false)
81
- )
82
- end
83
-
84
- ##
85
- # Checks if the item is valid accordin to RSS 2.0 spec,
86
- # by ensuring it has at least a title or a description.
87
- #
88
- # @return [true, false]
89
- def valid?
90
- title_or_description.to_s != ''
91
- end
92
-
93
- ##
94
- # Returns either the title or the description, preferring title if available.
95
- #
96
- # @return [String, nil]
97
- def title_or_description
98
- return title if config.selector?(:title)
99
-
100
- description if config.selector?(:description)
101
- end
102
-
103
- ##
104
- #
105
- # @return [String] SHA1 hashed GUID.
106
- def guid
107
- content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
108
-
109
- Digest::SHA1.hexdigest(content)
110
- end
111
-
112
- ##
113
- # Retrieves categories for the item based on configured category selectors.
114
- #
115
- # @return [Array<String>] list of categories.
116
- def categories
117
- config.category_selector_names
118
- .filter_map do |method_name|
119
- category = public_send(method_name)
120
- category.strip unless category.to_s.empty?
121
- end.uniq
122
- end
123
-
124
- ##
125
- # Checks if the item has an enclosure based on configuration.
126
- #
127
- # @return [true, false]
128
- def enclosure?
129
- config.selector?(:enclosure)
130
- end
131
-
132
- ##
133
- # Retrieves enclosure details for the item.
134
- #
135
- # @return [Enclosure] enclosure details.
136
- def enclosure
137
- url = enclosure_url
138
-
139
- raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
140
-
141
- type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
142
- Html2rss::Utils.guess_content_type_from_url(url)
143
-
144
- Enclosure.new(
145
- type:,
146
- bits_length: 0,
147
- url: url.to_s
148
- )
149
- end
150
-
151
- private
152
-
153
- # @return [Nokogiri::XML::Element] XML element representing the item.
154
- attr_reader :xml
155
- # @return [Html2rss::Config] Configuration object for the item.
156
- attr_reader :config
157
-
158
- ##
159
- # Processes the extracted value according to post-processing options.
160
- #
161
- # @param value [String] extracted value.
162
- # @param post_process_options [Hash<Symbol, Object>] post-processing options.
163
- # @return [String] processed value.
164
- def post_process(value, post_process_options)
165
- return value unless post_process_options
166
-
167
- [post_process_options].flatten.each do |options|
168
- value = AttributePostProcessors.get_processor(options[:name])
169
- .new(value, Context.new(options:, item: self, config:))
170
- .get
171
- end
172
-
173
- value
174
- end
175
-
176
- ##
177
- # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
178
- #
179
- # @return [Addressable::URI, nil] absolute URL of the enclosure.
180
- def enclosure_url
181
- enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
182
-
183
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
184
- end
185
- end
186
- end
@@ -1,50 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns the value of the attribute.
7
- #
8
- # Imagine this +time+ HTML tag with a +datetime+ attribute:
9
- #
10
- # <time datetime="2019-07-01">...</time>
11
- #
12
- # YAML usage example:
13
- #
14
- # selectors:
15
- # link:
16
- # selector: time
17
- # extractor: attribute
18
- # attribute: datetime
19
- #
20
- # Would return:
21
- # '2019-07-01'
22
- #
23
- # In case you're extracting a date or a time, consider parsing it
24
- # during post processing with {AttributePostProcessors::ParseTime}.
25
- class Attribute
26
- # The available options for the attribute extractor.
27
- Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
28
-
29
- ##
30
- # Initializes the Attribute extractor.
31
- #
32
- # @param xml [Nokogiri::XML::Element]
33
- # @param options [Options]
34
- def initialize(xml, options)
35
- @options = options
36
- @element = ItemExtractors.element(xml, options.selector)
37
- end
38
-
39
- ##
40
- # Retrieves and returns the attribute's value as a string.
41
- #
42
- # @return [String] The value of the attribute.
43
- def get
44
- @element.attr(@options.attribute).to_s.freeze
45
- rescue NoMethodError => error
46
- raise "Failed to extract attribute: #{error.message}"
47
- end
48
- end
49
- end
50
- end
@@ -1,52 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns the value of the +href+ attribute.
7
- # It always returns absolute URLs. If the extracted +href+ value is a
8
- # relative URL, it prepends the channel's URL.
9
- #
10
- # Imagine this +a+ HTML element with a +href+ attribute:
11
- #
12
- # <a href="/posts/latest-findings">...</a>
13
- #
14
- # YAML usage example:
15
- # channel:
16
- # url: http://blog-without-a-feed.example.com
17
- # ...
18
- # selectors:
19
- # link:
20
- # selector: a
21
- # extractor: href
22
- #
23
- # Would return:
24
- # 'http://blog-without-a-feed.example.com/posts/latest-findings'
25
- class Href
26
- # The available options for the href (attribute) extractor.
27
- Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
28
-
29
- ##
30
- # Initializes the Href extractor.
31
- #
32
- # @param xml [Nokogiri::XML::Element]
33
- # @param options [Options]
34
- def initialize(xml, options)
35
- @options = options
36
- @element = ItemExtractors.element(xml, options.selector)
37
- @href = @element.attr('href').to_s
38
- end
39
-
40
- ##
41
- # Retrieves and returns the normalized absolute URL.
42
- #
43
- # @return [String] The absolute URL.
44
- def get
45
- return nil unless @href
46
-
47
- sanitized_href = Html2rss::Utils.sanitize_url(@href)
48
- Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
49
- end
50
- end
51
- end
52
- end
@@ -1,46 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns the HTML content of the specified element.
7
- #
8
- # Example HTML structure:
9
- #
10
- # <p>Lorem <b>ipsum</b> dolor ...</p>
11
- #
12
- # YAML usage example:
13
- #
14
- # selectors:
15
- # description:
16
- # selector: p
17
- # extractor: html
18
- #
19
- # Would return:
20
- # '<p>Lorem <b>ipsum</b> dolor ...</p>'
21
- #
22
- # Always ensure to sanitize the HTML during post-processing with
23
- # {AttributePostProcessors::SanitizeHtml}.
24
- class Html
25
- # The available options for the html extractor.
26
- Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
27
-
28
- ##
29
- # Initializes the Html extractor.
30
- #
31
- # @param xml [Nokogiri::XML::Element]
32
- # @param options [Options]
33
- def initialize(xml, options)
34
- @element = ItemExtractors.element(xml, options.selector)
35
- end
36
-
37
- ##
38
- # Retrieves and returns the HTML content of the element.
39
- #
40
- # @return [String] The HTML content.
41
- def get
42
- @element.to_s
43
- end
44
- end
45
- end
46
- end
@@ -1,39 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Returns a static value provided in the options.
7
- #
8
- # Example usage in YAML:
9
- #
10
- # selectors:
11
- # author:
12
- # extractor: static
13
- # static: Foobar
14
- #
15
- # Would return:
16
- # 'Foobar'
17
- class Static
18
- # The available option for the static extractor.
19
- Options = Struct.new('StaticOptions', :static, keyword_init: true)
20
-
21
- ##
22
- # Initializes the Static extractor.
23
- #
24
- # @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
25
- # @param options [Options] Options containing the static value.
26
- def initialize(_xml, options)
27
- @options = options
28
- end
29
-
30
- ##
31
- # Retrieves and returns the static value.
32
- #
33
- # @return [String, Symbol] The static value provided in options.
34
- def get
35
- @options.static
36
- end
37
- end
38
- end
39
- end
@@ -1,44 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- module ItemExtractors
5
- ##
6
- # Return the text content of the attribute. This is the default extractor used,
7
- # when no extractor is explicitly given.
8
- #
9
- # Example HTML structure:
10
- #
11
- # <p>Lorem <b>ipsum</b> dolor ...</p>
12
- #
13
- # YAML usage example:
14
- #
15
- # selectors:
16
- # description:
17
- # selector: p
18
- # extractor: text
19
- #
20
- # Would return:
21
- # 'Lorem ipsum dolor ...'
22
- class Text
23
- # The available options for the text extractor.
24
- Options = Struct.new('TextOptions', :selector, keyword_init: true)
25
-
26
- ##
27
- # Initializes the Text extractor.
28
- #
29
- # @param xml [Nokogiri::XML::Element]
30
- # @param options [Options]
31
- def initialize(xml, options)
32
- @element = ItemExtractors.element(xml, options.selector)
33
- end
34
-
35
- ##
36
- # Retrieves and returns the text content of the element.
37
- #
38
- # @return [String] The text content.
39
- def get
40
- @element.text.to_s.strip.gsub(/\s+/, ' ')
41
- end
42
- end
43
- end
44
- end
@@ -1,88 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Html2rss
4
- ##
5
- # Provides a namespace for item extractors.
6
- module ItemExtractors
7
- ##
8
- # The Error class to be thrown when an unknown extractor name is requested.
9
- class UnknownExtractorName < Html2rss::Error; end
10
-
11
- ##
12
- # Maps the extractor name to the class implementing the extractor.
13
- #
14
- # The key is the name to use in the feed config.
15
- NAME_TO_CLASS = {
16
- attribute: Attribute,
17
- href: Href,
18
- html: Html,
19
- static: Static,
20
- text: Text
21
- }.freeze
22
-
23
- ##
24
- # Maps the extractor class to its corresponding options class.
25
- ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
26
- hash[klass] = klass.const_get(:Options)
27
- end
28
-
29
- DEFAULT_EXTRACTOR = :text
30
-
31
- ##
32
- # Retrieves an element from Nokogiri XML based on the selector.
33
- #
34
- # @param xml [Nokogiri::XML::Document]
35
- # @param selector [String, nil]
36
- # @return [Nokogiri::XML::ElementSet] selected XML elements
37
- def self.element(xml, selector)
38
- selector ? xml.css(selector) : xml
39
- end
40
-
41
- ##
42
- # Creates an instance of the requested item extractor.
43
- #
44
- # @param attribute_options [Hash<Symbol, Object>]
45
- # Should contain at least `:extractor` (the name) and required options for that extractor.
46
- # @param xml [Nokogiri::XML::Document]
47
- # @return [Object] instance of the specified item extractor class
48
- def self.item_extractor_factory(attribute_options, xml)
49
- extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR
50
- extractor_class = find_extractor_class(extractor_name)
51
- options_instance = build_options_instance(extractor_class, attribute_options)
52
- create_extractor_instance(extractor_class, xml, options_instance)
53
- end
54
-
55
- ##
56
- # Finds the extractor class based on the name.
57
- #
58
- # @param extractor_name [Symbol] the name of the extractor
59
- # @return [Class] the class implementing the extractor
60
- # @raise [UnknownExtractorName] if the extractor class is not found
61
- def self.find_extractor_class(extractor_name)
62
- NAME_TO_CLASS[extractor_name] || raise(UnknownExtractorName,
63
- "Unknown extractor name '#{extractor_name}' requested in NAME_TO_CLASS")
64
- end
65
-
66
- ##
67
- # Builds the options instance for the extractor class.
68
- #
69
- # @param extractor_class [Class] the class implementing the extractor
70
- # @param attribute_options [Hash<Symbol, Object>] the attribute options
71
- # @return [Object] an instance of the options class for the extractor
72
- def self.build_options_instance(extractor_class, attribute_options)
73
- options = attribute_options.slice(*extractor_class::Options.members)
74
- ITEM_OPTION_CLASSES[extractor_class].new(options)
75
- end
76
-
77
- ##
78
- # Creates an instance of the extractor class.
79
- #
80
- # @param extractor_class [Class] the class implementing the extractor
81
- # @param xml [Nokogiri::XML::Document] the XML document
82
- # @param options_instance [Object] the options instance
83
- # @return [Object] an instance of the extractor class
84
- def self.create_extractor_instance(extractor_class, xml, options_instance)
85
- extractor_class.new(xml, options_instance)
86
- end
87
- end
88
- end
@@ -1,56 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'cgi'
4
- require 'json'
5
-
6
- module Html2rss
7
- ##
8
- # A naive implementation of "Object to XML": converts a Ruby object to XML format.
9
- class ObjectToXmlConverter
10
- OBJECT_TO_XML_TAGS = {
11
- hash: ['<object>', '</object>'],
12
- enumerable: ['<array>', '</array>']
13
- }.freeze
14
-
15
- ##
16
- # @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
17
- def initialize(object)
18
- @object = object
19
- end
20
-
21
- ##
22
- # Converts the object to XML format.
23
- #
24
- # @return [String] representing the object in XML
25
- def call
26
- object_to_xml(@object)
27
- end
28
-
29
- private
30
-
31
- def object_to_xml(object)
32
- case object
33
- when Hash
34
- hash_to_xml(object)
35
- when Enumerable
36
- enumerable_to_xml(object)
37
- else
38
- CGI.escapeHTML(object.to_s)
39
- end
40
- end
41
-
42
- def hash_to_xml(object)
43
- prefix, suffix = OBJECT_TO_XML_TAGS[:hash]
44
- inner_xml = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }.join
45
-
46
- "#{prefix}#{inner_xml}#{suffix}"
47
- end
48
-
49
- def enumerable_to_xml(object)
50
- prefix, suffix = OBJECT_TO_XML_TAGS[:enumerable]
51
- inner_xml = object.map { |value| object_to_xml(value) }.join
52
-
53
- "#{prefix}#{inner_xml}#{suffix}"
54
- end
55
- end
56
- end