html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
class Config
|
|
5
|
-
##
|
|
6
|
-
# Holds the configurations of the selectors.
|
|
7
|
-
class Selectors
|
|
8
|
-
ITEMS_SELECTOR_NAME = :items
|
|
9
|
-
|
|
10
|
-
# Struct to represent a selector with associated attributes for extraction and processing.
|
|
11
|
-
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
|
|
12
|
-
keyword_init: true)
|
|
13
|
-
|
|
14
|
-
# raised when an invalid selector name is used
|
|
15
|
-
class InvalidSelectorName < Html2rss::Error; end
|
|
16
|
-
|
|
17
|
-
##
|
|
18
|
-
# @param config [Hash<Symbol, Object>]
|
|
19
|
-
def initialize(config)
|
|
20
|
-
validate_config(config)
|
|
21
|
-
@config = config
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
##
|
|
25
|
-
# @param name [Symbol]
|
|
26
|
-
# @return [true, false]
|
|
27
|
-
def selector?(name)
|
|
28
|
-
name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
##
|
|
32
|
-
# @param name [Symbol]
|
|
33
|
-
# @return [Selector]
|
|
34
|
-
def selector(name)
|
|
35
|
-
raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
|
|
36
|
-
|
|
37
|
-
keywords = config[name].slice(*available_keys)
|
|
38
|
-
|
|
39
|
-
if (additional_keys = keywords.keys - available_keys).any?
|
|
40
|
-
Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
Selector.new(keywords)
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
##
|
|
47
|
-
# @return [Set<Symbol>]
|
|
48
|
-
def category_selector_names
|
|
49
|
-
selector_keys_for(:categories)
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
##
|
|
53
|
-
# @return [Set<Symbol>]
|
|
54
|
-
def guid_selector_names
|
|
55
|
-
selector_keys_for(:guid, default: :title_or_description)
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
##
|
|
59
|
-
# Returns the CSS/XPath selector.
|
|
60
|
-
#
|
|
61
|
-
# @param name [Symbol]
|
|
62
|
-
# @return [String]
|
|
63
|
-
def selector_string(name)
|
|
64
|
-
Selector.new(config[name]).selector
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
##
|
|
68
|
-
# @return [Set<Symbol>]
|
|
69
|
-
def item_selector_names
|
|
70
|
-
@item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
##
|
|
74
|
-
# @return [Symbol, nil]
|
|
75
|
-
def items_order
|
|
76
|
-
config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
private
|
|
80
|
-
|
|
81
|
-
attr_reader :config
|
|
82
|
-
|
|
83
|
-
def validate_config(config)
|
|
84
|
-
raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
##
|
|
88
|
-
# Returns the selector keys for the selector named `name`. If none, returns [default].
|
|
89
|
-
#
|
|
90
|
-
# @param name [Symbol]
|
|
91
|
-
# @param default [String, Symbol]
|
|
92
|
-
# @return [Set<Symbol>]
|
|
93
|
-
def selector_keys_for(name, default: nil)
|
|
94
|
-
config.fetch(name) { Array(default) }.tap do |array|
|
|
95
|
-
array.reject! { |entry| entry.to_s == '' }
|
|
96
|
-
array.map!(&:to_sym)
|
|
97
|
-
end.to_set
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
def available_keys = @available_keys ||= Selector.members
|
|
101
|
-
end
|
|
102
|
-
end
|
|
103
|
-
end
|
data/lib/html2rss/item.rb
DELETED
|
@@ -1,186 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'nokogiri'
|
|
4
|
-
|
|
5
|
-
module Html2rss
|
|
6
|
-
##
|
|
7
|
-
# Takes the selected Nokogiri::HTML and responds to accessor names
|
|
8
|
-
# defined in the feed config.
|
|
9
|
-
#
|
|
10
|
-
# Instances can only be created via `.from_url` and
|
|
11
|
-
# each represents an internally used "RSS item".
|
|
12
|
-
# Such an item provides dynamically defined attributes as methods.
|
|
13
|
-
class Item
|
|
14
|
-
# A context instance is passed to Item Extractors.
|
|
15
|
-
Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
|
|
16
|
-
# Class to keep an Item's <enclosure>.
|
|
17
|
-
Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
|
|
18
|
-
|
|
19
|
-
##
|
|
20
|
-
# Fetches items from a given URL using configuration settings.
|
|
21
|
-
#
|
|
22
|
-
# @param url [Addressable::URI] URL to fetch items from.
|
|
23
|
-
# @param config [Html2rss::Config] Configuration object.
|
|
24
|
-
# @return [Array<Html2rss::Item>] list of items fetched.
|
|
25
|
-
def self.from_url(url, config)
|
|
26
|
-
ctx = RequestService::Context.new(url:, headers: config.headers)
|
|
27
|
-
|
|
28
|
-
body = RequestService.execute(ctx, strategy: config.strategy).body
|
|
29
|
-
body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
|
|
30
|
-
|
|
31
|
-
Nokogiri.HTML(body)
|
|
32
|
-
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
|
33
|
-
.map { |xml| new(xml, config) }
|
|
34
|
-
.select(&:valid?)
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
##
|
|
38
|
-
# @param xml [Nokogiri::XML::Element]
|
|
39
|
-
# @param config [Html2rss::Config]
|
|
40
|
-
def initialize(xml, config)
|
|
41
|
-
@xml = xml
|
|
42
|
-
@config = config
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
private_class_method :new
|
|
46
|
-
|
|
47
|
-
##
|
|
48
|
-
# Checks if the object responds to a method dynamically based on the configuration.
|
|
49
|
-
#
|
|
50
|
-
# @param method_name [Symbol]
|
|
51
|
-
# @param _include_private [true, false]
|
|
52
|
-
# @return [true, false]
|
|
53
|
-
# :reek:BooleanParameter { enabled: false }
|
|
54
|
-
def respond_to_missing?(method_name, _include_private = false)
|
|
55
|
-
config.selector?(method_name) || super
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
##
|
|
59
|
-
# Dynamically extracts data based on the method name.
|
|
60
|
-
#
|
|
61
|
-
# @param method_name [Symbol]
|
|
62
|
-
# @param _args [Array]
|
|
63
|
-
# @return [String] extracted value for the selector.
|
|
64
|
-
def method_missing(method_name, *_args)
|
|
65
|
-
return super unless respond_to_missing?(method_name)
|
|
66
|
-
|
|
67
|
-
extract(method_name)
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
##
|
|
71
|
-
# Selects and processes data according to the selector name.
|
|
72
|
-
#
|
|
73
|
-
# @param tag [Symbol]
|
|
74
|
-
# @return [String] the extracted value for the selector.
|
|
75
|
-
def extract(tag)
|
|
76
|
-
attribute_options = config.selector_attributes_with_channel(tag.to_sym)
|
|
77
|
-
|
|
78
|
-
post_process(
|
|
79
|
-
ItemExtractors.item_extractor_factory(attribute_options, xml).get,
|
|
80
|
-
attribute_options.fetch(:post_process, false)
|
|
81
|
-
)
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
##
|
|
85
|
-
# Checks if the item is valid according to the RSS 2.0 spec,
|
|
86
|
-
# by ensuring it has at least a title or a description.
|
|
87
|
-
#
|
|
88
|
-
# @return [true, false]
|
|
89
|
-
def valid?
|
|
90
|
-
title_or_description.to_s != ''
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
##
|
|
94
|
-
# Returns either the title or the description, preferring title if available.
|
|
95
|
-
#
|
|
96
|
-
# @return [String, nil]
|
|
97
|
-
def title_or_description
|
|
98
|
-
return title if config.selector?(:title)
|
|
99
|
-
|
|
100
|
-
description if config.selector?(:description)
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
##
|
|
104
|
-
#
|
|
105
|
-
# @return [String] SHA1 hashed GUID.
|
|
106
|
-
def guid
|
|
107
|
-
content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
|
|
108
|
-
|
|
109
|
-
Digest::SHA1.hexdigest(content)
|
|
110
|
-
end
|
|
111
|
-
|
|
112
|
-
##
|
|
113
|
-
# Retrieves categories for the item based on configured category selectors.
|
|
114
|
-
#
|
|
115
|
-
# @return [Array<String>] list of categories.
|
|
116
|
-
def categories
|
|
117
|
-
config.category_selector_names
|
|
118
|
-
.filter_map do |method_name|
|
|
119
|
-
category = public_send(method_name)
|
|
120
|
-
category.strip unless category.to_s.empty?
|
|
121
|
-
end.uniq
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
##
|
|
125
|
-
# Checks if the item has an enclosure based on configuration.
|
|
126
|
-
#
|
|
127
|
-
# @return [true, false]
|
|
128
|
-
def enclosure?
|
|
129
|
-
config.selector?(:enclosure)
|
|
130
|
-
end
|
|
131
|
-
|
|
132
|
-
##
|
|
133
|
-
# Retrieves enclosure details for the item.
|
|
134
|
-
#
|
|
135
|
-
# @return [Enclosure] enclosure details.
|
|
136
|
-
def enclosure
|
|
137
|
-
url = enclosure_url
|
|
138
|
-
|
|
139
|
-
raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
|
|
140
|
-
|
|
141
|
-
type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
|
|
142
|
-
Html2rss::Utils.guess_content_type_from_url(url)
|
|
143
|
-
|
|
144
|
-
Enclosure.new(
|
|
145
|
-
type:,
|
|
146
|
-
bits_length: 0,
|
|
147
|
-
url: url.to_s
|
|
148
|
-
)
|
|
149
|
-
end
|
|
150
|
-
|
|
151
|
-
private
|
|
152
|
-
|
|
153
|
-
# @return [Nokogiri::XML::Element] XML element representing the item.
|
|
154
|
-
attr_reader :xml
|
|
155
|
-
# @return [Html2rss::Config] Configuration object for the item.
|
|
156
|
-
attr_reader :config
|
|
157
|
-
|
|
158
|
-
##
|
|
159
|
-
# Processes the extracted value according to post-processing options.
|
|
160
|
-
#
|
|
161
|
-
# @param value [String] extracted value.
|
|
162
|
-
# @param post_process_options [Hash<Symbol, Object>] post-processing options.
|
|
163
|
-
# @return [String] processed value.
|
|
164
|
-
def post_process(value, post_process_options)
|
|
165
|
-
return value unless post_process_options
|
|
166
|
-
|
|
167
|
-
[post_process_options].flatten.each do |options|
|
|
168
|
-
value = AttributePostProcessors.get_processor(options[:name])
|
|
169
|
-
.new(value, Context.new(options:, item: self, config:))
|
|
170
|
-
.get
|
|
171
|
-
end
|
|
172
|
-
|
|
173
|
-
value
|
|
174
|
-
end
|
|
175
|
-
|
|
176
|
-
##
|
|
177
|
-
# Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
|
|
178
|
-
#
|
|
179
|
-
# @return [Addressable::URI, nil] absolute URL of the enclosure.
|
|
180
|
-
def enclosure_url
|
|
181
|
-
enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
|
|
182
|
-
|
|
183
|
-
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
|
|
184
|
-
end
|
|
185
|
-
end
|
|
186
|
-
end
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module ItemExtractors
|
|
5
|
-
##
|
|
6
|
-
# Returns the value of the attribute.
|
|
7
|
-
#
|
|
8
|
-
# Imagine this +time+ HTML tag with a +datetime+ attribute:
|
|
9
|
-
#
|
|
10
|
-
# <time datetime="2019-07-01">...</time>
|
|
11
|
-
#
|
|
12
|
-
# YAML usage example:
|
|
13
|
-
#
|
|
14
|
-
# selectors:
|
|
15
|
-
# link:
|
|
16
|
-
# selector: time
|
|
17
|
-
# extractor: attribute
|
|
18
|
-
# attribute: datetime
|
|
19
|
-
#
|
|
20
|
-
# Would return:
|
|
21
|
-
# '2019-07-01'
|
|
22
|
-
#
|
|
23
|
-
# In case you're extracting a date or a time, consider parsing it
|
|
24
|
-
# during post processing with {AttributePostProcessors::ParseTime}.
|
|
25
|
-
class Attribute
|
|
26
|
-
# The available options for the attribute extractor.
|
|
27
|
-
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
|
|
28
|
-
|
|
29
|
-
##
|
|
30
|
-
# Initializes the Attribute extractor.
|
|
31
|
-
#
|
|
32
|
-
# @param xml [Nokogiri::XML::Element]
|
|
33
|
-
# @param options [Options]
|
|
34
|
-
def initialize(xml, options)
|
|
35
|
-
@options = options
|
|
36
|
-
@element = ItemExtractors.element(xml, options.selector)
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
##
|
|
40
|
-
# Retrieves and returns the attribute's value as a string.
|
|
41
|
-
#
|
|
42
|
-
# @return [String] The value of the attribute.
|
|
43
|
-
def get
|
|
44
|
-
@element.attr(@options.attribute).to_s.freeze
|
|
45
|
-
rescue NoMethodError => error
|
|
46
|
-
raise "Failed to extract attribute: #{error.message}"
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module ItemExtractors
|
|
5
|
-
##
|
|
6
|
-
# Returns the value of the +href+ attribute.
|
|
7
|
-
# It always returns absolute URLs. If the extracted +href+ value is a
|
|
8
|
-
# relative URL, it prepends the channel's URL.
|
|
9
|
-
#
|
|
10
|
-
# Imagine this +a+ HTML element with a +href+ attribute:
|
|
11
|
-
#
|
|
12
|
-
# <a href="/posts/latest-findings">...</a>
|
|
13
|
-
#
|
|
14
|
-
# YAML usage example:
|
|
15
|
-
# channel:
|
|
16
|
-
# url: http://blog-without-a-feed.example.com
|
|
17
|
-
# ...
|
|
18
|
-
# selectors:
|
|
19
|
-
# link:
|
|
20
|
-
# selector: a
|
|
21
|
-
# extractor: href
|
|
22
|
-
#
|
|
23
|
-
# Would return:
|
|
24
|
-
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
|
25
|
-
class Href
|
|
26
|
-
# The available options for the href (attribute) extractor.
|
|
27
|
-
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
|
|
28
|
-
|
|
29
|
-
##
|
|
30
|
-
# Initializes the Href extractor.
|
|
31
|
-
#
|
|
32
|
-
# @param xml [Nokogiri::XML::Element]
|
|
33
|
-
# @param options [Options]
|
|
34
|
-
def initialize(xml, options)
|
|
35
|
-
@options = options
|
|
36
|
-
@element = ItemExtractors.element(xml, options.selector)
|
|
37
|
-
@href = @element.attr('href').to_s
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
##
|
|
41
|
-
# Retrieves and returns the normalized absolute URL.
|
|
42
|
-
#
|
|
43
|
-
# @return [String] The absolute URL.
|
|
44
|
-
def get
|
|
45
|
-
return nil unless @href
|
|
46
|
-
|
|
47
|
-
sanitized_href = Html2rss::Utils.sanitize_url(@href)
|
|
48
|
-
Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
end
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module ItemExtractors
|
|
5
|
-
##
|
|
6
|
-
# Returns the HTML content of the specified element.
|
|
7
|
-
#
|
|
8
|
-
# Example HTML structure:
|
|
9
|
-
#
|
|
10
|
-
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
11
|
-
#
|
|
12
|
-
# YAML usage example:
|
|
13
|
-
#
|
|
14
|
-
# selectors:
|
|
15
|
-
# description:
|
|
16
|
-
# selector: p
|
|
17
|
-
# extractor: html
|
|
18
|
-
#
|
|
19
|
-
# Would return:
|
|
20
|
-
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
|
21
|
-
#
|
|
22
|
-
# Always ensure to sanitize the HTML during post-processing with
|
|
23
|
-
# {AttributePostProcessors::SanitizeHtml}.
|
|
24
|
-
class Html
|
|
25
|
-
# The available options for the html extractor.
|
|
26
|
-
Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
|
|
27
|
-
|
|
28
|
-
##
|
|
29
|
-
# Initializes the Html extractor.
|
|
30
|
-
#
|
|
31
|
-
# @param xml [Nokogiri::XML::Element]
|
|
32
|
-
# @param options [Options]
|
|
33
|
-
def initialize(xml, options)
|
|
34
|
-
@element = ItemExtractors.element(xml, options.selector)
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
##
|
|
38
|
-
# Retrieves and returns the HTML content of the element.
|
|
39
|
-
#
|
|
40
|
-
# @return [String] The HTML content.
|
|
41
|
-
def get
|
|
42
|
-
@element.to_s
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module ItemExtractors
|
|
5
|
-
##
|
|
6
|
-
# Returns a static value provided in the options.
|
|
7
|
-
#
|
|
8
|
-
# Example usage in YAML:
|
|
9
|
-
#
|
|
10
|
-
# selectors:
|
|
11
|
-
# author:
|
|
12
|
-
# extractor: static
|
|
13
|
-
# static: Foobar
|
|
14
|
-
#
|
|
15
|
-
# Would return:
|
|
16
|
-
# 'Foobar'
|
|
17
|
-
class Static
|
|
18
|
-
# The available option for the static extractor.
|
|
19
|
-
Options = Struct.new('StaticOptions', :static, keyword_init: true)
|
|
20
|
-
|
|
21
|
-
##
|
|
22
|
-
# Initializes the Static extractor.
|
|
23
|
-
#
|
|
24
|
-
# @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
|
|
25
|
-
# @param options [Options] Options containing the static value.
|
|
26
|
-
def initialize(_xml, options)
|
|
27
|
-
@options = options
|
|
28
|
-
end
|
|
29
|
-
|
|
30
|
-
##
|
|
31
|
-
# Retrieves and returns the static value.
|
|
32
|
-
#
|
|
33
|
-
# @return [String, Symbol] The static value provided in options.
|
|
34
|
-
def get
|
|
35
|
-
@options.static
|
|
36
|
-
end
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
end
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module ItemExtractors
|
|
5
|
-
##
|
|
6
|
-
# Return the text content of the attribute. This is the default extractor used,
|
|
7
|
-
# when no extractor is explicitly given.
|
|
8
|
-
#
|
|
9
|
-
# Example HTML structure:
|
|
10
|
-
#
|
|
11
|
-
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
12
|
-
#
|
|
13
|
-
# YAML usage example:
|
|
14
|
-
#
|
|
15
|
-
# selectors:
|
|
16
|
-
# description:
|
|
17
|
-
# selector: p
|
|
18
|
-
# extractor: text
|
|
19
|
-
#
|
|
20
|
-
# Would return:
|
|
21
|
-
# 'Lorem ipsum dolor ...'
|
|
22
|
-
class Text
|
|
23
|
-
# The available options for the text extractor.
|
|
24
|
-
Options = Struct.new('TextOptions', :selector, keyword_init: true)
|
|
25
|
-
|
|
26
|
-
##
|
|
27
|
-
# Initializes the Text extractor.
|
|
28
|
-
#
|
|
29
|
-
# @param xml [Nokogiri::XML::Element]
|
|
30
|
-
# @param options [Options]
|
|
31
|
-
def initialize(xml, options)
|
|
32
|
-
@element = ItemExtractors.element(xml, options.selector)
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
##
|
|
36
|
-
# Retrieves and returns the text content of the element.
|
|
37
|
-
#
|
|
38
|
-
# @return [String] The text content.
|
|
39
|
-
def get
|
|
40
|
-
@element.text.to_s.strip.gsub(/\s+/, ' ')
|
|
41
|
-
end
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
end
|
|
@@ -1,88 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
##
|
|
5
|
-
# Provides a namespace for item extractors.
|
|
6
|
-
module ItemExtractors
|
|
7
|
-
##
|
|
8
|
-
# The Error class to be thrown when an unknown extractor name is requested.
|
|
9
|
-
class UnknownExtractorName < Html2rss::Error; end
|
|
10
|
-
|
|
11
|
-
##
|
|
12
|
-
# Maps the extractor name to the class implementing the extractor.
|
|
13
|
-
#
|
|
14
|
-
# The key is the name to use in the feed config.
|
|
15
|
-
NAME_TO_CLASS = {
|
|
16
|
-
attribute: Attribute,
|
|
17
|
-
href: Href,
|
|
18
|
-
html: Html,
|
|
19
|
-
static: Static,
|
|
20
|
-
text: Text
|
|
21
|
-
}.freeze
|
|
22
|
-
|
|
23
|
-
##
|
|
24
|
-
# Maps the extractor class to its corresponding options class.
|
|
25
|
-
ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
|
|
26
|
-
hash[klass] = klass.const_get(:Options)
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
DEFAULT_EXTRACTOR = :text
|
|
30
|
-
|
|
31
|
-
##
|
|
32
|
-
# Retrieves an element from Nokogiri XML based on the selector.
|
|
33
|
-
#
|
|
34
|
-
# @param xml [Nokogiri::XML::Document]
|
|
35
|
-
# @param selector [String, nil]
|
|
36
|
-
# @return [Nokogiri::XML::NodeSet, Nokogiri::XML::Node] the nodes matching the selector, or +xml+ itself when no selector is given
|
|
37
|
-
def self.element(xml, selector)
|
|
38
|
-
selector ? xml.css(selector) : xml
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
##
|
|
42
|
-
# Creates an instance of the requested item extractor.
|
|
43
|
-
#
|
|
44
|
-
# @param attribute_options [Hash<Symbol, Object>]
|
|
45
|
-
# Should contain at least `:extractor` (the name) and required options for that extractor.
|
|
46
|
-
# @param xml [Nokogiri::XML::Document]
|
|
47
|
-
# @return [Object] instance of the specified item extractor class
|
|
48
|
-
def self.item_extractor_factory(attribute_options, xml)
|
|
49
|
-
extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR
|
|
50
|
-
extractor_class = find_extractor_class(extractor_name)
|
|
51
|
-
options_instance = build_options_instance(extractor_class, attribute_options)
|
|
52
|
-
create_extractor_instance(extractor_class, xml, options_instance)
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
##
|
|
56
|
-
# Finds the extractor class based on the name.
|
|
57
|
-
#
|
|
58
|
-
# @param extractor_name [Symbol] the name of the extractor
|
|
59
|
-
# @return [Class] the class implementing the extractor
|
|
60
|
-
# @raise [UnknownExtractorName] if the extractor class is not found
|
|
61
|
-
def self.find_extractor_class(extractor_name)
|
|
62
|
-
NAME_TO_CLASS[extractor_name] || raise(UnknownExtractorName,
|
|
63
|
-
"Unknown extractor name '#{extractor_name}' requested in NAME_TO_CLASS")
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
##
|
|
67
|
-
# Builds the options instance for the extractor class.
|
|
68
|
-
#
|
|
69
|
-
# @param extractor_class [Class] the class implementing the extractor
|
|
70
|
-
# @param attribute_options [Hash<Symbol, Object>] the attribute options
|
|
71
|
-
# @return [Object] an instance of the options class for the extractor
|
|
72
|
-
def self.build_options_instance(extractor_class, attribute_options)
|
|
73
|
-
options = attribute_options.slice(*extractor_class::Options.members)
|
|
74
|
-
ITEM_OPTION_CLASSES[extractor_class].new(options)
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
##
|
|
78
|
-
# Creates an instance of the extractor class.
|
|
79
|
-
#
|
|
80
|
-
# @param extractor_class [Class] the class implementing the extractor
|
|
81
|
-
# @param xml [Nokogiri::XML::Document] the XML document
|
|
82
|
-
# @param options_instance [Object] the options instance
|
|
83
|
-
# @return [Object] an instance of the extractor class
|
|
84
|
-
def self.create_extractor_instance(extractor_class, xml, options_instance)
|
|
85
|
-
extractor_class.new(xml, options_instance)
|
|
86
|
-
end
|
|
87
|
-
end
|
|
88
|
-
end
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'cgi'
|
|
4
|
-
require 'json'
|
|
5
|
-
|
|
6
|
-
module Html2rss
|
|
7
|
-
##
|
|
8
|
-
# A naive implementation of "Object to XML": converts a Ruby object to XML format.
|
|
9
|
-
class ObjectToXmlConverter
|
|
10
|
-
OBJECT_TO_XML_TAGS = {
|
|
11
|
-
hash: ['<object>', '</object>'],
|
|
12
|
-
enumerable: ['<array>', '</array>']
|
|
13
|
-
}.freeze
|
|
14
|
-
|
|
15
|
-
##
|
|
16
|
-
# @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
|
|
17
|
-
def initialize(object)
|
|
18
|
-
@object = object
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
##
|
|
22
|
-
# Converts the object to XML format.
|
|
23
|
-
#
|
|
24
|
-
# @return [String] representing the object in XML
|
|
25
|
-
def call
|
|
26
|
-
object_to_xml(@object)
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
private
|
|
30
|
-
|
|
31
|
-
def object_to_xml(object)
|
|
32
|
-
case object
|
|
33
|
-
when Hash
|
|
34
|
-
hash_to_xml(object)
|
|
35
|
-
when Enumerable
|
|
36
|
-
enumerable_to_xml(object)
|
|
37
|
-
else
|
|
38
|
-
CGI.escapeHTML(object.to_s)
|
|
39
|
-
end
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
def hash_to_xml(object)
|
|
43
|
-
prefix, suffix = OBJECT_TO_XML_TAGS[:hash]
|
|
44
|
-
inner_xml = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }.join
|
|
45
|
-
|
|
46
|
-
"#{prefix}#{inner_xml}#{suffix}"
|
|
47
|
-
end
|
|
48
|
-
|
|
49
|
-
def enumerable_to_xml(object)
|
|
50
|
-
prefix, suffix = OBJECT_TO_XML_TAGS[:enumerable]
|
|
51
|
-
inner_xml = object.map { |value| object_to_xml(value) }.join
|
|
52
|
-
|
|
53
|
-
"#{prefix}#{inner_xml}#{suffix}"
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|