html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'mime/types'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class RssBuilder
|
|
7
|
+
##
|
|
8
|
+
# Represents an enclosure for an RSS item.
|
|
9
|
+
class Enclosure
|
|
10
|
+
##
# Guesses the content type based on the file extension of the URL.
#
# @param url [Html2rss::Url, nil] URL whose +#path+ is inspected
# @param default [String] content type returned when nothing can be guessed
# @return [String] guessed content type, or +default+
def self.guess_content_type_from_url(url, default: 'application/octet-stream')
  return default unless url

  # Keep only the part before a stray '?'. An empty path splits into an empty
  # array, so coerce the result back to a String before asking for its extension.
  path = url.path.to_s.split('?').first.to_s
  extension = File.extname(path).delete('.')
  return default if extension.empty?

  # Fall back to the caller-supplied default instead of a hard-coded value.
  MIME::Types.type_for(extension).first&.to_s || default
end
|
|
24
|
+
|
|
25
|
+
def self.add(enclosure, maker)
|
|
26
|
+
return unless enclosure
|
|
27
|
+
|
|
28
|
+
maker.enclosure.tap do |enclosure_maker|
|
|
29
|
+
enclosure_maker.url = enclosure.url.to_s
|
|
30
|
+
enclosure_maker.type = enclosure.type
|
|
31
|
+
enclosure_maker.length = enclosure.bits_length
|
|
32
|
+
end
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def initialize(url:, type: nil, bits_length: 0)
|
|
36
|
+
raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?
|
|
37
|
+
|
|
38
|
+
@url = url
|
|
39
|
+
@type = type
|
|
40
|
+
@bits_length = bits_length
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def type = @type || self.class.guess_content_type_from_url(url)
|
|
44
|
+
|
|
45
|
+
attr_reader :bits_length, :url
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Html2rss
|
|
4
|
-
|
|
4
|
+
class RssBuilder
|
|
5
5
|
##
|
|
6
6
|
# Represents a stylesheet.
|
|
7
7
|
class Stylesheet
|
|
@@ -10,7 +10,7 @@ module Html2rss
|
|
|
10
10
|
# Adds the stylesheet XML tags to the RSS.
|
|
11
11
|
#
|
|
12
12
|
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
|
13
|
-
# @param stylesheets [Array<Html2rss::
|
|
13
|
+
# @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
|
|
14
14
|
# @return [nil]
|
|
15
15
|
def add(maker, stylesheets)
|
|
16
16
|
stylesheets.each do |stylesheet|
|
|
@@ -24,7 +24,7 @@ module Html2rss
|
|
|
24
24
|
# Adds a single Stylesheet to the RSS.
|
|
25
25
|
#
|
|
26
26
|
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
|
27
|
-
# @param stylesheet [Html2rss::
|
|
27
|
+
# @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
|
|
28
28
|
# @return [nil]
|
|
29
29
|
def add_stylesheet(maker, stylesheet)
|
|
30
30
|
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
|
@@ -35,7 +35,7 @@ module Html2rss
|
|
|
35
35
|
end
|
|
36
36
|
end
|
|
37
37
|
|
|
38
|
-
TYPES = ['text/css', 'text/xsl'].freeze
|
|
38
|
+
TYPES = ['text/css', 'text/xsl'].to_set.freeze
|
|
39
39
|
|
|
40
40
|
def initialize(href:, type:, media: 'all')
|
|
41
41
|
raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
|
data/lib/html2rss/rss_builder.rb
CHANGED
|
@@ -4,93 +4,94 @@ require 'rss'
|
|
|
4
4
|
|
|
5
5
|
module Html2rss
|
|
6
6
|
##
|
|
7
|
-
# Builds
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
7
|
+
# Builds an RSS Feed by providing channel, articles and stylesheets.
|
|
8
|
+
class RssBuilder
|
|
9
|
+
class << self
|
|
10
|
+
def add_item(article, item_maker)
|
|
11
|
+
add_item_string_values(article, item_maker)
|
|
12
|
+
add_item_categories(article, item_maker)
|
|
13
|
+
Enclosure.add(article.enclosure, item_maker)
|
|
14
|
+
add_item_guid(article, item_maker)
|
|
15
|
+
end
|
|
14
16
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
17
|
+
private
|
|
18
|
+
|
|
19
|
+
def add_item_string_values(article, item_maker)
|
|
20
|
+
%i[title description author].each do |attr|
|
|
21
|
+
next unless (value = article.send(attr))
|
|
22
|
+
next if value.empty?
|
|
23
|
+
|
|
24
|
+
item_maker.send(:"#{attr}=", value)
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
item_maker.link = article.url.to_s if article.url
|
|
28
|
+
item_maker.pubDate = article.published_at&.rfc2822
|
|
25
29
|
end
|
|
26
|
-
end
|
|
27
30
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
# @param maker [RSS::Maker] RSS maker instance.
|
|
32
|
-
# @param stylesheets [Array<String>] Array of stylesheets to add.
|
|
33
|
-
def self.add_stylesheets(maker, stylesheets)
|
|
34
|
-
Stylesheet.add(maker, stylesheets)
|
|
35
|
-
end
|
|
31
|
+
def add_item_categories(article, item_maker)
|
|
32
|
+
article.categories.each { |category| item_maker.categories.new_category.content = category }
|
|
33
|
+
end
|
|
36
34
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def self.add_channel(maker, config)
|
|
43
|
-
channel = maker.channel
|
|
44
|
-
CHANNEL_TAGS.each do |tag|
|
|
45
|
-
Channel.add(channel, config, [tag])
|
|
35
|
+
def add_item_guid(article, item_maker)
|
|
36
|
+
item_maker.guid.tap do |guid|
|
|
37
|
+
guid.content = article.guid
|
|
38
|
+
guid.isPermaLink = false
|
|
39
|
+
end
|
|
46
40
|
end
|
|
47
41
|
end
|
|
48
42
|
|
|
49
43
|
##
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
# @param
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
44
|
+
# @param channel [Html2rss::RssBuilder::Channel] The channel information for the RSS feed.
|
|
45
|
+
# @param articles [Array<Html2rss::RssBuilder::Article>] The list of articles to include in the RSS feed.
|
|
46
|
+
# @param stylesheets [Array<Hash>] An optional array of stylesheet configurations.
|
|
47
|
+
def initialize(channel:, articles:, stylesheets: [])
|
|
48
|
+
@channel = channel
|
|
49
|
+
@articles = articles
|
|
50
|
+
@stylesheets = stylesheets
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
def call
|
|
54
|
+
RSS::Maker.make('2.0') do |maker|
|
|
55
|
+
Stylesheet.add(maker, stylesheets)
|
|
56
|
+
|
|
57
|
+
make_channel(maker.channel)
|
|
58
|
+
make_items(maker)
|
|
61
59
|
end
|
|
62
60
|
end
|
|
63
61
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
# @return [nil]
|
|
71
|
-
def self.add_item(maker, item, item_attributes)
|
|
72
|
-
new_item = maker.items.new_item
|
|
73
|
-
Item.add(new_item, item, item_attributes)
|
|
62
|
+
private
|
|
63
|
+
|
|
64
|
+
attr_reader :channel, :articles
|
|
65
|
+
|
|
66
|
+
def stylesheets
|
|
67
|
+
@stylesheets.map { |style| Stylesheet.new(**style) }
|
|
74
68
|
end
|
|
75
69
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
70
|
+
def make_channel(maker)
|
|
71
|
+
%i[language title description ttl].each do |key|
|
|
72
|
+
maker.public_send(:"#{key}=", channel.public_send(key))
|
|
73
|
+
end
|
|
74
|
+
|
|
75
|
+
maker.link = channel.url.to_s
|
|
76
|
+
maker.generator = generator
|
|
77
|
+
maker.updated = channel.last_build_date
|
|
83
78
|
end
|
|
84
79
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
# @return [Array<Html2rss::Item>] Array of items.
|
|
90
|
-
def self.fetch_items(config)
|
|
91
|
-
Html2rss::Item.from_url(config.url, config)
|
|
80
|
+
def make_items(maker)
|
|
81
|
+
articles.each do |article|
|
|
82
|
+
maker.items.new_item { |item_maker| self.class.add_item(article, item_maker) }
|
|
83
|
+
end
|
|
92
84
|
end
|
|
93
85
|
|
|
94
|
-
|
|
86
|
+
def generator
|
|
87
|
+
scraper_namespace_regex = /(?<namespace>Html2rss|Scraper)::/
|
|
88
|
+
|
|
89
|
+
scraper_counts = articles.flat_map(&:scraper).tally.map do |klass, count|
|
|
90
|
+
scraper_name = klass.to_s.gsub(scraper_namespace_regex, '')
|
|
91
|
+
"#{scraper_name} (#{count})"
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
"html2rss V. #{Html2rss::VERSION} (scrapers: #{scraper_counts.join(', ')})"
|
|
95
|
+
end
|
|
95
96
|
end
|
|
96
97
|
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry-validation'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Selectors
|
|
7
|
+
##
|
|
8
|
+
# Validates the configuration hash for :selectors.
|
|
9
|
+
class Config < Dry::Validation::Contract
|
|
10
|
+
NESTING_KEY = :dynamic_keys_workaround
|
|
11
|
+
|
|
12
|
+
##
|
|
13
|
+
# Validates the configuration of the :items selector
|
|
14
|
+
class Items < Dry::Validation::Contract
|
|
15
|
+
params do
|
|
16
|
+
required(:selector).filled(:string)
|
|
17
|
+
optional(:order).filled(included_in?: %w[reverse])
|
|
18
|
+
optional(:enhance).filled(:bool?)
|
|
19
|
+
optional(:pagination).hash do
|
|
20
|
+
required(:max_pages).filled(:integer, gt?: 0)
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
##
|
|
26
|
+
# Validates the configuration of a single selector.
|
|
27
|
+
class Selector < Dry::Validation::Contract
|
|
28
|
+
params do
|
|
29
|
+
optional(:selector)
|
|
30
|
+
optional(:extractor).filled(:string)
|
|
31
|
+
optional(:attribute).filled(:string)
|
|
32
|
+
optional(:static).filled(:string)
|
|
33
|
+
optional(:post_process).array(:hash)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
rule(:selector) do
|
|
37
|
+
key(:selector).failure('`selector` must be a string') if value && !value.is_a?(String)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
rule(:extractor) do
|
|
41
|
+
# dependent on the extractor, validate required fields, (i.e. static, attribute)
|
|
42
|
+
case value
|
|
43
|
+
when 'attribute'
|
|
44
|
+
key(:attribute).failure('`attribute` must be a string') unless values[:attribute].is_a?(String)
|
|
45
|
+
when 'static'
|
|
46
|
+
key(:static).failure('`static` must be a string') unless values[:static].is_a?(String)
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
rule(:post_process).each do
|
|
51
|
+
case (name = value[:name])
|
|
52
|
+
when 'gsub'
|
|
53
|
+
key(:pattern).failure('`pattern` must be a string') unless value[:pattern].is_a?(String)
|
|
54
|
+
key(:replacement).failure('`replacement` must be a string') unless value[:replacement].is_a?(String)
|
|
55
|
+
when 'substring'
|
|
56
|
+
key(:start).failure('`start` must be an integer') unless value[:start].is_a?(Integer)
|
|
57
|
+
key(:end).failure('`end` must be an integer or omitted') if !value[:end].nil? && !value[:end].is_a?(Integer)
|
|
58
|
+
when 'template'
|
|
59
|
+
key(:string).failure('`string` must be a string') unless value[:string].is_a?(String)
|
|
60
|
+
when 'html_to_markdown', 'markdown_to_html', 'parse_time', 'parse_uri', 'sanitize_html'
|
|
61
|
+
# nothing to validate
|
|
62
|
+
when nil
|
|
63
|
+
key(:post_process).failure('Missing post_processor `name`')
|
|
64
|
+
else
|
|
65
|
+
key(:post_process).failure("Unknown post_processor `name`: #{name}")
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
##
# Validates the configuration of the :enclosure Selector.
#
# Subclass of Selector that additionally accepts an optional +content_type+,
# which must look like a MIME type (+type/subtype+).
class Enclosure < Selector
  # \A and \z anchor the whole string. ^ and $ only anchor a single line and
  # would let multi-line values such as "text/html\nfoo" pass validation.
  # '.' and '+' are allowed so that types like image/svg+xml are accepted.
  CONTENT_TYPE_FORMAT = %r{\A[\w.+-]+/[\w.+-]+\z}

  params do
    optional(:content_type).filled(:string, format?: CONTENT_TYPE_FORMAT)
  end
end
|
|
77
|
+
|
|
78
|
+
params do
|
|
79
|
+
required(NESTING_KEY).hash
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
rule(NESTING_KEY) do
|
|
83
|
+
value.each_pair do |selector_key, selector|
|
|
84
|
+
case selector_key.to_sym
|
|
85
|
+
when Selectors::ITEMS_SELECTOR_KEY
|
|
86
|
+
Items.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
|
|
87
|
+
when :enclosure
|
|
88
|
+
Enclosure.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
|
|
89
|
+
when :guid, :categories
|
|
90
|
+
unless selector.is_a?(Array)
|
|
91
|
+
key(selector_key).failure("`#{selector_key}` must be an array")
|
|
92
|
+
next
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
key(selector_key).failure("`#{selector_key}` must contain at least one element") if selector.empty?
|
|
96
|
+
|
|
97
|
+
selector.each do |name|
|
|
98
|
+
next if values[NESTING_KEY].key?(name.to_sym)
|
|
99
|
+
|
|
100
|
+
key(selector_key).failure("`#{selector_key}` references unspecified `#{name}`")
|
|
101
|
+
end
|
|
102
|
+
else
|
|
103
|
+
# From here on, the selector is found under its "dynamic" selector_key
|
|
104
|
+
Selector.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
##
|
|
110
|
+
# Shortcut to validate the config.
|
|
111
|
+
# @param config [Hash] the configuration hash to validate
|
|
112
|
+
# @return [Dry::Validation::Result] the result of the validation
|
|
113
|
+
def self.call(config)
|
|
114
|
+
# dry-validation/schema does not support "Dynamic Keys" yet: https://github.com/dry-rb/dry-schema/issues/37
|
|
115
|
+
# But :selectors contains mostly "dynamic" keys, as the user defines them to extract article attributes.
|
|
116
|
+
# --> Validate the dynamic keys manually.
|
|
117
|
+
# To be able to specify a `rule`, nest the config under NESTING_KEY and mark that as `required`.
|
|
118
|
+
new.call(NESTING_KEY => config)
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the value of the attribute.
|
|
8
|
+
#
|
|
9
|
+
# Imagine this +time+ HTML tag with a +datetime+ attribute:
|
|
10
|
+
#
|
|
11
|
+
# <time datetime="2019-07-01">...</time>
|
|
12
|
+
#
|
|
13
|
+
# YAML usage example:
|
|
14
|
+
#
|
|
15
|
+
# selectors:
|
|
16
|
+
# link:
|
|
17
|
+
# selector: time
|
|
18
|
+
# extractor: attribute
|
|
19
|
+
# attribute: datetime
|
|
20
|
+
#
|
|
21
|
+
# Would return:
|
|
22
|
+
# '2019-07-01'
|
|
23
|
+
#
|
|
24
|
+
# In case you're extracting a date or a time, consider parsing it
|
|
25
|
+
# during post processing with {PostProcessors::ParseTime}.
|
|
26
|
+
class Attribute
|
|
27
|
+
# The available options for the attribute extractor.
|
|
28
|
+
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
29
|
+
|
|
30
|
+
##
|
|
31
|
+
# Initializes the Attribute extractor.
|
|
32
|
+
#
|
|
33
|
+
# @param xml [Nokogiri::XML::Element]
|
|
34
|
+
# @param options [Options]
|
|
35
|
+
def initialize(xml, options)
|
|
36
|
+
@options = options
|
|
37
|
+
@element = Extractors.element(xml, options.selector)
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
##
|
|
41
|
+
# Retrieves and returns the attribute's value as a string.
|
|
42
|
+
#
|
|
43
|
+
# @return [String] The value of the attribute.
|
|
44
|
+
def get
|
|
45
|
+
@element.attr(@options.attribute).to_s
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the value of the +href+ attribute.
|
|
8
|
+
# It always returns absolute URLs. If the extracted +href+ value is a
|
|
9
|
+
# relative URL, it prepends the channel's URL.
|
|
10
|
+
#
|
|
11
|
+
# Imagine this +a+ HTML element with a +href+ attribute:
|
|
12
|
+
#
|
|
13
|
+
# <a href="/posts/latest-findings">...</a>
|
|
14
|
+
#
|
|
15
|
+
# YAML usage example:
|
|
16
|
+
# channel:
|
|
17
|
+
# url: http://blog-without-a-feed.example.com
|
|
18
|
+
# ...
|
|
19
|
+
# selectors:
|
|
20
|
+
# link:
|
|
21
|
+
# selector: a
|
|
22
|
+
# extractor: href
|
|
23
|
+
#
|
|
24
|
+
# Would return:
|
|
25
|
+
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
|
26
|
+
class Href
|
|
27
|
+
# The available options for the href (attribute) extractor.
|
|
28
|
+
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
29
|
+
|
|
30
|
+
##
|
|
31
|
+
# Initializes the Href extractor.
|
|
32
|
+
#
|
|
33
|
+
# @param xml [Nokogiri::XML::Element]
|
|
34
|
+
# @param options [Options]
|
|
35
|
+
def initialize(xml, options)
|
|
36
|
+
@options = options
|
|
37
|
+
@element = Extractors.element(xml, options.selector)
|
|
38
|
+
@href = @element.attr('href').to_s
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
##
# Retrieves and returns the absolute URL for the extracted +href+.
#
# @return [Object, nil] whatever Url.from_relative builds from the href and the
#   channel URL, or nil when the element has no (or an empty) +href+ attribute.
#   NOTE(review): previously documented as String; confirm the actual type
#   returned by Url.from_relative.
def get
  # @href is built with +to_s+, so it is always a String; test for emptiness,
  # because a plain truthiness check can never detect a missing href.
  return nil if @href.to_s.empty?

  Url.from_relative(@href, @options.channel[:url])
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the HTML content of the specified element.
|
|
8
|
+
#
|
|
9
|
+
# Example HTML structure:
|
|
10
|
+
#
|
|
11
|
+
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
12
|
+
#
|
|
13
|
+
# YAML usage example:
|
|
14
|
+
#
|
|
15
|
+
# selectors:
|
|
16
|
+
# description:
|
|
17
|
+
# selector: p
|
|
18
|
+
# extractor: html
|
|
19
|
+
#
|
|
20
|
+
# Would return:
|
|
21
|
+
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
|
22
|
+
#
|
|
23
|
+
# Always ensure to sanitize the HTML during post-processing with
|
|
24
|
+
# {PostProcessors::SanitizeHtml}.
|
|
25
|
+
class Html
|
|
26
|
+
# The available options for the html extractor.
|
|
27
|
+
Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
28
|
+
|
|
29
|
+
##
|
|
30
|
+
# Initializes the Html extractor.
|
|
31
|
+
#
|
|
32
|
+
# @param xml [Nokogiri::XML::Element]
|
|
33
|
+
# @param options [Options]
|
|
34
|
+
def initialize(xml, options)
|
|
35
|
+
@element = Extractors.element(xml, options.selector)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
##
|
|
39
|
+
# Retrieves and returns the HTML content of the element.
|
|
40
|
+
#
|
|
41
|
+
# @return [String] The HTML content.
|
|
42
|
+
def get
|
|
43
|
+
@element.to_s
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns a static value provided in the options.
|
|
8
|
+
#
|
|
9
|
+
# Example usage in YAML:
|
|
10
|
+
#
|
|
11
|
+
# selectors:
|
|
12
|
+
# author:
|
|
13
|
+
# extractor: static
|
|
14
|
+
# static: Foobar
|
|
15
|
+
#
|
|
16
|
+
# Would return:
|
|
17
|
+
# 'Foobar'
|
|
18
|
+
class Static
|
|
19
|
+
# The available option for the static extractor.
|
|
20
|
+
Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
21
|
+
|
|
22
|
+
##
|
|
23
|
+
# Initializes the Static extractor.
|
|
24
|
+
#
|
|
25
|
+
# @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
|
|
26
|
+
# @param options [Options] Options containing the static value.
|
|
27
|
+
def initialize(_xml, options)
|
|
28
|
+
@options = options
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
##
|
|
32
|
+
# Retrieves and returns the static value.
|
|
33
|
+
#
|
|
34
|
+
# @return [String, Symbol] The static value provided in options.
|
|
35
|
+
def get
|
|
36
|
+
@options.static
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the text content of the element. This is the default extractor used,
|
|
8
|
+
# when no extractor is explicitly given.
|
|
9
|
+
#
|
|
10
|
+
# Example HTML structure:
|
|
11
|
+
#
|
|
12
|
+
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
13
|
+
#
|
|
14
|
+
# YAML usage example:
|
|
15
|
+
#
|
|
16
|
+
# selectors:
|
|
17
|
+
# description:
|
|
18
|
+
# selector: p
|
|
19
|
+
# extractor: text
|
|
20
|
+
#
|
|
21
|
+
# Would return:
|
|
22
|
+
# 'Lorem ipsum dolor ...'
|
|
23
|
+
class Text
|
|
24
|
+
# The available options for the text extractor.
|
|
25
|
+
Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
|
|
26
|
+
|
|
27
|
+
##
|
|
28
|
+
# Initializes the Text extractor.
|
|
29
|
+
#
|
|
30
|
+
# @param xml [Nokogiri::XML::Element]
|
|
31
|
+
# @param options [Options]
|
|
32
|
+
def initialize(xml, options)
|
|
33
|
+
@element = Extractors.element(xml, options.selector)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
##
|
|
37
|
+
# Retrieves and returns the text content of the element.
|
|
38
|
+
#
|
|
39
|
+
# @return [String] The text content.
|
|
40
|
+
def get
|
|
41
|
+
@element.text.to_s.strip.gsub(/\s+/, ' ')
|
|
42
|
+
end
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
##
|
|
6
|
+
# Provides a namespace for item extractors.
|
|
7
|
+
module Extractors
|
|
8
|
+
##
|
|
9
|
+
# Maps the extractor name to the class implementing the extractor.
|
|
10
|
+
#
|
|
11
|
+
# The key is the name to use in the feed config.
|
|
12
|
+
NAME_TO_CLASS = {
|
|
13
|
+
attribute: Attribute,
|
|
14
|
+
href: Href,
|
|
15
|
+
html: Html,
|
|
16
|
+
static: Static,
|
|
17
|
+
text: Text
|
|
18
|
+
}.freeze
|
|
19
|
+
|
|
20
|
+
##
|
|
21
|
+
# Maps the extractor class to its corresponding options class.
|
|
22
|
+
ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
|
|
23
|
+
hash[klass] = klass.const_get(:Options)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
DEFAULT_EXTRACTOR = :text
|
|
27
|
+
|
|
28
|
+
class << self
|
|
29
|
+
##
|
|
30
|
+
# Retrieves an element from Nokogiri XML based on the selector.
|
|
31
|
+
#
|
|
32
|
+
# @param xml [Nokogiri::XML::Document]
|
|
33
|
+
# @param selector [String, nil]
|
|
34
|
+
# @return [Nokogiri::XML::NodeSet, Nokogiri::XML::Node] the nodes matching +selector+, or +xml+ itself when +selector+ is nil
|
|
35
|
+
def element(xml, selector)
|
|
36
|
+
selector ? xml.css(selector) : xml
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Builds the extractor named in +attribute_options+ and returns the value it extracts.
#
# @param attribute_options [Hash<Symbol, Object>]
#   May contain +:extractor+ (the name; DEFAULT_EXTRACTOR is used when absent) plus the
#   options of that extractor. Keys unknown to the extractor's Options struct are dropped.
# @param xml [Nokogiri::XML::Document]
# @return [Object] the value returned by the extractor's +#get+
# @raise [ArgumentError] if +:extractor+ names no entry of NAME_TO_CLASS
def get(attribute_options, xml)
  extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR

  # Fail with a descriptive error instead of a NoMethodError on nil further down.
  extractor_class = NAME_TO_CLASS.fetch(extractor_name) do
    raise ArgumentError,
          "Unknown extractor `#{extractor_name}`. Available extractors: #{NAME_TO_CLASS.keys.join(', ')}"
  end

  options_class = ITEM_OPTION_CLASSES[extractor_class]
  options = options_class.new(attribute_options.slice(*options_class.members))

  extractor_class.new(xml, options).get
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|