html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry-validation'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Selectors
|
|
7
|
+
##
|
|
8
|
+
# Validates the configuration hash for :selectors.
|
|
9
|
+
class Config < Dry::Validation::Contract
  # Required wrapper key used to validate dynamic selector names.
  NESTING_KEY = :dynamic_keys_workaround

  ##
  # Validates the configuration of the :items selector.
  class Items < Dry::Validation::Contract
    params do
      required(:selector).filled(:string)
      # `reverse` is the only accepted ordering value.
      optional(:order).filled(included_in?: %w[reverse])
      optional(:enhance).filled(:bool?)
      optional(:pagination).hash do
        required(:max_pages).filled(:integer, gt?: 0)
      end
    end
  end

  ##
  # Validates the configuration of a single selector.
  class Selector < Dry::Validation::Contract
    params do
      # Declared without a type; the String check happens in `rule(:selector)` below.
      optional(:selector)
      optional(:extractor).filled(:string)
      optional(:attribute).filled(:string)
      optional(:static).filled(:string)
      optional(:post_process).array(:hash)
    end

    rule(:selector) do
      key(:selector).failure('`selector` must be a string') if value && !value.is_a?(String)
    end

    rule(:extractor) do
      # dependent on the extractor, validate required fields, (i.e. static, attribute)
      case value
      when 'attribute'
        key(:attribute).failure('`attribute` must be a string') unless values[:attribute].is_a?(String)
      when 'static'
        key(:static).failure('`static` must be a string') unless values[:static].is_a?(String)
      end
    end

    # Checks the options each known post processor requires, keyed by its `name`.
    rule(:post_process).each do
      case (name = value[:name])
      when 'gsub'
        key(:pattern).failure('`pattern` must be a string') unless value[:pattern].is_a?(String)
        key(:replacement).failure('`replacement` must be a string') unless value[:replacement].is_a?(String)
      when 'substring'
        key(:start).failure('`start` must be an integer') unless value[:start].is_a?(Integer)
        key(:end).failure('`end` must be an integer or omitted') if !value[:end].nil? && !value[:end].is_a?(Integer)
      when 'template'
        key(:string).failure('`string` must be a string') unless value[:string].is_a?(String)
      when 'html_to_markdown', 'markdown_to_html', 'parse_time', 'parse_uri', 'sanitize_html'
        # nothing to validate
      when nil
        key(:post_process).failure('Missing post_processor `name`')
      else
        key(:post_process).failure("Unknown post_processor `name`: #{name}")
      end
    end
  end

  ##
  # Validates the configuration of the :enclosure Selector
  class Enclosure < Selector
    params do
      # NOTE(review): `^`/`$` anchor per line, so a multi-line value whose first line looks like
      # a content type passes. Consider `\A`/`\z` once it is confirmed that nothing else
      # (e.g. an exported schema) relies on this exact pattern.
      optional(:content_type).filled(:string, format?: %r{^[\w-]+/[\w-]+$})
    end
  end

  params do
    required(NESTING_KEY).hash
  end

  # Validates every entry of the nested selectors hash with the contract matching its key.
  rule(NESTING_KEY) do
    value.each_pair do |selector_key, selector|
      case selector_key.to_sym
      when Selectors::ITEMS_SELECTOR_KEY
        Items.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      when :enclosure
        Enclosure.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      when :guid, :categories
        unless selector.is_a?(Array)
          key(selector_key).failure("`#{selector_key}` must be an array")
          next
        end

        key(selector_key).failure("`#{selector_key}` must contain at least one element") if selector.empty?

        selector.each do |name|
          # Entries that cannot be symbolized (e.g. integers, nil) can never name a selector.
          # Report them as validation failures instead of raising NoMethodError on `to_sym`.
          next if name.respond_to?(:to_sym) && values[NESTING_KEY].key?(name.to_sym)

          key(selector_key).failure("`#{selector_key}` references unspecified `#{name}`")
        end
      else
        # From here on, the selector is found under its "dynamic" selector_key
        Selector.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
      end
    end
  end

  ##
  # Shortcut to validate the config.
  # @param config [Hash] the configuration hash to validate
  # @return [Dry::Validation::Result] the result of the validation
  def self.call(config)
    # dry-validation/schema does not support "Dynamic Keys" yet: https://github.com/dry-rb/dry-schema/issues/37
    # But :selectors contains mostly "dynamic" keys, as the user defines them to extract article attributes.
    # --> Validate the dynamic keys manually.
    # To be able to specify a `rule`, nest the config under NESTING_KEY and mark that as `required`.
    new.call(NESTING_KEY => config)
  end
end
|
|
122
|
+
end
|
|
123
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the value of the attribute.
|
|
8
|
+
#
|
|
9
|
+
# Imagine this +time+ HTML tag with a +datetime+ attribute:
|
|
10
|
+
#
|
|
11
|
+
# <time datetime="2019-07-01">...</time>
|
|
12
|
+
#
|
|
13
|
+
# YAML usage example:
|
|
14
|
+
#
|
|
15
|
+
# selectors:
|
|
16
|
+
# link:
|
|
17
|
+
# selector: time
|
|
18
|
+
# extractor: attribute
|
|
19
|
+
# attribute: datetime
|
|
20
|
+
#
|
|
21
|
+
# Would return:
|
|
22
|
+
# '2019-07-01'
|
|
23
|
+
#
|
|
24
|
+
# In case you're extracting a date or a time, consider parsing it
|
|
25
|
+
# during post processing with {PostProcessors::ParseTime}.
|
|
26
|
+
class Attribute
  # Options understood by the attribute extractor: the CSS selector and the attribute name.
  Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element to read the attribute from.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  # @option options [String] :attribute attribute name to extract from the selected element
  def initialize(xml, options)
    @element = Extractors.element(xml, options.selector)
    @options = options
  end

  ##
  # Reads the configured attribute from the element.
  #
  # @return [String] the attribute's value, converted with `to_s`
  def get
    attribute_name = @options.attribute
    @element.attr(attribute_name).to_s
  end
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the value of the +href+ attribute.
|
|
8
|
+
# It always returns absolute URLs. If the extracted +href+ value is a
|
|
9
|
+
# relative URL, it prepends the channel's URL.
|
|
10
|
+
#
|
|
11
|
+
# Imagine this +a+ HTML element with a +href+ attribute:
|
|
12
|
+
#
|
|
13
|
+
# <a href="/posts/latest-findings">...</a>
|
|
14
|
+
#
|
|
15
|
+
# YAML usage example:
|
|
16
|
+
# channel:
|
|
17
|
+
# url: http://blog-without-a-feed.example.com
|
|
18
|
+
# ...
|
|
19
|
+
# selectors:
|
|
20
|
+
# link:
|
|
21
|
+
# selector: a
|
|
22
|
+
# extractor: href
|
|
23
|
+
#
|
|
24
|
+
# Would return:
|
|
25
|
+
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
|
26
|
+
class Href
  # The available options for the href (attribute) extractor.
  Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Initializes the Href extractor.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the link element
  # @option options [Hash{Symbol => Object}] :channel channel configuration, including :url
  def initialize(xml, options)
    @options = options
    @element = Extractors.element(xml, options.selector)
    # `to_s` turns a missing attribute (nil) into an empty string, so @href is always a String.
    @href = @element.attr('href').to_s
  end

  ##
  # Resolves the extracted +href+ against the channel's URL.
  #
  # @return [Object, nil] the result of `Url.from_relative`; nil when no +href+ value was found
  def get
    # @href is never nil (see #initialize), so a missing href has to be detected via emptiness.
    return nil if @href.empty?

    Url.from_relative(@href, @options.channel[:url])
  end
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns the HTML content of the specified element.
|
|
8
|
+
#
|
|
9
|
+
# Example HTML structure:
|
|
10
|
+
#
|
|
11
|
+
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
12
|
+
#
|
|
13
|
+
# YAML usage example:
|
|
14
|
+
#
|
|
15
|
+
# selectors:
|
|
16
|
+
# description:
|
|
17
|
+
# selector: p
|
|
18
|
+
# extractor: html
|
|
19
|
+
#
|
|
20
|
+
# Would return:
|
|
21
|
+
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
|
22
|
+
#
|
|
23
|
+
# Always ensure to sanitize the HTML during post-processing with
|
|
24
|
+
# {PostProcessors::SanitizeHtml}.
|
|
25
|
+
class Html
  # Options understood by the html extractor: only the CSS selector.
  Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element whose markup is returned by #get.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  def initialize(xml, options)
    selector = options.selector
    @element = Extractors.element(xml, selector)
  end

  ##
  # Serializes the selected element.
  #
  # @return [String] the element converted with `to_s`
  def get
    @element.to_s
  end
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Returns a static value provided in the options.
|
|
8
|
+
#
|
|
9
|
+
# Example usage in YAML:
|
|
10
|
+
#
|
|
11
|
+
# selectors:
|
|
12
|
+
# byline:
|
|
13
|
+
# extractor: static
|
|
14
|
+
# static: Foobar
|
|
15
|
+
#
|
|
16
|
+
# Would return:
|
|
17
|
+
# 'Foobar'
|
|
18
|
+
class Static
  # Options understood by the static extractor: only the static value itself.
  Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Stores the options; no element lookup is performed.
  #
  # @param _xml [nil, Nokogiri::XML::Element] ignored, accepted so the signature matches the other extractors
  # @param options [Options] options carrying the static value
  # @option options [String, Symbol] :static static value returned by this extractor
  def initialize(_xml, options)
    @options = options
  end

  ##
  # Hands back the configured value unchanged.
  #
  # @return [String, Symbol] the `static` member of the options
  def get
    configured = @options
    configured.static
  end
end
|
|
40
|
+
end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module Extractors
|
|
6
|
+
##
|
|
7
|
+
# Return the text content of the attribute. This is the default extractor used,
|
|
8
|
+
# when no extractor is explicitly given.
|
|
9
|
+
#
|
|
10
|
+
# Example HTML structure:
|
|
11
|
+
#
|
|
12
|
+
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
|
13
|
+
#
|
|
14
|
+
# YAML usage example:
|
|
15
|
+
#
|
|
16
|
+
# selectors:
|
|
17
|
+
# description:
|
|
18
|
+
# selector: p
|
|
19
|
+
# extractor: text
|
|
20
|
+
#
|
|
21
|
+
# Would return:
|
|
22
|
+
# 'Lorem ipsum dolor ...'
|
|
23
|
+
class Text
  # Options understood by the text extractor: only the CSS selector.
  Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit

  ##
  # Looks up the element whose text is returned by #get.
  #
  # @param xml [Nokogiri::XML::Element]
  # @param options [Options]
  # @option options [String] :selector CSS selector used to find the element
  def initialize(xml, options)
    selector = options.selector
    @element = Extractors.element(xml, selector)
  end

  ##
  # Returns the element's text with surrounding whitespace stripped and
  # every run of whitespace collapsed into a single space.
  #
  # @return [String] the normalized text content
  def get
    raw_text = @element.text.to_s
    raw_text.strip.gsub(/\s+/, ' ')
  end
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
##
|
|
6
|
+
# Provides a namespace for item extractors.
|
|
7
|
+
module Extractors
|
|
8
|
+
##
|
|
9
|
+
# Maps the extractor name to the class implementing the extractor.
|
|
10
|
+
#
|
|
11
|
+
# The key is the name to use in the feed config.
|
|
12
|
+
NAME_TO_CLASS = {
|
|
13
|
+
attribute: Attribute,
|
|
14
|
+
href: Href,
|
|
15
|
+
html: Html,
|
|
16
|
+
static: Static,
|
|
17
|
+
text: Text
|
|
18
|
+
}.freeze
|
|
19
|
+
|
|
20
|
+
##
|
|
21
|
+
# Maps the extractor class to its corresponding options class.
|
|
22
|
+
ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
|
|
23
|
+
hash[klass] = klass.const_get(:Options)
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# Extractor used when none is explicitly configured.
|
|
27
|
+
DEFAULT_EXTRACTOR = :text
|
|
28
|
+
|
|
29
|
+
class << self
|
|
30
|
+
##
# Narrows +xml+ down with a CSS selector when one is given.
#
# @param xml [#css] Nokogiri document or node to search in
# @param selector [String, nil] CSS selector; when falsy, +xml+ is returned untouched
# @return [Object] the result of `xml.css(selector)`, or +xml+ itself without a selector
def element(xml, selector)
  return xml unless selector

  xml.css(selector)
end
|
|
39
|
+
|
|
40
|
+
# Builds the extractor configured in +attribute_options+ and returns the value it extracts.
#
# @param attribute_options [Hash{Symbol => Object}]
#   May contain `:extractor` (the name, defaults to DEFAULT_EXTRACTOR) and the options of that extractor.
# @param xml [Nokogiri::XML::Document]
# @return [Object] the value returned by the extractor's `#get`
# @raise [ArgumentError] if `:extractor` names an extractor missing from NAME_TO_CLASS
def get(attribute_options, xml)
  extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR

  # Fail with a descriptive error instead of a NoMethodError on nil further down.
  extractor_class = NAME_TO_CLASS.fetch(extractor_name) do
    raise ArgumentError,
          "Unknown extractor `#{extractor_name}`. Available extractors: #{NAME_TO_CLASS.keys.join(', ')}"
  end

  # Only the members known to the extractor's Options struct are passed on.
  options_class = ITEM_OPTION_CLASSES[extractor_class]
  options = options_class.new(attribute_options.slice(*options_class.members))

  extractor_class.new(xml, options).get
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Selectors
|
|
7
|
+
##
|
|
8
|
+
# A naive implementation of "Object to XML": converts a Ruby object to XML format.
|
|
9
|
+
class ObjectToXmlConverter
  # Opening and closing wrapper tags, keyed by the kind of collection being converted.
  OBJECT_TO_XML_TAGS = {
    hash: ['<object>', '</object>'],
    array: ['<array>', '</array>']
  }.freeze

  ##
  # @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
  def initialize(object)
    @object = object
  end

  ##
  # Converts the wrapped object and logs the size of the result at debug level.
  #
  # @return [String] the XML representation of the object
  def call
    xml = object_to_xml(@object)
    Html2rss::Log.debug("#{self.class}: converted object to XML (#{xml.bytesize} bytes)")
    xml
  end

  private

  # Dispatches on the object's type; anything that is not a Hash or Array
  # is stringified and HTML-escaped.
  def object_to_xml(object)
    return hash_to_xml(object) if object.is_a?(Hash)
    return array_to_xml(object) if object.is_a?(Array)

    CGI.escapeHTML(object.to_s)
  end

  # Renders each pair as an element named after the key, wrapped in the hash tags.
  def hash_to_xml(object)
    opening, closing = OBJECT_TO_XML_TAGS[:hash]
    body = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }.join

    "#{opening}#{body}#{closing}"
  end

  # Concatenates the converted elements, wrapped in the array tags.
  def array_to_xml(object)
    opening, closing = OBJECT_TO_XML_TAGS[:array]
    body = object.map { |value| object_to_xml(value) }.join

    "#{opening}#{body}#{closing}"
  end
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Selectors
|
|
5
|
+
module PostProcessors
|
|
6
|
+
##
|
|
7
|
+
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
|
8
|
+
class Base
  # Ensures every required option key is present in the context's options.
  #
  # @param keys [Array<Symbol>] option keys that must be present
  # @param context [Selectors::Context] the context containing options
  # @return [void]
  # @raise [MissingOption] for the first key that is absent
  def self.expect_options(keys, context)
    keys.each do |key|
      options = context[:options]
      next if options.key?(key)

      raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [], cause: nil
    end
  end

  # Ensures the value is an instance of at least one of the expected types.
  #
  # @param value [Object] the value to check
  # @param types [Array<Class>, Class] the expected type(s)
  # @param name [String] the name of the option being checked
  # @param context [Selectors::Context] call-site context used for richer validation errors
  # @return [void]
  # @raise [InvalidType] if the value matches none of the expected types
  def self.assert_type(value, types = [], name, context:)
    expected = Array(types)
    return if expected.any? { |type| value.is_a?(type) }

    # `caller` stays directly in this method body so that frame 1 is the code invoking assert_type.
    options = context.respond_to?(:options) ? context.options : { file: File.basename(caller(1, 1).first.split(':').first) }

    expectation = "The type of `#{name}` must be #{expected.join(' or ')}, "
    actual = "but is: #{value.class} in: #{options.inspect}"
    raise InvalidType, expectation + actual, [], cause: nil
  end

  ##
  # Hook for validating the arguments given to a post processor. Subclasses must override it.
  #
  # @param _value [Object] extracted selector value
  # @param _context [Selectors::Context] post-processor execution context
  # @return [void]
  def self.validate_args!(_value, _context)
    raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
  end

  # Type-checks the context, runs the subclass validation and stores both arguments.
  #
  # @param value [Object] the value to be processed
  # @param context [Selectors::Context] runtime selector context and options
  def initialize(value, context)
    processor = self.class
    processor.assert_type(context, Selectors::Context, 'context', context:)
    processor.validate_args!(value, context)

    @context = context
    @value = value
  end

  attr_reader :value, :context

  # Produces the transformed value. Subclasses must override it.
  #
  # @return [Object] transformed value
  # @raise [NotImplementedError] if not implemented in subclass
  def get
    raise NotImplementedError, 'You must implement the `get` method in the post processor'
  end
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'regexp_parser'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Selectors
|
|
7
|
+
module PostProcessors
|
|
8
|
+
##
|
|
9
|
+
# Imagine this HTML:
|
|
10
|
+
#   <h1>Foo bar and boo</h1>
|
|
11
|
+
#
|
|
12
|
+
# YAML usage example:
|
|
13
|
+
# selectors:
|
|
14
|
+
# title:
|
|
15
|
+
# selector: h1
|
|
16
|
+
# post_process:
|
|
17
|
+
# name: gsub
|
|
18
|
+
# pattern: boo
|
|
19
|
+
# replacement: baz
|
|
20
|
+
#
|
|
21
|
+
# Would return:
|
|
22
|
+
# 'Foo bar and baz'
|
|
23
|
+
#
|
|
24
|
+
# `pattern` can be a Regexp or a String. If it is a String, it will remove
|
|
25
|
+
# one pair of surrounding slashes ('/') to keep backwards compatibility
|
|
26
|
+
# and then parse it to build a Regexp.
|
|
27
|
+
#
|
|
28
|
+
# `replacement` can be a String or a Hash.
|
|
29
|
+
#
|
|
30
|
+
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
|
31
|
+
class Gsub < Base
  # Requires a String value plus the `replacement` and `pattern` options;
  # `replacement` must be a String or a Hash.
  #
  # @param value [String] extracted selector value
  # @param context [Selectors::Context] post-processor context
  # @return [void]
  def self.validate_args!(value, context)
    assert_type(value, String, :value, context:)
    expect_options(%i[replacement pattern], context)

    replacement = context.dig(:options, :replacement)
    assert_type(replacement, [String, Hash], :replacement, context:)
  end

  ##
  # @param value [String]
  # @param context [Selectors::Context]
  def initialize(value, context)
    super

    opts = context[:options]

    @pattern = opts[:pattern]
    @replacement = opts[:replacement]
  end

  ##
  # Applies `String#gsub` with the configured pattern and replacement.
  #
  # @return [String]
  def get
    subject = value.to_s
    subject.gsub(pattern, replacement)
  end

  private

  attr_accessor :replacement

  ##
  # The configured pattern; a String pattern is parsed into a Regexp first.
  #
  # @return [Regexp]
  def pattern
    return @pattern unless @pattern.is_a?(String)

    parse_regexp_string(@pattern)
  end

  ##
  # Builds a Regexp (extended, case-insensitive) from the given String.
  #
  # One pair of surrounding slashes ('/') is dropped beforehand to stay
  # backwards compatible with patterns written as "/.../".
  #
  # @param string [String]
  # @return [Regexp]
  def parse_regexp_string(string)
    raise ArgumentError, 'must be a string!' unless string.is_a?(String)

    # Strings shorter than 3 characters (such as "/" or "//") are kept as they are.
    slash_wrapped = string.length >= 3 && string.start_with?('/') && string.end_with?('/')
    source = slash_wrapped ? string[1..-2] : string
    flags = ::Regexp::EXTENDED | ::Regexp::IGNORECASE

    Regexp::Parser.parse(source, options: flags).to_re
  end
end
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
end
|