html2rss 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +323 -270
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +18 -23
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +122 -46
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +57 -20
- metadata +53 -165
- data/.gitignore +0 -12
- data/.rspec +0 -4
- data/.rubocop.yml +0 -164
- data/.travis.yml +0 -25
- data/.yardopts +0 -6
- data/CHANGELOG.md +0 -221
- data/Gemfile +0 -8
- data/Gemfile.lock +0 -139
- data/bin/console +0 -15
- data/bin/setup +0 -8
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class Config
|
7
|
+
##
|
8
|
+
# Holds the configuration for the feed's channel options.
|
9
|
+
# This contains:
|
10
|
+
#
|
11
|
+
# 1. the RSS channel attributes
|
12
|
+
# 2. html2rss options like json or custom HTTP-headers for the request
|
13
|
+
class Channel
|
14
|
+
##
|
15
|
+
# @param config [Hash<Symbol, Object>]
|
16
|
+
# @return [Set<String>] the required parameter names
|
17
|
+
def self.required_params_for_config(config)
|
18
|
+
config.each_with_object(Set.new) do |(_, value), required_params|
|
19
|
+
required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
##
|
24
|
+
# @param channel [Hash<Symbol, Object>]
|
25
|
+
# @param params [Hash]
|
26
|
+
def initialize(channel, params: {})
|
27
|
+
raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
|
28
|
+
raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
|
29
|
+
|
30
|
+
@config = process_params(channel, params.transform_keys(&:to_sym))
|
31
|
+
end
|
32
|
+
|
33
|
+
##
|
34
|
+
# The HTTP headers to use for the request.
|
35
|
+
#
|
36
|
+
# @return [Hash<Symbol, String>]
|
37
|
+
def headers
|
38
|
+
config.fetch(:headers, {})
|
39
|
+
end
|
40
|
+
|
41
|
+
##
|
42
|
+
# @return [String]
|
43
|
+
def author
|
44
|
+
config.fetch(:author, 'html2rss')
|
45
|
+
end
|
46
|
+
|
47
|
+
##
|
48
|
+
# @return [Integer]
|
49
|
+
def ttl
|
50
|
+
config.fetch(:ttl, 360)
|
51
|
+
end
|
52
|
+
|
53
|
+
##
|
54
|
+
# @return [String]
|
55
|
+
def title
|
56
|
+
config.fetch(:title) { Utils.titleized_url(url) }
|
57
|
+
end
|
58
|
+
|
59
|
+
##
|
60
|
+
# @return [String] language code
|
61
|
+
def language
|
62
|
+
config.fetch(:language, 'en')
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# @return [String]
|
67
|
+
def description
|
68
|
+
config.fetch(:description) { "Latest items from #{url}." }
|
69
|
+
end
|
70
|
+
|
71
|
+
##
|
72
|
+
# @return [Addressable::URI]
|
73
|
+
def url
|
74
|
+
Addressable::URI.parse(config[:url]).normalize
|
75
|
+
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# @return [String] time_zone name
|
79
|
+
def time_zone
|
80
|
+
config.fetch(:time_zone, 'UTC')
|
81
|
+
end
|
82
|
+
|
83
|
+
##
|
84
|
+
# @return [true, false]
|
85
|
+
def json?
|
86
|
+
config.fetch(:json, false)
|
87
|
+
end
|
88
|
+
|
89
|
+
private
|
90
|
+
|
91
|
+
# @return [Hash<Symbol, Object>]
|
92
|
+
attr_reader :config
|
93
|
+
|
94
|
+
##
|
95
|
+
# @param config [Hash<Symbol, Object>]
|
96
|
+
# @param params [Hash<Symbol, String>]
|
97
|
+
# @return [nil]
|
98
|
+
def assert_required_params_presence(config, params)
|
99
|
+
missing_params = self.class.required_params_for_config(config) - params.keys.map(&:to_s)
|
100
|
+
raise ParamsMissing, missing_params.to_a.join(', ') unless missing_params.empty?
|
101
|
+
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# Sets the variables used in the feed config's channel.
|
105
|
+
#
|
106
|
+
# @param config [Hash<Symbol, Object>]
|
107
|
+
# @param params [Hash<Symbol, Object>]
|
108
|
+
# @return [Hash<Symbol, Object>]
|
109
|
+
def process_params(config, params)
|
110
|
+
assert_required_params_presence(config, params)
|
111
|
+
config.transform_values do |value|
|
112
|
+
value.is_a?(String) ? format(value, params) : value
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class Config
|
5
|
+
##
|
6
|
+
# Holds the configurations of the selectors.
|
7
|
+
class Selectors
|
8
|
+
ITEMS_SELECTOR_NAME = :items
|
9
|
+
|
10
|
+
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
|
+
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
|
12
|
+
|
13
|
+
##
|
14
|
+
# @param config [Hash<Symbol, Object>]
|
15
|
+
def initialize(config)
|
16
|
+
validate_config(config)
|
17
|
+
@config = config
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
# @param name [Symbol]
|
22
|
+
# @return [true, false]
|
23
|
+
def selector?(name)
|
24
|
+
name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
|
25
|
+
end
|
26
|
+
|
27
|
+
##
|
28
|
+
# @param name [Symbol]
|
29
|
+
# @return [Selector]
|
30
|
+
def selector(name)
|
31
|
+
raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
|
32
|
+
|
33
|
+
Selector.new(config[name])
|
34
|
+
end
|
35
|
+
|
36
|
+
##
|
37
|
+
# @return [Set<Symbol>]
|
38
|
+
def category_selector_names
|
39
|
+
selector_keys_for(:categories)
|
40
|
+
end
|
41
|
+
|
42
|
+
##
|
43
|
+
# @return [Set<Symbol>]
|
44
|
+
def guid_selector_names
|
45
|
+
selector_keys_for(:guid, default: :title_or_description)
|
46
|
+
end
|
47
|
+
|
48
|
+
##
|
49
|
+
# Returns the CSS/XPath selector.
|
50
|
+
#
|
51
|
+
# @param name [Symbol]
|
52
|
+
# @return [String]
|
53
|
+
def selector_string(name)
|
54
|
+
Selector.new(config[name]).selector
|
55
|
+
end
|
56
|
+
|
57
|
+
##
|
58
|
+
# @return [Set<Symbol>]
|
59
|
+
def item_selector_names
|
60
|
+
@item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
|
61
|
+
end
|
62
|
+
|
63
|
+
##
|
64
|
+
# @return [Symbol, nil]
|
65
|
+
def items_order
|
66
|
+
config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
|
67
|
+
end
|
68
|
+
|
69
|
+
private
|
70
|
+
|
71
|
+
attr_reader :config
|
72
|
+
|
73
|
+
def validate_config(config)
|
74
|
+
raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
|
75
|
+
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# Returns the selector keys for the selector named `name`. If none, returns [default].
|
79
|
+
#
|
80
|
+
# @param name [Symbol]
|
81
|
+
# @param default [String, Symbol]
|
82
|
+
# @return [Set<Symbol>]
|
83
|
+
def selector_keys_for(name, default: nil)
|
84
|
+
config.fetch(name) { Array(default) }.tap do |array|
|
85
|
+
array.reject! { |entry| entry.to_s == '' }
|
86
|
+
array.map!(&:to_sym)
|
87
|
+
end.to_set
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,95 +1,84 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
2
4
|
|
3
5
|
module Html2rss
|
4
6
|
##
|
5
7
|
# The Config class abstracts from the config data structure and
|
6
8
|
# provides default values.
|
7
9
|
class Config
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
channel_config
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
10
|
+
extend Forwardable
|
11
|
+
|
12
|
+
##
|
13
|
+
# The Error class to be raised when a feed config requires params, but none
|
14
|
+
# were passed to Config.
|
15
|
+
class ParamsMissing < Html2rss::Error; end
|
16
|
+
|
17
|
+
##
|
18
|
+
# Raised when the feed config does not contain a value at `:channel`.
|
19
|
+
class ChannelMissing < Html2rss::Error; end
|
20
|
+
|
21
|
+
# Struct to store XML Stylesheet attributes
|
22
|
+
Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
|
23
|
+
|
24
|
+
def_delegator :@channel, :author
|
25
|
+
def_delegator :@channel, :ttl
|
26
|
+
def_delegator :@channel, :title
|
27
|
+
def_delegator :@channel, :language
|
28
|
+
def_delegator :@channel, :description
|
29
|
+
def_delegator :@channel, :url
|
30
|
+
def_delegator :@channel, :url, :link
|
31
|
+
def_delegator :@channel, :time_zone
|
32
|
+
def_delegator :@channel, :json?
|
33
|
+
|
34
|
+
def_delegator :@selectors, :item_selector_names
|
35
|
+
def_delegator :@selectors, :selector?
|
36
|
+
def_delegator :@selectors, :category_selector_names
|
37
|
+
def_delegator :@selectors, :guid_selector_names
|
38
|
+
def_delegator :@selectors, :items_order
|
39
|
+
def_delegator :@selectors, :selector_string
|
40
|
+
|
41
|
+
##
|
42
|
+
# Initializes the Config object with feed configuration, global settings, and parameters.
|
43
|
+
#
|
44
|
+
# @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
|
45
|
+
# @param global [Hash<Symbol, Object>] Global settings hash.
|
46
|
+
# @param params [Hash<Symbol, String>] Parameters hash.
|
47
|
+
def initialize(feed_config, global = {}, params = {})
|
48
|
+
channel_config = feed_config[:channel]
|
49
|
+
raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
|
50
|
+
|
51
|
+
@channel = Channel.new(channel_config, params:)
|
52
|
+
@selectors = Selectors.new(feed_config[:selectors])
|
53
|
+
@global = global
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Retrieves selector attributes merged with channel attributes.
|
58
|
+
#
|
59
|
+
# @param name [Symbol] Selector name.
|
60
|
+
# @return [Hash<Symbol, Object>] Merged attributes hash.
|
61
|
+
def selector_attributes_with_channel(name)
|
62
|
+
@selectors.selector(name).to_h.merge(channel: @channel)
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Retrieves headers merged from global settings and channel headers.
|
67
|
+
#
|
68
|
+
# @return [Hash] Merged headers hash.
|
56
69
|
def headers
|
57
|
-
|
58
|
-
end
|
59
|
-
|
60
|
-
def attribute_options(name)
|
61
|
-
feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
|
62
|
-
end
|
63
|
-
|
64
|
-
def attribute?(name)
|
65
|
-
attribute_names.include?(name)
|
70
|
+
@global.fetch(:headers, {}).merge(@channel.headers)
|
66
71
|
end
|
67
72
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
categories.uniq!
|
75
|
-
|
76
|
-
categories
|
73
|
+
##
|
74
|
+
# Retrieves stylesheets from global settings.
|
75
|
+
#
|
76
|
+
# @return [Array<Stylesheet>] Array of Stylesheet structs.
|
77
|
+
def stylesheets
|
78
|
+
@global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
|
77
79
|
end
|
78
80
|
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
def attribute_names
|
84
|
-
@attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
|
85
|
-
end
|
86
|
-
|
87
|
-
def items_order
|
88
|
-
feed_config.dig(:selectors, :items, :order)&.to_sym
|
89
|
-
end
|
90
|
-
|
91
|
-
private
|
92
|
-
|
93
|
-
attr_reader :feed_config, :channel_config, :global_config
|
81
|
+
# Provides read-only access to the channel object.
|
82
|
+
attr_reader :channel
|
94
83
|
end
|
95
84
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -1,12 +1,39 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
3
|
require 'nokogiri'
|
4
4
|
|
5
5
|
module Html2rss
|
6
6
|
##
|
7
|
-
# Takes the selected Nokogiri::HTML and responds to
|
7
|
+
# Takes the selected Nokogiri::HTML and responds to accessor names
|
8
8
|
# defined in the feed config.
|
9
|
+
#
|
10
|
+
# Instances can only be created via `.from_url` and
|
11
|
+
# each represents an internally used "RSS item".
|
12
|
+
# Such an item provides dynamically defined attributes as methods.
|
9
13
|
class Item
|
14
|
+
# A context instance is passed to Item Extractors.
|
15
|
+
Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
|
16
|
+
# Class to keep an Item's <enclosure>.
|
17
|
+
Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
|
18
|
+
|
19
|
+
##
|
20
|
+
# Fetches items from a given URL using configuration settings.
|
21
|
+
#
|
22
|
+
# @param url [String] URL to fetch items from.
|
23
|
+
# @param config [Html2rss::Config] Configuration object.
|
24
|
+
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
|
+
def self.from_url(url, config)
|
26
|
+
body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
|
27
|
+
|
28
|
+
Nokogiri.HTML(body)
|
29
|
+
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
30
|
+
.map { |xml| new(xml, config) }
|
31
|
+
.select(&:valid?)
|
32
|
+
end
|
33
|
+
|
34
|
+
##
|
35
|
+
# @param xml [Nokogiri::XML::Element]
|
36
|
+
# @param config [Html2rss::Config]
|
10
37
|
def initialize(xml, config)
|
11
38
|
@xml = xml
|
12
39
|
@config = config
|
@@ -14,86 +41,135 @@ module Html2rss
|
|
14
41
|
|
15
42
|
private_class_method :new
|
16
43
|
|
44
|
+
##
|
45
|
+
# Checks if the object responds to a method dynamically based on the configuration.
|
46
|
+
#
|
47
|
+
# @param method_name [Symbol]
|
48
|
+
# @param _include_private [true, false]
|
49
|
+
# @return [true, false]
|
17
50
|
def respond_to_missing?(method_name, _include_private = false)
|
18
|
-
config.
|
51
|
+
config.selector?(method_name) || super
|
19
52
|
end
|
20
53
|
|
54
|
+
##
|
55
|
+
# Dynamically extracts data based on the method name.
|
56
|
+
#
|
57
|
+
# @param method_name [Symbol]
|
58
|
+
# @param _args [Array]
|
59
|
+
# @return [String] extracted value for the selector.
|
21
60
|
def method_missing(method_name, *_args)
|
22
61
|
return super unless respond_to_missing?(method_name)
|
23
62
|
|
24
|
-
|
25
|
-
|
26
|
-
extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
|
27
|
-
value = extractor.new(xml, attribute_options).get
|
28
|
-
|
29
|
-
post_process(value, attribute_options.fetch(:post_process, false))
|
63
|
+
extract(method_name)
|
30
64
|
end
|
31
65
|
|
32
|
-
|
33
|
-
|
34
|
-
|
66
|
+
##
|
67
|
+
# Selects and processes data according to the selector name.
|
68
|
+
#
|
69
|
+
# @param tag [Symbol]
|
70
|
+
# @return [String] the extracted value for the selector.
|
71
|
+
def extract(tag)
|
72
|
+
attribute_options = config.selector_attributes_with_channel(tag.to_sym)
|
73
|
+
|
74
|
+
post_process(
|
75
|
+
ItemExtractors.item_extractor_factory(attribute_options, xml).get,
|
76
|
+
attribute_options.fetch(:post_process, false)
|
77
|
+
)
|
35
78
|
end
|
36
79
|
|
37
80
|
##
|
38
|
-
#
|
81
|
+
# Checks if the item is valid according to the RSS 2.0 spec,
|
82
|
+
# by ensuring it has at least a title or a description.
|
83
|
+
#
|
84
|
+
# @return [true, false]
|
39
85
|
def valid?
|
40
|
-
|
41
|
-
description = self.description if config.attribute?(:description)
|
42
|
-
[title, description].join != ''
|
86
|
+
title_or_description.to_s != ''
|
43
87
|
end
|
44
88
|
|
45
89
|
##
|
46
|
-
#
|
47
|
-
|
48
|
-
|
49
|
-
|
90
|
+
# Returns either the title or the description, preferring title if available.
|
91
|
+
#
|
92
|
+
# @return [String, nil]
|
93
|
+
def title_or_description
|
94
|
+
return title if config.selector?(:title)
|
50
95
|
|
51
|
-
|
52
|
-
config.attribute?(:enclosure)
|
96
|
+
description if config.selector?(:description)
|
53
97
|
end
|
54
98
|
|
55
|
-
|
56
|
-
|
99
|
+
##
|
100
|
+
#
|
101
|
+
# @return [String] SHA1 hashed GUID.
|
102
|
+
def guid
|
103
|
+
content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
|
57
104
|
|
58
|
-
|
105
|
+
Digest::SHA1.hexdigest(content)
|
59
106
|
end
|
60
107
|
|
61
108
|
##
|
62
|
-
#
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
.map { |xml_item| new xml_item, config }
|
68
|
-
.keep_if(&:valid?)
|
109
|
+
# Retrieves categories for the item based on configured category selectors.
|
110
|
+
#
|
111
|
+
# @return [Array<String>] list of categories.
|
112
|
+
def categories
|
113
|
+
config.category_selector_names.map { |method_name| public_send(method_name) }
|
69
114
|
end
|
70
115
|
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
body = request.get.body
|
116
|
+
##
|
117
|
+
# Checks if the item has an enclosure based on configuration.
|
118
|
+
#
|
119
|
+
# @return [true, false]
|
120
|
+
def enclosure?
|
121
|
+
config.selector?(:enclosure)
|
122
|
+
end
|
80
123
|
|
81
|
-
|
124
|
+
##
|
125
|
+
# Retrieves enclosure details for the item.
|
126
|
+
#
|
127
|
+
# @return [Enclosure] enclosure details.
|
128
|
+
def enclosure
|
129
|
+
url = enclosure_url
|
130
|
+
|
131
|
+
raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
|
132
|
+
|
133
|
+
Enclosure.new(
|
134
|
+
type: Html2rss::Utils.guess_content_type_from_url(url),
|
135
|
+
bits_length: 0,
|
136
|
+
url: url.to_s
|
137
|
+
)
|
82
138
|
end
|
83
|
-
private_class_method :get_body_from_url
|
84
139
|
|
85
|
-
|
140
|
+
private
|
141
|
+
|
142
|
+
# @return [Nokogiri::XML::Element] XML element representing the item.
|
143
|
+
attr_reader :xml
|
144
|
+
# @return [Html2rss::Config] Configuration object for the item.
|
145
|
+
attr_reader :config
|
86
146
|
|
147
|
+
##
|
148
|
+
# Processes the extracted value according to post-processing options.
|
149
|
+
#
|
150
|
+
# @param value [String] extracted value.
|
151
|
+
# @param post_process_options [Hash<Symbol, Object>] post-processing options.
|
152
|
+
# @return [String] processed value.
|
87
153
|
def post_process(value, post_process_options)
|
88
154
|
return value unless post_process_options
|
89
155
|
|
90
156
|
[post_process_options].flatten.each do |options|
|
91
157
|
value = AttributePostProcessors.get_processor(options[:name])
|
92
|
-
.new(value, options
|
158
|
+
.new(value, Context.new(options:, item: self, config:))
|
93
159
|
.get
|
94
160
|
end
|
95
161
|
|
96
162
|
value
|
97
163
|
end
|
164
|
+
|
165
|
+
##
|
166
|
+
# Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
|
167
|
+
#
|
168
|
+
# @return [Addressable::URI, nil] absolute URL of the enclosure.
|
169
|
+
def enclosure_url
|
170
|
+
enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
|
171
|
+
|
172
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
|
173
|
+
end
|
98
174
|
end
|
99
175
|
end
|
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
6
|
# Returns the value of the attribute.
|
5
7
|
#
|
6
|
-
# Imagine this +time+ HTML
|
8
|
+
# Imagine this +time+ HTML tag with a +datetime+ attribute:
|
7
9
|
#
|
8
10
|
# <time datetime="2019-07-01">...</time>
|
9
11
|
#
|
@@ -18,19 +20,30 @@ module Html2rss
|
|
18
20
|
# Would return:
|
19
21
|
# '2019-07-01'
|
20
22
|
#
|
21
|
-
# In case you're extracting a date or a time,
|
22
|
-
# during post processing with
|
23
|
-
# {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
|
23
|
+
# In case you're extracting a date or a time, consider parsing it
|
24
|
+
# during post processing with {AttributePostProcessors::ParseTime}.
|
24
25
|
class Attribute
|
26
|
+
# The available options for the attribute extractor.
|
27
|
+
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
|
28
|
+
|
29
|
+
##
|
30
|
+
# Initializes the Attribute extractor.
|
31
|
+
#
|
32
|
+
# @param xml [Nokogiri::XML::Element]
|
33
|
+
# @param options [Options]
|
25
34
|
def initialize(xml, options)
|
26
35
|
@options = options
|
27
|
-
@element = ItemExtractors.element(xml, options)
|
36
|
+
@element = ItemExtractors.element(xml, options.selector)
|
28
37
|
end
|
29
38
|
|
30
39
|
##
|
31
|
-
#
|
40
|
+
# Retrieves and returns the attribute's value as a string.
|
41
|
+
#
|
42
|
+
# @return [String] The value of the attribute.
|
32
43
|
def get
|
33
|
-
@element.attr(@options
|
44
|
+
@element.attr(@options.attribute).to_s.freeze
|
45
|
+
rescue NoMethodError => error
|
46
|
+
raise "Failed to extract attribute: #{error.message}"
|
34
47
|
end
|
35
48
|
end
|
36
49
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
@@ -21,15 +23,29 @@ module Html2rss
|
|
21
23
|
# Would return:
|
22
24
|
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
23
25
|
class Href
|
26
|
+
# The available options for the href (attribute) extractor.
|
27
|
+
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
|
28
|
+
|
29
|
+
##
|
30
|
+
# Initializes the Href extractor.
|
31
|
+
#
|
32
|
+
# @param xml [Nokogiri::XML::Element]
|
33
|
+
# @param options [Options]
|
24
34
|
def initialize(xml, options)
|
25
35
|
@options = options
|
26
|
-
element = ItemExtractors.element(xml, options)
|
27
|
-
@href =
|
36
|
+
@element = ItemExtractors.element(xml, options.selector)
|
37
|
+
@href = @element.attr('href').to_s
|
28
38
|
end
|
29
39
|
|
30
|
-
|
40
|
+
##
|
41
|
+
# Retrieves and returns the normalized absolute URL.
|
42
|
+
#
|
43
|
+
# @return [String] The absolute URL.
|
31
44
|
def get
|
32
|
-
|
45
|
+
return nil unless @href
|
46
|
+
|
47
|
+
sanitized_href = Html2rss::Utils.sanitize_url(@href)
|
48
|
+
Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
|
33
49
|
end
|
34
50
|
end
|
35
51
|
end
|