html2rss 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.mergify.yml +15 -0
- data/.rubocop.yml +11 -145
- data/Gemfile +19 -2
- data/Gemfile.lock +111 -97
- data/README.md +323 -270
- data/bin/console +1 -0
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +15 -20
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +118 -42
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +51 -20
- data/rakefile.rb +16 -0
- metadata +51 -154
- data/.travis.yml +0 -25
- data/CHANGELOG.md +0 -221
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
#
|
6
|
+
# Returns the HTML content of the specified element.
|
5
7
|
#
|
6
|
-
#
|
8
|
+
# Example HTML structure:
|
7
9
|
#
|
8
10
|
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
9
11
|
#
|
@@ -17,15 +19,25 @@ module Html2rss
|
|
17
19
|
# Would return:
|
18
20
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
19
21
|
#
|
20
|
-
# Always
|
21
|
-
# {AttributePostProcessors::SanitizeHtml}
|
22
|
+
# Always ensure to sanitize the HTML during post-processing with
|
23
|
+
# {AttributePostProcessors::SanitizeHtml}.
|
22
24
|
class Html
|
25
|
+
# The available options for the html extractor.
|
26
|
+
Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
|
27
|
+
|
28
|
+
##
|
29
|
+
# Initializes the Html extractor.
|
30
|
+
#
|
31
|
+
# @param xml [Nokogiri::XML::Element]
|
32
|
+
# @param options [Options]
|
23
33
|
def initialize(xml, options)
|
24
|
-
@element = ItemExtractors.element(xml, options)
|
34
|
+
@element = ItemExtractors.element(xml, options.selector)
|
25
35
|
end
|
26
36
|
|
27
37
|
##
|
28
|
-
#
|
38
|
+
# Retrieves and returns the HTML content of the element.
|
39
|
+
#
|
40
|
+
# @return [String] The HTML content.
|
29
41
|
def get
|
30
42
|
@element.to_s
|
31
43
|
end
|
@@ -1,27 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
#
|
6
|
+
# Returns a static value provided in the options.
|
7
|
+
#
|
8
|
+
# Example usage in YAML:
|
5
9
|
#
|
6
10
|
# selectors:
|
7
|
-
#
|
11
|
+
# author:
|
8
12
|
# extractor: static
|
9
13
|
# static: Foobar
|
10
14
|
#
|
11
15
|
# Would return:
|
12
16
|
# 'Foobar'
|
13
17
|
class Static
|
18
|
+
# The available option for the static extractor.
|
19
|
+
Options = Struct.new('StaticOptions', :static, keyword_init: true)
|
20
|
+
|
21
|
+
##
|
22
|
+
# Initializes the Static extractor.
|
23
|
+
#
|
24
|
+
# @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
|
25
|
+
# @param options [Options] Options containing the static value.
|
14
26
|
def initialize(_xml, options)
|
15
27
|
@options = options
|
16
28
|
end
|
17
29
|
|
18
|
-
|
30
|
+
##
|
31
|
+
# Retrieves and returns the static value.
|
19
32
|
#
|
20
|
-
#
|
21
|
-
# Static.new(xml, options).get
|
22
|
-
# # => 'Foobar'
|
33
|
+
# @return [String, Symbol] The static value provided in options.
|
23
34
|
def get
|
24
|
-
@options
|
35
|
+
@options.static
|
25
36
|
end
|
26
37
|
end
|
27
38
|
end
|
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
# Return the text of the attribute. This is the default extractor used,
|
6
|
+
# Return the text content of the attribute. This is the default extractor used,
|
5
7
|
# when no extractor is explicitly given.
|
6
8
|
#
|
7
|
-
#
|
9
|
+
# Example HTML structure:
|
8
10
|
#
|
9
11
|
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
10
12
|
#
|
@@ -18,14 +20,24 @@ module Html2rss
|
|
18
20
|
# Would return:
|
19
21
|
# 'Lorem ipsum dolor ...'
|
20
22
|
class Text
|
23
|
+
# The available options for the text extractor.
|
24
|
+
Options = Struct.new('TextOptions', :selector, keyword_init: true)
|
25
|
+
|
26
|
+
##
|
27
|
+
# Initializes the Text extractor.
|
28
|
+
#
|
29
|
+
# @param xml [Nokogiri::XML::Element]
|
30
|
+
# @param options [Options]
|
21
31
|
def initialize(xml, options)
|
22
|
-
@element = ItemExtractors.element(xml, options)
|
32
|
+
@element = ItemExtractors.element(xml, options.selector)
|
23
33
|
end
|
24
34
|
|
25
35
|
##
|
26
|
-
#
|
36
|
+
# Retrieves and returns the text content of the element.
|
37
|
+
#
|
38
|
+
# @return [String] The text content.
|
27
39
|
def get
|
28
|
-
@element.text.to_s.strip.
|
40
|
+
@element.text.to_s.strip.gsub(/\s+/, ' ')
|
29
41
|
end
|
30
42
|
end
|
31
43
|
end
|
@@ -1,23 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# Provides a namespace for item extractors.
|
4
6
|
module ItemExtractors
|
5
|
-
|
6
|
-
|
7
|
+
##
|
8
|
+
# The error raised when an unknown extractor name is requested.
|
9
|
+
class UnknownExtractorName < StandardError; end
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
##
|
12
|
+
# Maps the extractor name to the class implementing the extractor.
|
13
|
+
#
|
14
|
+
# The key is the name to use in the feed config.
|
15
|
+
NAME_TO_CLASS = {
|
16
|
+
attribute: Attribute,
|
17
|
+
href: Href,
|
18
|
+
html: Html,
|
19
|
+
static: Static,
|
20
|
+
text: Text
|
21
|
+
}.freeze
|
12
22
|
|
13
|
-
|
23
|
+
##
|
24
|
+
# Maps the extractor class to its corresponding options class.
|
25
|
+
ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
|
26
|
+
hash[klass] = klass.const_get(:Options)
|
14
27
|
end
|
15
28
|
|
29
|
+
DEFAULT_EXTRACTOR = :text
|
30
|
+
|
16
31
|
##
|
17
|
-
#
|
18
|
-
|
19
|
-
|
32
|
+
# Retrieves an element from Nokogiri XML based on the selector.
|
33
|
+
#
|
34
|
+
# @param xml [Nokogiri::XML::Document]
|
35
|
+
# @param selector [String, nil]
|
36
|
+
# @return [Nokogiri::XML::NodeSet, Nokogiri::XML::Document] the nodes matching the selector, or xml itself when no selector is given
|
37
|
+
def self.element(xml, selector)
|
20
38
|
selector ? xml.css(selector) : xml
|
21
39
|
end
|
40
|
+
|
41
|
+
##
|
42
|
+
# Creates an instance of the requested item extractor.
|
43
|
+
#
|
44
|
+
# @param attribute_options [Hash<Symbol, Object>]
|
45
|
+
# Should contain at least `:extractor` (the name) and required options for that extractor.
|
46
|
+
# @param xml [Nokogiri::XML::Document]
|
47
|
+
# @return [Object] instance of the specified item extractor class
|
48
|
+
def self.item_extractor_factory(attribute_options, xml)
|
49
|
+
extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR
|
50
|
+
extractor_class = find_extractor_class(extractor_name)
|
51
|
+
options_instance = build_options_instance(extractor_class, attribute_options)
|
52
|
+
create_extractor_instance(extractor_class, xml, options_instance)
|
53
|
+
end
|
54
|
+
|
55
|
+
##
|
56
|
+
# Finds the extractor class based on the name.
|
57
|
+
#
|
58
|
+
# @param extractor_name [Symbol] the name of the extractor
|
59
|
+
# @return [Class] the class implementing the extractor
|
60
|
+
# @raise [UnknownExtractorName] if the extractor class is not found
|
61
|
+
def self.find_extractor_class(extractor_name)
|
62
|
+
NAME_TO_CLASS[extractor_name] || raise(UnknownExtractorName,
|
63
|
+
"Unknown extractor name '#{extractor_name}' requested in NAME_TO_CLASS")
|
64
|
+
end
|
65
|
+
|
66
|
+
##
|
67
|
+
# Builds the options instance for the extractor class.
|
68
|
+
#
|
69
|
+
# @param extractor_class [Class] the class implementing the extractor
|
70
|
+
# @param attribute_options [Hash<Symbol, Object>] the attribute options
|
71
|
+
# @return [Object] an instance of the options class for the extractor
|
72
|
+
def self.build_options_instance(extractor_class, attribute_options)
|
73
|
+
options = attribute_options.slice(*extractor_class::Options.members)
|
74
|
+
ITEM_OPTION_CLASSES[extractor_class].new(options)
|
75
|
+
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# Creates an instance of the extractor class.
|
79
|
+
#
|
80
|
+
# @param extractor_class [Class] the class implementing the extractor
|
81
|
+
# @param xml [Nokogiri::XML::Document] the XML document
|
82
|
+
# @param options_instance [Object] the options instance
|
83
|
+
# @return [Object] an instance of the extractor class
|
84
|
+
def self.create_extractor_instance(extractor_class, xml, options_instance)
|
85
|
+
extractor_class.new(xml, options_instance)
|
86
|
+
end
|
22
87
|
end
|
23
88
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
##
|
8
|
+
# A naive implementation of "Object to XML": converts a Ruby object to XML format.
|
9
|
+
class ObjectToXmlConverter
|
10
|
+
OBJECT_TO_XML_TAGS = {
|
11
|
+
hash: ['<object>', '</object>'],
|
12
|
+
enumerable: ['<array>', '</array>']
|
13
|
+
}.freeze
|
14
|
+
|
15
|
+
##
|
16
|
+
# @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
|
17
|
+
def initialize(object)
|
18
|
+
@object = object
|
19
|
+
end
|
20
|
+
|
21
|
+
##
|
22
|
+
# Converts the object to XML format.
|
23
|
+
#
|
24
|
+
# @return [String] representing the object in XML
|
25
|
+
def call
|
26
|
+
object_to_xml(@object)
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def object_to_xml(object)
|
32
|
+
case object
|
33
|
+
when Hash
|
34
|
+
hash_to_xml(object)
|
35
|
+
when Enumerable
|
36
|
+
enumerable_to_xml(object)
|
37
|
+
else
|
38
|
+
CGI.escapeHTML(object.to_s)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def hash_to_xml(object)
|
43
|
+
prefix, suffix = OBJECT_TO_XML_TAGS[:hash]
|
44
|
+
inner_xml = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }.join
|
45
|
+
|
46
|
+
"#{prefix}#{inner_xml}#{suffix}"
|
47
|
+
end
|
48
|
+
|
49
|
+
def enumerable_to_xml(object)
|
50
|
+
prefix, suffix = OBJECT_TO_XML_TAGS[:enumerable]
|
51
|
+
inner_xml = object.map { |value| object_to_xml(value) }.join
|
52
|
+
|
53
|
+
"#{prefix}#{inner_xml}#{suffix}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module RssBuilder
|
5
|
+
##
|
6
|
+
# Builds the <channel> tag (with the provided maker).
|
7
|
+
class Channel
|
8
|
+
##
|
9
|
+
# @param maker [RSS::Maker::RSS20::Channel]
|
10
|
+
# @param config [Html2rss::Config]
|
11
|
+
# @param tags [Set<Symbol>]
|
12
|
+
# @return nil
|
13
|
+
def self.add(maker, config, tags)
|
14
|
+
tags.each { |tag| maker.public_send(:"#{tag}=", config.public_send(tag)) }
|
15
|
+
|
16
|
+
maker.generator = "html2rss V. #{::Html2rss::VERSION}"
|
17
|
+
maker.lastBuildDate = Time.now
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mime/types'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
module RssBuilder
|
7
|
+
##
|
8
|
+
# Builds an <item> tag (with the provided maker).
|
9
|
+
class Item
|
10
|
+
# Tags which should be processed every time and require non-trivial assignments/treatments.
|
11
|
+
SPECIAL_TAGS = %i[categories enclosure guid].freeze
|
12
|
+
|
13
|
+
##
|
14
|
+
# Adds the item to the Item Maker
|
15
|
+
#
|
16
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
17
|
+
# @param item [Html2rss::Item]
|
18
|
+
# @param tags [Set<Symbol>]
|
19
|
+
# @return nil
|
20
|
+
def self.add(maker, item, tags)
|
21
|
+
tags.each do |tag|
|
22
|
+
next if SPECIAL_TAGS.include?(tag)
|
23
|
+
|
24
|
+
maker.public_send(:"#{tag}=", item.public_send(tag))
|
25
|
+
end
|
26
|
+
|
27
|
+
SPECIAL_TAGS.each do |tag|
|
28
|
+
send(:"add_#{tag}", item, maker)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
##
|
33
|
+
# Adds the <category> tags, if there should be any.
|
34
|
+
#
|
35
|
+
# @param item [Html2rss::Item]
|
36
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
37
|
+
# @return nil
|
38
|
+
def self.add_categories(item, maker)
|
39
|
+
item.categories.each { |category| maker.categories.new_category.content = category }
|
40
|
+
end
|
41
|
+
private_class_method :add_categories
|
42
|
+
|
43
|
+
##
|
44
|
+
# Adds an enclosure, if there should be one.
|
45
|
+
#
|
46
|
+
# @param item [Html2rss::Item]
|
47
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
48
|
+
# @return nil
|
49
|
+
def self.add_enclosure(item, maker)
|
50
|
+
return unless item.enclosure?
|
51
|
+
|
52
|
+
set_enclosure_attributes(item.enclosure, maker.enclosure)
|
53
|
+
end
|
54
|
+
private_class_method :add_enclosure
|
55
|
+
|
56
|
+
##
|
57
|
+
# Sets the attributes of an RSS enclosure.
|
58
|
+
#
|
59
|
+
# @param item_enclosure [Html2rss::Enclosure]
|
60
|
+
# @param rss_enclosure [RSS::Maker::RSS20::Items::Enclosure]
|
61
|
+
# @return nil
|
62
|
+
def self.set_enclosure_attributes(item_enclosure, rss_enclosure)
|
63
|
+
rss_enclosure.type = item_enclosure.type
|
64
|
+
rss_enclosure.length = item_enclosure.bits_length
|
65
|
+
rss_enclosure.url = item_enclosure.url
|
66
|
+
end
|
67
|
+
private_class_method :set_enclosure_attributes
|
68
|
+
|
69
|
+
##
|
70
|
+
# Adds a non-permalink GUID to the item.
|
71
|
+
#
|
72
|
+
# @param item [Html2rss::Item]
|
73
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
74
|
+
# @return nil
|
75
|
+
def self.add_guid(item, maker)
|
76
|
+
guid = maker.guid
|
77
|
+
guid.content = item.guid
|
78
|
+
guid.isPermaLink = false
|
79
|
+
end
|
80
|
+
private_class_method :add_guid
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module RssBuilder
|
5
|
+
##
|
6
|
+
# Adds XML stylesheet tags (with the provided maker).
|
7
|
+
class Stylesheet
|
8
|
+
##
|
9
|
+
# Adds the stylesheet XML tags to the RSS.
|
10
|
+
#
|
11
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
12
|
+
# @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
|
13
|
+
# @return [nil]
|
14
|
+
def self.add(maker, stylesheets)
|
15
|
+
stylesheets.each do |stylesheet|
|
16
|
+
add_stylesheet(maker, stylesheet)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
# Adds a single Stylesheet to the RSS.
|
22
|
+
#
|
23
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
24
|
+
# @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
|
25
|
+
# @return [nil]
|
26
|
+
def self.add_stylesheet(maker, stylesheet)
|
27
|
+
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
28
|
+
xss.href = stylesheet.href
|
29
|
+
xss.type = stylesheet.type
|
30
|
+
xss.media = stylesheet.media
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
private_class_method :add_stylesheet
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
##
|
7
|
+
# Builds the RSS 2.0 feed, which consists of the '<channel>' and the '<item>'s
|
8
|
+
# tags in the RSS.
|
9
|
+
module RssBuilder
|
10
|
+
# Possible tags inside a RSS 2.0 <channel> tag.
|
11
|
+
CHANNEL_TAGS = %i[language author title description link ttl].freeze
|
12
|
+
# Possible tags inside a RSS 2.0 <item> tag.
|
13
|
+
ITEM_TAGS = %i[title link description author comments updated].freeze
|
14
|
+
|
15
|
+
##
|
16
|
+
# Builds an RSS 2.0 feed based on the provided configuration.
|
17
|
+
#
|
18
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
19
|
+
# @return [RSS::Rss] RSS feed object.
|
20
|
+
def self.build(config)
|
21
|
+
RSS::Maker.make('2.0') do |maker|
|
22
|
+
add_stylesheets(maker, config.stylesheets)
|
23
|
+
add_channel(maker, config)
|
24
|
+
add_items(maker, config)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
##
|
29
|
+
# Adds stylesheets to the RSS maker.
|
30
|
+
#
|
31
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
32
|
+
# @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations to add.
|
33
|
+
def self.add_stylesheets(maker, stylesheets)
|
34
|
+
Stylesheet.add(maker, stylesheets)
|
35
|
+
end
|
36
|
+
|
37
|
+
##
|
38
|
+
# Adds channel information to the RSS maker.
|
39
|
+
#
|
40
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
41
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
42
|
+
def self.add_channel(maker, config)
|
43
|
+
channel = maker.channel
|
44
|
+
CHANNEL_TAGS.each do |tag|
|
45
|
+
Channel.add(channel, config, [tag])
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
##
|
50
|
+
# Adds items to the RSS maker based on configuration.
|
51
|
+
#
|
52
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
53
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
54
|
+
def self.add_items(maker, config)
|
55
|
+
item_attributes = extract_item_attributes(config)
|
56
|
+
items = fetch_items(config)
|
57
|
+
items.reverse! if config.items_order == :reverse
|
58
|
+
|
59
|
+
items.each do |item|
|
60
|
+
add_item(maker, item, item_attributes)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Adds a single item to the RSS maker.
|
66
|
+
#
|
67
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
68
|
+
# @param item [Html2rss::Item] Item to add.
|
69
|
+
# @param item_attributes [Array<Symbol>] Array of item attributes.
|
70
|
+
# @return [nil]
|
71
|
+
def self.add_item(maker, item, item_attributes)
|
72
|
+
new_item = maker.items.new_item
|
73
|
+
Item.add(new_item, item, item_attributes)
|
74
|
+
end
|
75
|
+
|
76
|
+
##
|
77
|
+
# Extracts item attributes from configuration.
|
78
|
+
#
|
79
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
80
|
+
# @return [Array<Symbol>] Array of item attributes.
|
81
|
+
def self.extract_item_attributes(config)
|
82
|
+
config.item_selector_names & ITEM_TAGS
|
83
|
+
end
|
84
|
+
|
85
|
+
##
|
86
|
+
# Fetches items from the URL specified in configuration.
|
87
|
+
#
|
88
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
89
|
+
# @return [Array<Html2rss::Item>] Array of items.
|
90
|
+
def self.fetch_items(config)
|
91
|
+
Html2rss::Item.from_url(config.url, config)
|
92
|
+
end
|
93
|
+
|
94
|
+
private_class_method :extract_item_attributes, :fetch_items, :add_item
|
95
|
+
end
|
96
|
+
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,40 +1,115 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'addressable/uri'
|
3
|
-
require '
|
4
|
+
require 'faraday'
|
5
|
+
require 'faraday/follow_redirects'
|
4
6
|
require 'json'
|
5
|
-
require '
|
7
|
+
require 'regexp_parser'
|
8
|
+
require 'tzinfo'
|
9
|
+
require 'mime/types'
|
10
|
+
require_relative 'object_to_xml_converter'
|
6
11
|
|
7
12
|
module Html2rss
|
8
13
|
##
|
9
14
|
# The collecting tank for utility methods.
|
10
15
|
module Utils
|
16
|
+
##
|
17
|
+
# @param url [String, Addressable::URI]
|
18
|
+
# @param base_url [String]
|
19
|
+
# @return [Addressable::URI]
|
11
20
|
def self.build_absolute_url_from_relative(url, base_url)
|
12
|
-
url = URI(url)
|
21
|
+
url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
|
13
22
|
|
14
23
|
return url if url.absolute?
|
15
24
|
|
16
|
-
URI(base_url)
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
25
|
+
base_uri = Addressable::URI.parse(base_url)
|
26
|
+
base_uri.path = '/' if base_uri.path.empty?
|
27
|
+
|
28
|
+
base_uri.join(url).normalize
|
21
29
|
end
|
22
30
|
|
23
|
-
|
24
|
-
|
31
|
+
##
|
32
|
+
# Collapses runs of whitespace, strips leading/trailing whitespace, then parses and normalizes the given url.
|
33
|
+
# @param url [String]
|
34
|
+
# @return [String, nil] sanitized and normalized URL, or nil if input is empty
|
35
|
+
def self.sanitize_url(url)
|
36
|
+
url = url.to_s.gsub(/\s+/, ' ').strip
|
37
|
+
return if url.empty?
|
38
|
+
|
39
|
+
Addressable::URI.parse(url).normalize.to_s
|
25
40
|
end
|
26
41
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
42
|
+
##
|
43
|
+
# Allows override of time zone locally inside supplied block; resets previous time zone when done.
|
44
|
+
#
|
45
|
+
# @param time_zone [String]
|
46
|
+
# @param default_time_zone [String]
|
47
|
+
# @return [Object] whatever the given block returns
|
48
|
+
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
49
|
+
raise ArgumentError, 'a block is required' unless block_given?
|
50
|
+
|
51
|
+
time_zone = TZInfo::Timezone.get(time_zone)
|
52
|
+
|
53
|
+
prev_tz = ENV.fetch('TZ', default_time_zone)
|
54
|
+
ENV['TZ'] = time_zone.name
|
55
|
+
yield
|
56
|
+
ensure
|
57
|
+
ENV['TZ'] = prev_tz if prev_tz
|
31
58
|
end
|
32
59
|
|
33
|
-
|
34
|
-
|
35
|
-
|
60
|
+
##
|
61
|
+
# Builds a titleized representation of the URL.
|
62
|
+
# @param url [String, Addressable::URI]
|
63
|
+
# @return [String]
|
64
|
+
def self.titleized_url(url)
|
65
|
+
uri = Addressable::URI.parse(url)
|
66
|
+
host = uri.host
|
67
|
+
|
68
|
+
nicer_path = uri.path.split('/').reject(&:empty?)
|
69
|
+
nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
|
70
|
+
end
|
71
|
+
|
72
|
+
##
|
73
|
+
# @param url [String, Addressable::URI]
|
74
|
+
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
|
+
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
+
# @return [String] body of the HTTP response
|
77
|
+
def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
|
78
|
+
response = Faraday.new(url:, headers:) do |faraday|
|
79
|
+
faraday.use Faraday::FollowRedirects::Middleware
|
80
|
+
faraday.adapter Faraday.default_adapter
|
81
|
+
end.get
|
82
|
+
|
83
|
+
body = response.body
|
84
|
+
|
85
|
+
convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
|
86
|
+
end
|
87
|
+
|
88
|
+
##
|
89
|
+
# Parses the given String and builds a Regexp out of it.
|
90
|
+
#
|
91
|
+
# It will remove one pair of surrounding slashes ('/') from the String
|
92
|
+
# to maintain backwards compatibility before building the Regexp.
|
93
|
+
#
|
94
|
+
# @param string [String]
|
95
|
+
# @return [Regexp]
|
96
|
+
def self.build_regexp_from_string(string)
|
97
|
+
raise ArgumentError, 'must be a string!' unless string.is_a?(String)
|
98
|
+
|
99
|
+
string = string[1..-2] if string.start_with?('/') && string.end_with?('/')
|
100
|
+
Regexp::Parser.parse(string, options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE).to_re
|
101
|
+
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# Guesses the content type based on the file extension of the URL.
|
105
|
+
#
|
106
|
+
# @param url [String, Addressable::URI]
|
107
|
+
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
108
|
+
def self.guess_content_type_from_url(url)
|
109
|
+
url = url.to_s.split('?').first
|
36
110
|
|
37
|
-
|
111
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
112
|
+
content_type.first&.to_s || 'application/octet-stream'
|
38
113
|
end
|
39
114
|
end
|
40
115
|
end
|