html2rss 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +323 -270
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +18 -23
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +122 -46
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +57 -20
- metadata +53 -165
- data/.gitignore +0 -12
- data/.rspec +0 -4
- data/.rubocop.yml +0 -164
- data/.travis.yml +0 -25
- data/.yardopts +0 -6
- data/CHANGELOG.md +0 -221
- data/Gemfile +0 -8
- data/Gemfile.lock +0 -139
- data/bin/console +0 -15
- data/bin/setup +0 -8
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
#
|
6
|
+
# Returns the HTML content of the specified element.
|
5
7
|
#
|
6
|
-
#
|
8
|
+
# Example HTML structure:
|
7
9
|
#
|
8
10
|
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
9
11
|
#
|
@@ -17,15 +19,25 @@ module Html2rss
|
|
17
19
|
# Would return:
|
18
20
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
19
21
|
#
|
20
|
-
# Always
|
21
|
-
# {AttributePostProcessors::SanitizeHtml}
|
22
|
+
# Always ensure to sanitize the HTML during post-processing with
|
23
|
+
# {AttributePostProcessors::SanitizeHtml}.
|
22
24
|
class Html
|
25
|
+
# The available options for the html extractor.
|
26
|
+
Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
|
27
|
+
|
28
|
+
##
|
29
|
+
# Initializes the Html extractor.
|
30
|
+
#
|
31
|
+
# @param xml [Nokogiri::XML::Element]
|
32
|
+
# @param options [Options]
|
23
33
|
def initialize(xml, options)
|
24
|
-
@element = ItemExtractors.element(xml, options)
|
34
|
+
@element = ItemExtractors.element(xml, options.selector)
|
25
35
|
end
|
26
36
|
|
27
37
|
##
|
28
|
-
#
|
38
|
+
# Retrieves and returns the HTML content of the element.
|
39
|
+
#
|
40
|
+
# @return [String] The HTML content.
|
29
41
|
def get
|
30
42
|
@element.to_s
|
31
43
|
end
|
@@ -1,27 +1,38 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
#
|
6
|
+
# Returns a static value provided in the options.
|
7
|
+
#
|
8
|
+
# Example usage in YAML:
|
5
9
|
#
|
6
10
|
# selectors:
|
7
|
-
#
|
11
|
+
# author:
|
8
12
|
# extractor: static
|
9
13
|
# static: Foobar
|
10
14
|
#
|
11
15
|
# Would return:
|
12
16
|
# 'Foobar'
|
13
17
|
class Static
|
18
|
+
# The available option for the static extractor.
|
19
|
+
Options = Struct.new('StaticOptions', :static, keyword_init: true)
|
20
|
+
|
21
|
+
##
|
22
|
+
# Initializes the Static extractor.
|
23
|
+
#
|
24
|
+
# @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
|
25
|
+
# @param options [Options] Options containing the static value.
|
14
26
|
def initialize(_xml, options)
|
15
27
|
@options = options
|
16
28
|
end
|
17
29
|
|
18
|
-
|
30
|
+
##
|
31
|
+
# Retrieves and returns the static value.
|
19
32
|
#
|
20
|
-
#
|
21
|
-
# Static.new(xml, options).get
|
22
|
-
# # => 'Foobar'
|
33
|
+
# @return [String, Symbol] The static value provided in options.
|
23
34
|
def get
|
24
|
-
@options
|
35
|
+
@options.static
|
25
36
|
end
|
26
37
|
end
|
27
38
|
end
|
@@ -1,10 +1,12 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
|
-
# Return the text of the attribute. This is the default extractor used,
|
6
|
+
# Return the text content of the attribute. This is the default extractor used,
|
5
7
|
# when no extractor is explicitly given.
|
6
8
|
#
|
7
|
-
#
|
9
|
+
# Example HTML structure:
|
8
10
|
#
|
9
11
|
# <p>Lorem <b>ipsum</b> dolor ...</p>
|
10
12
|
#
|
@@ -18,14 +20,24 @@ module Html2rss
|
|
18
20
|
# Would return:
|
19
21
|
# 'Lorem ipsum dolor ...'
|
20
22
|
class Text
|
23
|
+
# The available options for the text extractor.
|
24
|
+
Options = Struct.new('TextOptions', :selector, keyword_init: true)
|
25
|
+
|
26
|
+
##
|
27
|
+
# Initializes the Text extractor.
|
28
|
+
#
|
29
|
+
# @param xml [Nokogiri::XML::Element]
|
30
|
+
# @param options [Options]
|
21
31
|
def initialize(xml, options)
|
22
|
-
@element = ItemExtractors.element(xml, options)
|
32
|
+
@element = ItemExtractors.element(xml, options.selector)
|
23
33
|
end
|
24
34
|
|
25
35
|
##
|
26
|
-
#
|
36
|
+
# Retrieves and returns the text content of the element.
|
37
|
+
#
|
38
|
+
# @return [String] The text content.
|
27
39
|
def get
|
28
|
-
@element.text.to_s.strip.
|
40
|
+
@element.text.to_s.strip.gsub(/\s+/, ' ')
|
29
41
|
end
|
30
42
|
end
|
31
43
|
end
|
@@ -1,23 +1,88 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# Provides a namespace for item extractors.
|
4
6
|
module ItemExtractors
|
5
|
-
|
6
|
-
|
7
|
+
##
|
8
|
+
# The error class raised when an unknown extractor name is requested.
|
9
|
+
class UnknownExtractorName < Html2rss::Error; end
|
7
10
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
11
|
+
##
|
12
|
+
# Maps the extractor name to the class implementing the extractor.
|
13
|
+
#
|
14
|
+
# The key is the name to use in the feed config.
|
15
|
+
NAME_TO_CLASS = {
|
16
|
+
attribute: Attribute,
|
17
|
+
href: Href,
|
18
|
+
html: Html,
|
19
|
+
static: Static,
|
20
|
+
text: Text
|
21
|
+
}.freeze
|
12
22
|
|
13
|
-
|
23
|
+
##
|
24
|
+
# Maps the extractor class to its corresponding options class.
|
25
|
+
ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
|
26
|
+
hash[klass] = klass.const_get(:Options)
|
14
27
|
end
|
15
28
|
|
29
|
+
DEFAULT_EXTRACTOR = :text
|
30
|
+
|
16
31
|
##
|
17
|
-
#
|
18
|
-
|
19
|
-
|
32
|
+
# Retrieves an element from Nokogiri XML based on the selector.
|
33
|
+
#
|
34
|
+
# @param xml [Nokogiri::XML::Document]
|
35
|
+
# @param selector [String, nil]
|
36
|
+
# @return [Nokogiri::XML::NodeSet, Nokogiri::XML::Document] the nodes matching the selector, or xml itself when no selector is given
|
37
|
+
def self.element(xml, selector)
|
20
38
|
selector ? xml.css(selector) : xml
|
21
39
|
end
|
40
|
+
|
41
|
+
##
|
42
|
+
# Creates an instance of the requested item extractor.
|
43
|
+
#
|
44
|
+
# @param attribute_options [Hash<Symbol, Object>]
|
45
|
+
# Should contain at least `:extractor` (the name) and required options for that extractor.
|
46
|
+
# @param xml [Nokogiri::XML::Document]
|
47
|
+
# @return [Object] instance of the specified item extractor class
|
48
|
+
def self.item_extractor_factory(attribute_options, xml)
|
49
|
+
extractor_name = attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR
|
50
|
+
extractor_class = find_extractor_class(extractor_name)
|
51
|
+
options_instance = build_options_instance(extractor_class, attribute_options)
|
52
|
+
create_extractor_instance(extractor_class, xml, options_instance)
|
53
|
+
end
|
54
|
+
|
55
|
+
##
|
56
|
+
# Finds the extractor class based on the name.
|
57
|
+
#
|
58
|
+
# @param extractor_name [Symbol] the name of the extractor
|
59
|
+
# @return [Class] the class implementing the extractor
|
60
|
+
# @raise [UnknownExtractorName] if the extractor class is not found
|
61
|
+
def self.find_extractor_class(extractor_name)
|
62
|
+
NAME_TO_CLASS[extractor_name] || raise(UnknownExtractorName,
|
63
|
+
"Unknown extractor name '#{extractor_name}' requested in NAME_TO_CLASS")
|
64
|
+
end
|
65
|
+
|
66
|
+
##
|
67
|
+
# Builds the options instance for the extractor class.
|
68
|
+
#
|
69
|
+
# @param extractor_class [Class] the class implementing the extractor
|
70
|
+
# @param attribute_options [Hash<Symbol, Object>] the attribute options
|
71
|
+
# @return [Object] an instance of the options class for the extractor
|
72
|
+
def self.build_options_instance(extractor_class, attribute_options)
|
73
|
+
options = attribute_options.slice(*extractor_class::Options.members)
|
74
|
+
ITEM_OPTION_CLASSES[extractor_class].new(options)
|
75
|
+
end
|
76
|
+
|
77
|
+
##
|
78
|
+
# Creates an instance of the extractor class.
|
79
|
+
#
|
80
|
+
# @param extractor_class [Class] the class implementing the extractor
|
81
|
+
# @param xml [Nokogiri::XML::Document] the XML document
|
82
|
+
# @param options_instance [Object] the options instance
|
83
|
+
# @return [Object] an instance of the extractor class
|
84
|
+
def self.create_extractor_instance(extractor_class, xml, options_instance)
|
85
|
+
extractor_class.new(xml, options_instance)
|
86
|
+
end
|
22
87
|
end
|
23
88
|
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
##
|
8
|
+
# A naive implementation of "Object to XML": converts a Ruby object to XML format.
|
9
|
+
class ObjectToXmlConverter
|
10
|
+
OBJECT_TO_XML_TAGS = {
|
11
|
+
hash: ['<object>', '</object>'],
|
12
|
+
enumerable: ['<array>', '</array>']
|
13
|
+
}.freeze
|
14
|
+
|
15
|
+
##
|
16
|
+
# @param object [Object] any Ruby object (Hash, Array, String, Symbol, etc.)
|
17
|
+
def initialize(object)
|
18
|
+
@object = object
|
19
|
+
end
|
20
|
+
|
21
|
+
##
|
22
|
+
# Converts the object to XML format.
|
23
|
+
#
|
24
|
+
# @return [String] representing the object in XML
|
25
|
+
def call
|
26
|
+
object_to_xml(@object)
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def object_to_xml(object)
|
32
|
+
case object
|
33
|
+
when Hash
|
34
|
+
hash_to_xml(object)
|
35
|
+
when Enumerable
|
36
|
+
enumerable_to_xml(object)
|
37
|
+
else
|
38
|
+
CGI.escapeHTML(object.to_s)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def hash_to_xml(object)
|
43
|
+
prefix, suffix = OBJECT_TO_XML_TAGS[:hash]
|
44
|
+
inner_xml = object.map { |key, value| "<#{key}>#{object_to_xml(value)}</#{key}>" }.join
|
45
|
+
|
46
|
+
"#{prefix}#{inner_xml}#{suffix}"
|
47
|
+
end
|
48
|
+
|
49
|
+
def enumerable_to_xml(object)
|
50
|
+
prefix, suffix = OBJECT_TO_XML_TAGS[:enumerable]
|
51
|
+
inner_xml = object.map { |value| object_to_xml(value) }.join
|
52
|
+
|
53
|
+
"#{prefix}#{inner_xml}#{suffix}"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module RssBuilder
|
5
|
+
##
|
6
|
+
# Builds the <channel> tag (with the provided maker).
|
7
|
+
class Channel
|
8
|
+
##
|
9
|
+
# @param maker [RSS::Maker::RSS20::Channel]
|
10
|
+
# @param config [Html2rss::Config]
|
11
|
+
# @param tags [Set<Symbol>]
|
12
|
+
# @return [void]
|
13
|
+
def self.add(maker, config, tags)
|
14
|
+
tags.each { |tag| maker.public_send(:"#{tag}=", config.public_send(tag)) }
|
15
|
+
|
16
|
+
maker.generator = "html2rss V. #{::Html2rss::VERSION}"
|
17
|
+
maker.lastBuildDate = Time.now
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mime/types'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
module RssBuilder
|
7
|
+
##
|
8
|
+
# Builds an <item> tag (with the provided maker).
|
9
|
+
class Item
|
10
|
+
# Tags which should be processed every time and require non-trivial assignments/treatments.
|
11
|
+
SPECIAL_TAGS = %i[categories enclosure guid].freeze
|
12
|
+
|
13
|
+
##
|
14
|
+
# Adds the item to the Item Maker
|
15
|
+
#
|
16
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
17
|
+
# @param item [Html2rss::Item]
|
18
|
+
# @param tags [Set<Symbol>]
|
19
|
+
# @return [void]
|
20
|
+
def self.add(maker, item, tags)
|
21
|
+
tags.each do |tag|
|
22
|
+
next if SPECIAL_TAGS.include?(tag)
|
23
|
+
|
24
|
+
maker.public_send(:"#{tag}=", item.public_send(tag))
|
25
|
+
end
|
26
|
+
|
27
|
+
SPECIAL_TAGS.each do |tag|
|
28
|
+
send(:"add_#{tag}", item, maker)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
##
|
33
|
+
# Adds the <category> tags, if there should be any.
|
34
|
+
#
|
35
|
+
# @param item [Html2rss::Item]
|
36
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
37
|
+
# @return [void]
|
38
|
+
def self.add_categories(item, maker)
|
39
|
+
item.categories.each { |category| maker.categories.new_category.content = category }
|
40
|
+
end
|
41
|
+
private_class_method :add_categories
|
42
|
+
|
43
|
+
##
|
44
|
+
# Adds an enclosure, if there should be one.
|
45
|
+
#
|
46
|
+
# @param item [Html2rss::Item]
|
47
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
48
|
+
# @return [void]
|
49
|
+
def self.add_enclosure(item, maker)
|
50
|
+
return unless item.enclosure?
|
51
|
+
|
52
|
+
set_enclosure_attributes(item.enclosure, maker.enclosure)
|
53
|
+
end
|
54
|
+
private_class_method :add_enclosure
|
55
|
+
|
56
|
+
##
|
57
|
+
# Sets the attributes of an RSS enclosure.
|
58
|
+
#
|
59
|
+
# @param item_enclosure [Html2rss::Enclosure]
|
60
|
+
# @param rss_enclosure [RSS::Maker::RSS20::Items::Enclosure]
|
61
|
+
# @return [void]
|
62
|
+
def self.set_enclosure_attributes(item_enclosure, rss_enclosure)
|
63
|
+
rss_enclosure.type = item_enclosure.type
|
64
|
+
rss_enclosure.length = item_enclosure.bits_length
|
65
|
+
rss_enclosure.url = item_enclosure.url
|
66
|
+
end
|
67
|
+
private_class_method :set_enclosure_attributes
|
68
|
+
|
69
|
+
##
|
70
|
+
# Adds a non-permalink GUID to the item.
|
71
|
+
#
|
72
|
+
# @param item [Html2rss::Item]
|
73
|
+
# @param maker [RSS::Maker::RSS20::Items::Item]
|
74
|
+
# @return [void]
|
75
|
+
def self.add_guid(item, maker)
|
76
|
+
guid = maker.guid
|
77
|
+
guid.content = item.guid
|
78
|
+
guid.isPermaLink = false
|
79
|
+
end
|
80
|
+
private_class_method :add_guid
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module RssBuilder
|
5
|
+
##
|
6
|
+
# Adds XML stylesheet tags (with the provided maker).
|
7
|
+
class Stylesheet
|
8
|
+
##
|
9
|
+
# Adds the stylesheet XML tags to the RSS.
|
10
|
+
#
|
11
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
12
|
+
# @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
|
13
|
+
# @return [nil]
|
14
|
+
def self.add(maker, stylesheets)
|
15
|
+
stylesheets.each do |stylesheet|
|
16
|
+
add_stylesheet(maker, stylesheet)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
##
|
21
|
+
# Adds a single Stylesheet to the RSS.
|
22
|
+
#
|
23
|
+
# @param maker [RSS::Maker::RSS20] RSS maker object.
|
24
|
+
# @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
|
25
|
+
# @return [nil]
|
26
|
+
def self.add_stylesheet(maker, stylesheet)
|
27
|
+
maker.xml_stylesheets.new_xml_stylesheet do |xss|
|
28
|
+
xss.href = stylesheet.href
|
29
|
+
xss.type = stylesheet.type
|
30
|
+
xss.media = stylesheet.media
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
private_class_method :add_stylesheet
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'rss'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
##
|
7
|
+
# Builds the RSS 2.0 feed, which consists of the '<channel>' and the '<item>'s
|
8
|
+
# tags in the RSS.
|
9
|
+
module RssBuilder
|
10
|
+
# Possible tags inside a RSS 2.0 <channel> tag.
|
11
|
+
CHANNEL_TAGS = %i[language author title description link ttl].freeze
|
12
|
+
# Possible tags inside a RSS 2.0 <item> tag.
|
13
|
+
ITEM_TAGS = %i[title link description author comments updated].freeze
|
14
|
+
|
15
|
+
##
|
16
|
+
# Builds an RSS 2.0 feed based on the provided configuration.
|
17
|
+
#
|
18
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
19
|
+
# @return [RSS::Rss] RSS feed object.
|
20
|
+
def self.build(config)
|
21
|
+
RSS::Maker.make('2.0') do |maker|
|
22
|
+
add_stylesheets(maker, config.stylesheets)
|
23
|
+
add_channel(maker, config)
|
24
|
+
add_items(maker, config)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
##
|
29
|
+
# Adds stylesheets to the RSS maker.
|
30
|
+
#
|
31
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
32
|
+
# @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations to add.
|
33
|
+
def self.add_stylesheets(maker, stylesheets)
|
34
|
+
Stylesheet.add(maker, stylesheets)
|
35
|
+
end
|
36
|
+
|
37
|
+
##
|
38
|
+
# Adds channel information to the RSS maker.
|
39
|
+
#
|
40
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
41
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
42
|
+
def self.add_channel(maker, config)
|
43
|
+
channel = maker.channel
|
44
|
+
CHANNEL_TAGS.each do |tag|
|
45
|
+
Channel.add(channel, config, [tag])
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
##
|
50
|
+
# Adds items to the RSS maker based on configuration.
|
51
|
+
#
|
52
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
53
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
54
|
+
def self.add_items(maker, config)
|
55
|
+
item_attributes = extract_item_attributes(config)
|
56
|
+
items = fetch_items(config)
|
57
|
+
items.reverse! if config.items_order == :reverse
|
58
|
+
|
59
|
+
items.each do |item|
|
60
|
+
add_item(maker, item, item_attributes)
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
##
|
65
|
+
# Adds a single item to the RSS maker.
|
66
|
+
#
|
67
|
+
# @param maker [RSS::Maker] RSS maker instance.
|
68
|
+
# @param item [Html2rss::Item] Item to add.
|
69
|
+
# @param item_attributes [Array<Symbol>] Array of item attributes.
|
70
|
+
# @return [nil]
|
71
|
+
def self.add_item(maker, item, item_attributes)
|
72
|
+
new_item = maker.items.new_item
|
73
|
+
Item.add(new_item, item, item_attributes)
|
74
|
+
end
|
75
|
+
|
76
|
+
##
|
77
|
+
# Extracts item attributes from configuration.
|
78
|
+
#
|
79
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
80
|
+
# @return [Array<Symbol>] Array of item attributes.
|
81
|
+
def self.extract_item_attributes(config)
|
82
|
+
config.item_selector_names & ITEM_TAGS
|
83
|
+
end
|
84
|
+
|
85
|
+
##
|
86
|
+
# Fetches items from the URL specified in configuration.
|
87
|
+
#
|
88
|
+
# @param config [Html2rss::Config] Configuration object containing feed details.
|
89
|
+
# @return [Array<Html2rss::Item>] Array of items.
|
90
|
+
def self.fetch_items(config)
|
91
|
+
Html2rss::Item.from_url(config.url, config)
|
92
|
+
end
|
93
|
+
|
94
|
+
private_class_method :extract_item_attributes, :fetch_items, :add_item
|
95
|
+
end
|
96
|
+
end
|
data/lib/html2rss/utils.rb
CHANGED
@@ -1,40 +1,115 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
2
3
|
require 'addressable/uri'
|
3
|
-
require '
|
4
|
+
require 'faraday'
|
5
|
+
require 'faraday/follow_redirects'
|
4
6
|
require 'json'
|
5
|
-
require '
|
7
|
+
require 'regexp_parser'
|
8
|
+
require 'tzinfo'
|
9
|
+
require 'mime/types'
|
10
|
+
require_relative 'object_to_xml_converter'
|
6
11
|
|
7
12
|
module Html2rss
|
8
13
|
##
|
9
14
|
# The collecting tank for utility methods.
|
10
15
|
module Utils
|
16
|
+
##
|
17
|
+
# @param url [String, Addressable::URI]
|
18
|
+
# @param base_url [String]
|
19
|
+
# @return [Addressable::URI]
|
11
20
|
def self.build_absolute_url_from_relative(url, base_url)
|
12
|
-
url = URI(url)
|
21
|
+
url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
|
13
22
|
|
14
23
|
return url if url.absolute?
|
15
24
|
|
16
|
-
URI(base_url)
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
25
|
+
base_uri = Addressable::URI.parse(base_url)
|
26
|
+
base_uri.path = '/' if base_uri.path.empty?
|
27
|
+
|
28
|
+
base_uri.join(url).normalize
|
21
29
|
end
|
22
30
|
|
23
|
-
|
24
|
-
|
31
|
+
##
|
32
|
+
# Collapses runs of whitespace into single spaces, strips the ends, then parses and normalizes the given URL.
|
33
|
+
# @param url [String]
|
34
|
+
# @return [String, nil] sanitized and normalized URL, or nil if input is empty
|
35
|
+
def self.sanitize_url(url)
|
36
|
+
url = url.to_s.gsub(/\s+/, ' ').strip
|
37
|
+
return if url.empty?
|
38
|
+
|
39
|
+
Addressable::URI.parse(url).normalize.to_s
|
25
40
|
end
|
26
41
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
42
|
+
##
|
43
|
+
# Allows override of time zone locally inside supplied block; resets previous time zone when done.
|
44
|
+
#
|
45
|
+
# @param time_zone [String]
|
46
|
+
# @param default_time_zone [String]
|
47
|
+
# @return [Object] whatever the given block returns
|
48
|
+
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
49
|
+
raise ArgumentError, 'a block is required' unless block_given?
|
50
|
+
|
51
|
+
time_zone = TZInfo::Timezone.get(time_zone)
|
52
|
+
|
53
|
+
prev_tz = ENV.fetch('TZ', default_time_zone)
|
54
|
+
ENV['TZ'] = time_zone.name
|
55
|
+
yield
|
56
|
+
ensure
|
57
|
+
ENV['TZ'] = prev_tz if prev_tz
|
31
58
|
end
|
32
59
|
|
33
|
-
|
34
|
-
|
35
|
-
|
60
|
+
##
|
61
|
+
# Builds a titleized representation of the URL.
|
62
|
+
# @param url [String, Addressable::URI]
|
63
|
+
# @return [String]
|
64
|
+
def self.titleized_url(url)
|
65
|
+
uri = Addressable::URI.parse(url)
|
66
|
+
host = uri.host
|
67
|
+
|
68
|
+
nicer_path = uri.path.split('/').reject(&:empty?)
|
69
|
+
nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
|
70
|
+
end
|
71
|
+
|
72
|
+
##
|
73
|
+
# @param url [String, Addressable::URI]
|
74
|
+
# @param convert_json_to_xml [true, false] Should JSON be converted to XML
|
75
|
+
# @param headers [Hash] additional HTTP request headers to use for the request
|
76
|
+
# @return [String] body of the HTTP response
|
77
|
+
def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
|
78
|
+
response = Faraday.new(url:, headers:) do |faraday|
|
79
|
+
faraday.use Faraday::FollowRedirects::Middleware
|
80
|
+
faraday.adapter Faraday.default_adapter
|
81
|
+
end.get
|
82
|
+
|
83
|
+
body = response.body
|
84
|
+
|
85
|
+
convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
|
86
|
+
end
|
87
|
+
|
88
|
+
##
|
89
|
+
# Parses the given String and builds a Regexp out of it.
|
90
|
+
#
|
91
|
+
# It will remove one pair of surrounding slashes ('/') from the String
|
92
|
+
# to maintain backwards compatibility before building the Regexp.
|
93
|
+
#
|
94
|
+
# @param string [String]
|
95
|
+
# @return [Regexp]
|
96
|
+
def self.build_regexp_from_string(string)
|
97
|
+
raise ArgumentError, 'must be a string!' unless string.is_a?(String)
|
98
|
+
|
99
|
+
string = string[1..-2] if string.start_with?('/') && string.end_with?('/')
|
100
|
+
Regexp::Parser.parse(string, options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE).to_re
|
101
|
+
end
|
102
|
+
|
103
|
+
##
|
104
|
+
# Guesses the content type based on the file extension of the URL.
|
105
|
+
#
|
106
|
+
# @param url [String, Addressable::URI]
|
107
|
+
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
108
|
+
def self.guess_content_type_from_url(url)
|
109
|
+
url = url.to_s.split('?').first
|
36
110
|
|
37
|
-
|
111
|
+
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
112
|
+
content_type.first&.to_s || 'application/octet-stream'
|
38
113
|
end
|
39
114
|
end
|
40
115
|
end
|