html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'mime/types'
|
|
4
|
-
|
|
5
|
-
module Html2rss
|
|
6
|
-
module RssBuilder
|
|
7
|
-
##
|
|
8
|
-
# Builds an <item> tag (with the provided maker).
|
|
9
|
-
class Item
|
|
10
|
-
# Tags which should be processed every time and require non-trivial assignments/treatments.
|
|
11
|
-
SPECIAL_TAGS = %i[categories enclosure guid].freeze
|
|
12
|
-
|
|
13
|
-
##
|
|
14
|
-
# Adds the item to the Item Maker
|
|
15
|
-
#
|
|
16
|
-
# @param maker [RSS::Maker::RSS20::Items::Item]
|
|
17
|
-
# @param item [Html2rss::Item]
|
|
18
|
-
# @param tags [Set<Symbol>]
|
|
19
|
-
# @return nil
|
|
20
|
-
def self.add(maker, item, tags)
|
|
21
|
-
tags.each do |tag|
|
|
22
|
-
next if SPECIAL_TAGS.include?(tag)
|
|
23
|
-
|
|
24
|
-
maker.public_send(:"#{tag}=", item.public_send(tag))
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
SPECIAL_TAGS.each do |tag|
|
|
28
|
-
send(:"add_#{tag}", item, maker)
|
|
29
|
-
end
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
##
|
|
33
|
-
# Adds the <category> tags, if there should be any.
|
|
34
|
-
#
|
|
35
|
-
# @param item [Html2rss::Item]
|
|
36
|
-
# @param maker [RSS::Maker::RSS20::Items::Item]
|
|
37
|
-
# @return nil
|
|
38
|
-
def self.add_categories(item, maker)
|
|
39
|
-
item.categories.each { |category| maker.categories.new_category.content = category }
|
|
40
|
-
end
|
|
41
|
-
private_class_method :add_categories
|
|
42
|
-
|
|
43
|
-
##
|
|
44
|
-
# Adds an enclosure, if there should be one.
|
|
45
|
-
#
|
|
46
|
-
# @param item [Html2rss::Item]
|
|
47
|
-
# @param maker [RSS::Maker::RSS20::Items::Item]
|
|
48
|
-
# @return nil
|
|
49
|
-
def self.add_enclosure(item, maker)
|
|
50
|
-
return unless item.enclosure?
|
|
51
|
-
|
|
52
|
-
set_enclosure_attributes(item.enclosure, maker.enclosure)
|
|
53
|
-
end
|
|
54
|
-
private_class_method :add_enclosure
|
|
55
|
-
|
|
56
|
-
##
|
|
57
|
-
# Sets the attributes of an RSS enclosure.
|
|
58
|
-
#
|
|
59
|
-
# @param item_enclosure [Html2rss::Enclosure]
|
|
60
|
-
# @param rss_enclosure [RSS::Maker::RSS20::Items::Enclosure]
|
|
61
|
-
# @return nil
|
|
62
|
-
def self.set_enclosure_attributes(item_enclosure, rss_enclosure)
|
|
63
|
-
rss_enclosure.type = item_enclosure.type
|
|
64
|
-
rss_enclosure.length = item_enclosure.bits_length
|
|
65
|
-
rss_enclosure.url = item_enclosure.url
|
|
66
|
-
end
|
|
67
|
-
private_class_method :set_enclosure_attributes
|
|
68
|
-
|
|
69
|
-
##
|
|
70
|
-
# Adds a non-permalink GUID to the item.
|
|
71
|
-
#
|
|
72
|
-
# @param item [Html2rss::Item]
|
|
73
|
-
# @param maker [RSS::Maker::RSS20::Items::Item]
|
|
74
|
-
# @return nil
|
|
75
|
-
def self.add_guid(item, maker)
|
|
76
|
-
guid = maker.guid
|
|
77
|
-
guid.content = item.guid
|
|
78
|
-
guid.isPermaLink = false
|
|
79
|
-
end
|
|
80
|
-
private_class_method :add_guid
|
|
81
|
-
end
|
|
82
|
-
end
|
|
83
|
-
end
|
data/lib/html2rss/utils.rb
DELETED
|
@@ -1,113 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'addressable/uri'
|
|
4
|
-
require 'json'
|
|
5
|
-
require 'regexp_parser'
|
|
6
|
-
require 'tzinfo'
|
|
7
|
-
require 'mime/types'
|
|
8
|
-
require_relative 'object_to_xml_converter'
|
|
9
|
-
|
|
10
|
-
module Html2rss
|
|
11
|
-
##
|
|
12
|
-
# The collecting tank for utility methods.
|
|
13
|
-
module Utils
|
|
14
|
-
##
|
|
15
|
-
# @param url [String, Addressable::URI]
|
|
16
|
-
# @param base_url [String, Addressable::URI]
|
|
17
|
-
# @return [Addressable::URI]
|
|
18
|
-
def self.build_absolute_url_from_relative(url, base_url)
|
|
19
|
-
url = Addressable::URI.parse(url)
|
|
20
|
-
return url if url.absolute?
|
|
21
|
-
|
|
22
|
-
base_uri = Addressable::URI.parse(base_url)
|
|
23
|
-
base_uri.path = '/' if base_uri.path.empty?
|
|
24
|
-
|
|
25
|
-
base_uri.join(url).normalize
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
##
|
|
29
|
-
# Removes any space, parses and normalizes the given url.
|
|
30
|
-
# @param url [String]
|
|
31
|
-
# @return [Addressable::URI, nil] normalized URL, or nil if input is empty
|
|
32
|
-
def self.sanitize_url(url)
|
|
33
|
-
url = url.to_s.gsub(/\s+/, ' ').strip
|
|
34
|
-
return if url.empty?
|
|
35
|
-
|
|
36
|
-
Addressable::URI.parse(url).normalize
|
|
37
|
-
end
|
|
38
|
-
|
|
39
|
-
##
|
|
40
|
-
# Allows override of time zone locally inside supplied block; resets previous time zone when done.
|
|
41
|
-
#
|
|
42
|
-
# @param time_zone [String]
|
|
43
|
-
# @param default_time_zone [String]
|
|
44
|
-
# @yield block to execute with the given time zone
|
|
45
|
-
# @return [Object] whatever the given block returns
|
|
46
|
-
def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
|
|
47
|
-
raise ArgumentError, 'a block is required' unless block_given?
|
|
48
|
-
|
|
49
|
-
time_zone = TZInfo::Timezone.get(time_zone)
|
|
50
|
-
|
|
51
|
-
prev_tz = ENV.fetch('TZ', default_time_zone)
|
|
52
|
-
ENV['TZ'] = time_zone.name
|
|
53
|
-
yield
|
|
54
|
-
ensure
|
|
55
|
-
ENV['TZ'] = prev_tz if prev_tz
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
##
|
|
59
|
-
# Builds a titleized representation of the URL with prefixed host.
|
|
60
|
-
# @param url [Addressable::URI]
|
|
61
|
-
# @return [String]
|
|
62
|
-
def self.titleized_channel_url(url)
|
|
63
|
-
nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
|
|
64
|
-
host = url.host
|
|
65
|
-
|
|
66
|
-
nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
##
|
|
70
|
-
# Builds a titleized representation of the URL.
|
|
71
|
-
# @param url [Addressable::URI]
|
|
72
|
-
# @return [String]
|
|
73
|
-
def self.titleized_url(url)
|
|
74
|
-
return '' if url.path.empty?
|
|
75
|
-
|
|
76
|
-
nicer_path = CGI.unescapeURIComponent(url.path)
|
|
77
|
-
.split('/')
|
|
78
|
-
.flat_map do |part|
|
|
79
|
-
part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
|
|
80
|
-
end
|
|
81
|
-
|
|
82
|
-
nicer_path.map!(&:capitalize)
|
|
83
|
-
File.basename nicer_path.join(' '), '.*'
|
|
84
|
-
end
|
|
85
|
-
|
|
86
|
-
##
|
|
87
|
-
# Parses the given String and builds a Regexp out of it.
|
|
88
|
-
#
|
|
89
|
-
# It will remove one pair of surrounding slashes ('/') from the String
|
|
90
|
-
# to maintain backwards compatibility before building the Regexp.
|
|
91
|
-
#
|
|
92
|
-
# @param string [String]
|
|
93
|
-
# @return [Regexp]
|
|
94
|
-
def self.build_regexp_from_string(string)
|
|
95
|
-
raise ArgumentError, 'must be a string!' unless string.is_a?(String)
|
|
96
|
-
|
|
97
|
-
string = string[1..-2] if string.start_with?('/') && string.end_with?('/')
|
|
98
|
-
Regexp::Parser.parse(string, options: ::Regexp::EXTENDED | ::Regexp::IGNORECASE).to_re
|
|
99
|
-
end
|
|
100
|
-
|
|
101
|
-
##
|
|
102
|
-
# Guesses the content type based on the file extension of the URL.
|
|
103
|
-
#
|
|
104
|
-
# @param url [Addressable::URI]
|
|
105
|
-
# @return [String] guessed content type, defaults to 'application/octet-stream'
|
|
106
|
-
def self.guess_content_type_from_url(url)
|
|
107
|
-
url = url.path.split('?').first
|
|
108
|
-
|
|
109
|
-
content_type = MIME::Types.type_for(File.extname(url).delete('.'))
|
|
110
|
-
content_type.first&.to_s || 'application/octet-stream'
|
|
111
|
-
end
|
|
112
|
-
end
|
|
113
|
-
end
|