html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'sanitize'
|
|
4
|
-
require_relative 'html_transformers/transform_urls_to_absolute_ones'
|
|
5
|
-
require_relative 'html_transformers/wrap_img_in_a'
|
|
6
|
-
|
|
7
|
-
module Html2rss
|
|
8
|
-
module AttributePostProcessors
|
|
9
|
-
##
|
|
10
|
-
# Returns sanitized HTML code as String.
|
|
11
|
-
#
|
|
12
|
-
# It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
|
|
13
|
-
# [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
|
|
14
|
-
#
|
|
15
|
-
# Furthermore, it adds:
|
|
16
|
-
#
|
|
17
|
-
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
|
18
|
-
# - `referrer-policy='no-referrer'` to <img> tags
|
|
19
|
-
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
|
20
|
-
# linking to the <img>'s `src`.
|
|
21
|
-
#
|
|
22
|
-
# Imagine this HTML structure:
|
|
23
|
-
#
|
|
24
|
-
# <section>
|
|
25
|
-
# Lorem <b>ipsum</b> dolor...
|
|
26
|
-
# <iframe src="https://evil.corp/miner"></iframe>
|
|
27
|
-
# <script>alert();</script>
|
|
28
|
-
# </section>
|
|
29
|
-
#
|
|
30
|
-
# YAML usage example:
|
|
31
|
-
#
|
|
32
|
-
# selectors:
|
|
33
|
-
# description:
|
|
34
|
-
# selector: '.section'
|
|
35
|
-
# extractor: html
|
|
36
|
-
# post_process:
|
|
37
|
-
# name: sanitize_html
|
|
38
|
-
#
|
|
39
|
-
# Would return:
|
|
40
|
-
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
|
41
|
-
class SanitizeHtml < Base
|
|
42
|
-
def self.validate_args!(value, context)
|
|
43
|
-
assert_type value, String, :value, context:
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
##
|
|
47
|
-
# Shorthand method to get the sanitized HTML.
|
|
48
|
-
# @param html [String]
|
|
49
|
-
# @param url [String, Addressable::URI]
|
|
50
|
-
def self.get(html, url)
|
|
51
|
-
raise ArgumentError, 'url must be a String or Addressable::URI' if url.to_s.empty?
|
|
52
|
-
return nil if html.to_s.empty?
|
|
53
|
-
|
|
54
|
-
new(html, { config: Config::Channel.new({ url: }) }).get
|
|
55
|
-
end
|
|
56
|
-
|
|
57
|
-
##
|
|
58
|
-
# @return [String]
|
|
59
|
-
def get
|
|
60
|
-
sanitized_html = Sanitize.fragment(value, sanitize_config)
|
|
61
|
-
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
private
|
|
65
|
-
|
|
66
|
-
##
|
|
67
|
-
# @return [Sanitize::Config]
|
|
68
|
-
def sanitize_config
|
|
69
|
-
Sanitize::Config.merge(
|
|
70
|
-
Sanitize::Config::RELAXED,
|
|
71
|
-
attributes: { all: %w[dir lang alt title translate] },
|
|
72
|
-
add_attributes:,
|
|
73
|
-
transformers: [
|
|
74
|
-
method(:transform_urls_to_absolute_ones),
|
|
75
|
-
method(:wrap_img_in_a)
|
|
76
|
-
]
|
|
77
|
-
)
|
|
78
|
-
end
|
|
79
|
-
|
|
80
|
-
def add_attributes
|
|
81
|
-
{
|
|
82
|
-
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
|
83
|
-
'img' => { 'referrer-policy' => 'no-referrer' }
|
|
84
|
-
}
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
def channel_url = context[:config].url
|
|
88
|
-
|
|
89
|
-
##
|
|
90
|
-
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
|
91
|
-
#
|
|
92
|
-
# @param env [Hash]
|
|
93
|
-
# @return [nil]
|
|
94
|
-
def transform_urls_to_absolute_ones(env)
|
|
95
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
|
96
|
-
end
|
|
97
|
-
|
|
98
|
-
##
|
|
99
|
-
# Wrapper for wrap_img_in_a.
|
|
100
|
-
#
|
|
101
|
-
# @param env [Hash]
|
|
102
|
-
# @return [nil]
|
|
103
|
-
def wrap_img_in_a(env)
|
|
104
|
-
HtmlTransformers::WrapImgInA.new.call(**env)
|
|
105
|
-
end
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
end
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Returns a defined part of a String.
|
|
7
|
-
#
|
|
8
|
-
# Both parameters must be an Integer and they can be negative.
|
|
9
|
-
# The +end+ parameter can be omitted, in that case it will not cut the
|
|
10
|
-
# String at the end.
|
|
11
|
-
#
|
|
12
|
-
# A Regexp or a MatchString is not supported.
|
|
13
|
-
#
|
|
14
|
-
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
|
15
|
-
# documentation for more information.
|
|
16
|
-
#
|
|
17
|
-
# Imagine this HTML:
|
|
18
|
-
# <h1>Foo bar and baz<h1>
|
|
19
|
-
#
|
|
20
|
-
# YAML usage example:
|
|
21
|
-
# selectors:
|
|
22
|
-
# title:
|
|
23
|
-
# selector: h1
|
|
24
|
-
# post_process:
|
|
25
|
-
# name: substring
|
|
26
|
-
# start: 4
|
|
27
|
-
# end: 6
|
|
28
|
-
#
|
|
29
|
-
# Would return:
|
|
30
|
-
# 'bar'
|
|
31
|
-
class Substring < Base
|
|
32
|
-
def self.validate_args!(value, context)
|
|
33
|
-
assert_type value, String, :value, context:
|
|
34
|
-
|
|
35
|
-
options = context[:options]
|
|
36
|
-
assert_type options[:start], Integer, :start, context:
|
|
37
|
-
|
|
38
|
-
end_index = options[:end]
|
|
39
|
-
assert_type(end_index, Integer, :end, context:) if end_index
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
##
|
|
43
|
-
# Extracts the substring from the original string based on the provided start and end indices.
|
|
44
|
-
#
|
|
45
|
-
# @return [String] The extracted substring.
|
|
46
|
-
def get
|
|
47
|
-
value[range]
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
##
|
|
51
|
-
# Determines the range for the substring extraction based on the provided start and end indices.
|
|
52
|
-
#
|
|
53
|
-
# @return [Range] The range object representing the start and end/Infinity (integers).
|
|
54
|
-
def range
|
|
55
|
-
return (start_index..) unless end_index?
|
|
56
|
-
|
|
57
|
-
if start_index == end_index
|
|
58
|
-
raise ArgumentError,
|
|
59
|
-
'The `start` value must be unequal to the `end` value.'
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
(start_index..end_index)
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
private
|
|
66
|
-
|
|
67
|
-
def end_index? = !context[:options][:end].to_s.empty?
|
|
68
|
-
def end_index = context[:options][:end].to_i
|
|
69
|
-
def start_index = context[:options][:start].to_i
|
|
70
|
-
end
|
|
71
|
-
end
|
|
72
|
-
end
|
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
module AttributePostProcessors
|
|
5
|
-
##
|
|
6
|
-
# Returns a formatted String according to the string pattern.
|
|
7
|
-
#
|
|
8
|
-
# If +self+ is used, the selectors extracted value will be used.
|
|
9
|
-
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
|
10
|
-
#
|
|
11
|
-
# Imagine this HTML:
|
|
12
|
-
#
|
|
13
|
-
# <li>
|
|
14
|
-
# <h1>Product</h1>
|
|
15
|
-
# <span class="price">23,42€</span>
|
|
16
|
-
# </li>
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
# YAML usage example:
|
|
20
|
-
#
|
|
21
|
-
# selectors:
|
|
22
|
-
# items:
|
|
23
|
-
# selector: 'li'
|
|
24
|
-
# price:
|
|
25
|
-
# selector: '.price'
|
|
26
|
-
# title:
|
|
27
|
-
# selector: h1
|
|
28
|
-
# post_process:
|
|
29
|
-
# name: template
|
|
30
|
-
# string: '%{self} (%{price})'
|
|
31
|
-
#
|
|
32
|
-
# Would return:
|
|
33
|
-
# 'Product (23,42€)'
|
|
34
|
-
class Template < Base
|
|
35
|
-
def self.validate_args!(value, context)
|
|
36
|
-
assert_type value, String, :value, context:
|
|
37
|
-
|
|
38
|
-
string = context[:options]&.dig(:string).to_s
|
|
39
|
-
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
|
40
|
-
end
|
|
41
|
-
|
|
42
|
-
##
|
|
43
|
-
# @param value [String]
|
|
44
|
-
# @param context [Item::Context]
|
|
45
|
-
def initialize(value, context)
|
|
46
|
-
super
|
|
47
|
-
|
|
48
|
-
@options = context[:options] || {}
|
|
49
|
-
@item = context[:item]
|
|
50
|
-
@string = @options[:string].to_s
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
##
|
|
54
|
-
# @return [String]
|
|
55
|
-
def get
|
|
56
|
-
@options[:methods] ? format_string_with_methods : format_string_with_dynamic_params
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
private
|
|
60
|
-
|
|
61
|
-
##
|
|
62
|
-
# @return [String] the string containing the template
|
|
63
|
-
attr_reader :string
|
|
64
|
-
|
|
65
|
-
##
|
|
66
|
-
# @return [Array<String>]
|
|
67
|
-
def methods
|
|
68
|
-
@methods ||= @options[:methods].map { |method_name| item_value(method_name) }
|
|
69
|
-
end
|
|
70
|
-
|
|
71
|
-
##
|
|
72
|
-
# Formats a string using methods.
|
|
73
|
-
#
|
|
74
|
-
# @return [String]
|
|
75
|
-
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
|
76
|
-
def format_string_with_methods
|
|
77
|
-
Log.warn '[DEPRECATION] This method of using params is deprecated and \
|
|
78
|
-
support for it will be removed in version 1.0.0.\
|
|
79
|
-
Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'
|
|
80
|
-
|
|
81
|
-
string % methods
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
##
|
|
85
|
-
# @return [String]
|
|
86
|
-
def format_string_with_dynamic_params
|
|
87
|
-
param_names = string.scan(/%[<|{](\w*)[>|}]/)
|
|
88
|
-
param_names.flatten!
|
|
89
|
-
|
|
90
|
-
format(string, param_names.to_h { |name| [name.to_sym, item_value(name)] })
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
##
|
|
94
|
-
# @param method_name [String, Symbol]
|
|
95
|
-
# @return [String]
|
|
96
|
-
def item_value(method_name)
|
|
97
|
-
method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
|
|
98
|
-
end
|
|
99
|
-
end
|
|
100
|
-
end
|
|
101
|
-
end
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
##
|
|
5
|
-
# Provides a namespace for attribute post processors.
|
|
6
|
-
module AttributePostProcessors
|
|
7
|
-
##
|
|
8
|
-
# Error raised when an unknown post processor name is requested.
|
|
9
|
-
class UnknownPostProcessorName < Html2rss::Error; end
|
|
10
|
-
|
|
11
|
-
##
|
|
12
|
-
# Error raised when a required option is missing.
|
|
13
|
-
class MissingOption < Html2rss::Error; end
|
|
14
|
-
|
|
15
|
-
##
|
|
16
|
-
# Error raised when an invalid type is provided.
|
|
17
|
-
class InvalidType < Html2rss::Error; end
|
|
18
|
-
|
|
19
|
-
##
|
|
20
|
-
# Maps the post processor name to the class implementing the post processor.
|
|
21
|
-
#
|
|
22
|
-
# The key is the name to use in the feed config.
|
|
23
|
-
NAME_TO_CLASS = {
|
|
24
|
-
gsub: Gsub,
|
|
25
|
-
html_to_markdown: HtmlToMarkdown,
|
|
26
|
-
markdown_to_html: MarkdownToHtml,
|
|
27
|
-
parse_time: ParseTime,
|
|
28
|
-
parse_uri: ParseUri,
|
|
29
|
-
sanitize_html: SanitizeHtml,
|
|
30
|
-
substring: Substring,
|
|
31
|
-
template: Template
|
|
32
|
-
}.freeze
|
|
33
|
-
|
|
34
|
-
##
|
|
35
|
-
# Retrieves the attribute post processor class based on the given name.
|
|
36
|
-
#
|
|
37
|
-
# @param name [Symbol] The name of the post processor.
|
|
38
|
-
# @return [Class] The attribute post processor class.
|
|
39
|
-
# @raise [UnknownPostProcessorName] If the requested name is not found in NAME_TO_CLASS.
|
|
40
|
-
def self.get_processor(name)
|
|
41
|
-
NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Can't find a post processor named '#{name}'")
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
end
|
|
@@ -1,127 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'zlib'
|
|
4
|
-
require 'sanitize'
|
|
5
|
-
require 'nokogiri'
|
|
6
|
-
|
|
7
|
-
module Html2rss
|
|
8
|
-
class AutoSource
|
|
9
|
-
##
|
|
10
|
-
# Article is a simple data object representing an article extracted from a page.
|
|
11
|
-
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
|
12
|
-
class Article
|
|
13
|
-
include Enumerable
|
|
14
|
-
include Comparable
|
|
15
|
-
|
|
16
|
-
PROVIDED_KEYS = %i[id title description url image guid published_at scraper].freeze
|
|
17
|
-
|
|
18
|
-
##
|
|
19
|
-
# Removes the specified pattern from the beginning of the text
|
|
20
|
-
# within a given range if the pattern occurs before the range's end.
|
|
21
|
-
#
|
|
22
|
-
# @param text [String]
|
|
23
|
-
# @param pattern [String]
|
|
24
|
-
# @param end_of_range [Integer] - Optional, defaults to half the size of the text
|
|
25
|
-
# @return [String]
|
|
26
|
-
def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
|
|
27
|
-
return text unless text.is_a?(String) && pattern.is_a?(String)
|
|
28
|
-
|
|
29
|
-
index = text.index(pattern)
|
|
30
|
-
return text if index.nil? || index >= end_of_range
|
|
31
|
-
|
|
32
|
-
text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
##
|
|
36
|
-
# Checks if the text contains HTML tags.
|
|
37
|
-
# @param text [String]
|
|
38
|
-
# @return [Boolean]
|
|
39
|
-
def self.contains_html?(text)
|
|
40
|
-
Nokogiri::HTML.fragment(text).children.any?(&:element?)
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
# @param options [Hash<Symbol, String>]
|
|
44
|
-
def initialize(**options)
|
|
45
|
-
@to_h = {}
|
|
46
|
-
options.each_pair { |key, value| @to_h[key] = value.freeze if value }
|
|
47
|
-
@to_h.freeze
|
|
48
|
-
|
|
49
|
-
return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
|
|
50
|
-
|
|
51
|
-
Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
|
|
52
|
-
end
|
|
53
|
-
|
|
54
|
-
# Checks if the article is valid based on the presence of URL, ID, and either title or description.
|
|
55
|
-
# @return [Boolean] True if the article is valid, otherwise false.
|
|
56
|
-
def valid?
|
|
57
|
-
!url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
# @yield [key, value]
|
|
61
|
-
# @return [Enumerator] if no block is given
|
|
62
|
-
def each
|
|
63
|
-
return enum_for(:each) unless block_given?
|
|
64
|
-
|
|
65
|
-
PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
def id
|
|
69
|
-
@to_h[:id]
|
|
70
|
-
end
|
|
71
|
-
|
|
72
|
-
def title
|
|
73
|
-
@to_h[:title]
|
|
74
|
-
end
|
|
75
|
-
|
|
76
|
-
def description
|
|
77
|
-
return @description if defined?(@description)
|
|
78
|
-
|
|
79
|
-
return if (description = @to_h[:description]).to_s.empty?
|
|
80
|
-
|
|
81
|
-
@description = self.class.remove_pattern_from_start(description, title) if title
|
|
82
|
-
|
|
83
|
-
if self.class.contains_html?(@description) && url
|
|
84
|
-
@description = Html2rss::AttributePostProcessors::SanitizeHtml.get(description, url)
|
|
85
|
-
else
|
|
86
|
-
@description
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# @return [Addressable::URI, nil]
|
|
91
|
-
def url
|
|
92
|
-
@url ||= Html2rss::Utils.sanitize_url(@to_h[:url])
|
|
93
|
-
end
|
|
94
|
-
|
|
95
|
-
# @return [Addressable::URI, nil]
|
|
96
|
-
def image
|
|
97
|
-
@image ||= Html2rss::Utils.sanitize_url(@to_h[:image])
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
# Generates a unique identifier based on the URL and ID using CRC32.
|
|
101
|
-
# @return [String]
|
|
102
|
-
def guid
|
|
103
|
-
@guid ||= Zlib.crc32([url, id].join('#!/')).to_s(36).encode('utf-8')
|
|
104
|
-
end
|
|
105
|
-
|
|
106
|
-
# Parses and returns the published_at time.
|
|
107
|
-
# @return [DateTime, nil]
|
|
108
|
-
def published_at
|
|
109
|
-
return if (string = @to_h[:published_at].to_s.strip).empty?
|
|
110
|
-
|
|
111
|
-
@published_at ||= DateTime.parse(string)
|
|
112
|
-
rescue ArgumentError
|
|
113
|
-
nil
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
def scraper
|
|
117
|
-
@to_h[:scraper]
|
|
118
|
-
end
|
|
119
|
-
|
|
120
|
-
def <=>(other)
|
|
121
|
-
return nil unless other.is_a?(Article)
|
|
122
|
-
|
|
123
|
-
0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
end
|
|
127
|
-
end
|
|
@@ -1,78 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
class AutoSource
|
|
5
|
-
##
|
|
6
|
-
# Extracts channel information from
|
|
7
|
-
# 1. the HTML document's <head>.
|
|
8
|
-
# 2. the HTTP response
|
|
9
|
-
class Channel
|
|
10
|
-
##
|
|
11
|
-
#
|
|
12
|
-
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
|
|
13
|
-
# @param url [Addressable::URI] The URL of the channel.
|
|
14
|
-
# @param headers [Hash<String, String>] the http headers
|
|
15
|
-
# @param articles [Array<Html2rss::AutoSource::Article>] The articles.
|
|
16
|
-
def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
|
|
17
|
-
@parsed_body = parsed_body
|
|
18
|
-
@url = url
|
|
19
|
-
@headers = headers
|
|
20
|
-
@articles = articles
|
|
21
|
-
@stylesheets = stylesheets
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
attr_writer :articles
|
|
25
|
-
attr_reader :stylesheets
|
|
26
|
-
|
|
27
|
-
def url = @url.normalize.to_s
|
|
28
|
-
|
|
29
|
-
def title
|
|
30
|
-
@title ||= if (title = parsed_body.at_css('head > title')&.text.to_s) && !title.empty?
|
|
31
|
-
title.gsub(/\s+/, ' ').strip
|
|
32
|
-
else
|
|
33
|
-
Utils.titleized_channel_url(@url)
|
|
34
|
-
end
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
def description = parsed_body.at_css('meta[name="description"]')&.[]('content')
|
|
38
|
-
def last_build_date = headers['last-modified']
|
|
39
|
-
|
|
40
|
-
def language
|
|
41
|
-
return parsed_body['lang'] if parsed_body.name == 'html' && parsed_body['lang']
|
|
42
|
-
|
|
43
|
-
parsed_body.at_css('[lang]')&.[]('lang')
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
def image
|
|
47
|
-
url = parsed_body.at_css('meta[property="og:image"]')&.[]('content')
|
|
48
|
-
Html2rss::Utils.sanitize_url(url) if url
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def ttl
|
|
52
|
-
ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
|
|
53
|
-
return unless ttl
|
|
54
|
-
|
|
55
|
-
ttl.to_i.fdiv(60).ceil
|
|
56
|
-
end
|
|
57
|
-
|
|
58
|
-
def generator
|
|
59
|
-
"html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
|
|
60
|
-
end
|
|
61
|
-
|
|
62
|
-
private
|
|
63
|
-
|
|
64
|
-
attr_reader :parsed_body, :headers
|
|
65
|
-
|
|
66
|
-
def scraper_counts
|
|
67
|
-
scraper_counts = +''
|
|
68
|
-
|
|
69
|
-
@articles.each_with_object(Hash.new(0)) { |article, counts| counts[article.scraper] += 1 }
|
|
70
|
-
.each do |klass, count|
|
|
71
|
-
scraper_counts.concat("[#{klass.to_s.gsub('Html2rss::AutoSource::Scraper::', '')}=#{count}]")
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
scraper_counts
|
|
75
|
-
end
|
|
76
|
-
end
|
|
77
|
-
end
|
|
78
|
-
end
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Html2rss
|
|
4
|
-
class AutoSource
|
|
5
|
-
##
|
|
6
|
-
# Reducer is responsible for reducing the list of articles.
|
|
7
|
-
# It keeps only the longest attributes of articles with the same URL.
|
|
8
|
-
# It also filters out invalid articles.
|
|
9
|
-
class Reducer
|
|
10
|
-
class << self
|
|
11
|
-
def call(articles, **_options)
|
|
12
|
-
Log.debug "Reducer: inited with #{articles.size} articles"
|
|
13
|
-
|
|
14
|
-
reduce_by_keeping_longest_values(articles, keep: [:scraper]) { |article| article.url&.path }
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
private
|
|
18
|
-
|
|
19
|
-
# @param articles [Array<Article>]
|
|
20
|
-
# @return [Array<Article>] reduced articles
|
|
21
|
-
def reduce_by_keeping_longest_values(articles, keep:, &)
|
|
22
|
-
grouped_by_block = articles.group_by(&)
|
|
23
|
-
grouped_by_block.each_with_object([]) do |(_key, grouped_articles), result|
|
|
24
|
-
memo_object = {}
|
|
25
|
-
grouped_articles.each do |article_hash|
|
|
26
|
-
keep_longest_values(memo_object, article_hash, keep:)
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
result << Article.new(**memo_object)
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
def keep_longest_values(memo_object, article_hash, keep:)
|
|
34
|
-
article_hash.each do |key, value|
|
|
35
|
-
next if value.eql?(memo_object[key])
|
|
36
|
-
|
|
37
|
-
if keep.include?(key)
|
|
38
|
-
memo_object[key] ||= []
|
|
39
|
-
memo_object[key] << value
|
|
40
|
-
elsif value && value.to_s.size > memo_object[key].to_s.size
|
|
41
|
-
memo_object[key] = value
|
|
42
|
-
end
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
end
|
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'rss'
|
|
4
|
-
|
|
5
|
-
module Html2rss
|
|
6
|
-
class AutoSource
|
|
7
|
-
##
|
|
8
|
-
# Converts the autosourced channel and articles to an RSS feed.
|
|
9
|
-
class RssBuilder
|
|
10
|
-
def self.add_guid(article, maker)
|
|
11
|
-
maker.guid.tap do |guid|
|
|
12
|
-
guid.content = article.guid
|
|
13
|
-
guid.isPermaLink = false
|
|
14
|
-
end
|
|
15
|
-
end
|
|
16
|
-
|
|
17
|
-
def self.add_image(article, maker)
|
|
18
|
-
url = article.image || return
|
|
19
|
-
|
|
20
|
-
maker.enclosure.tap do |enclosure|
|
|
21
|
-
enclosure.url = url
|
|
22
|
-
enclosure.type = Html2rss::Utils.guess_content_type_from_url(url)
|
|
23
|
-
enclosure.length = 0
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def initialize(channel:, articles:)
|
|
28
|
-
@channel = channel
|
|
29
|
-
@articles = articles
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
def call
|
|
33
|
-
RSS::Maker.make('2.0') do |maker|
|
|
34
|
-
Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)
|
|
35
|
-
|
|
36
|
-
make_channel(maker.channel)
|
|
37
|
-
make_items(maker)
|
|
38
|
-
end
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
private
|
|
42
|
-
|
|
43
|
-
attr_reader :channel, :articles
|
|
44
|
-
|
|
45
|
-
def make_channel(maker)
|
|
46
|
-
%i[language title description ttl].each do |key|
|
|
47
|
-
maker.public_send(:"#{key}=", channel.public_send(key))
|
|
48
|
-
end
|
|
49
|
-
|
|
50
|
-
maker.link = channel.url
|
|
51
|
-
maker.generator = channel.generator
|
|
52
|
-
maker.updated = channel.last_build_date
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def make_items(maker)
|
|
56
|
-
articles.each do |article|
|
|
57
|
-
maker.items.new_item do |item_maker|
|
|
58
|
-
RssBuilder.add_guid(article, item_maker)
|
|
59
|
-
RssBuilder.add_image(article, item_maker)
|
|
60
|
-
|
|
61
|
-
item_maker.title = article.title
|
|
62
|
-
item_maker.description = article.description
|
|
63
|
-
item_maker.pubDate = article.published_at&.rfc2822
|
|
64
|
-
item_maker.link = article.url
|
|
65
|
-
end
|
|
66
|
-
end
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|