html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    ##
    # Normalizes HTTP headers for outgoing requests.
    #
    # Starts from a browser-like baseline (DEFAULT_HEADERS) and layers the
    # caller's headers on top. Caller values win, with these refinements:
    #
    # * `Accept` values supplied by the caller are placed in front of the
    #   default media types instead of replacing them.
    # * `Accept-Language` is derived from the channel language only when the
    #   caller did not provide a non-blank value.
    # * `Host` is derived from the request URL only when the caller did not
    #   provide one.
    class RequestHeaders
      DEFAULT_ACCEPT = %w[
        text/html
        application/xhtml+xml
        application/xml;q=0.9
        image/avif
        image/webp
        image/apng
        */*;q=0.8
      ].join(',').freeze

      DEFAULT_USER_AGENT = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/123.0.0.0',
        'Safari/537.36'
      ].join(' ').freeze

      DEFAULT_HEADERS = {
        'Accept' => DEFAULT_ACCEPT,
        'Cache-Control' => 'max-age=0',
        'Connection' => 'keep-alive',
        'Sec-Fetch-Dest' => 'document',
        'Sec-Fetch-Mode' => 'navigate',
        'Sec-Fetch-Site' => 'none',
        'Sec-Fetch-User' => '?1',
        'Upgrade-Insecure-Requests' => '1',
        'User-Agent' => DEFAULT_USER_AGENT
      }.freeze

      DEFAULT_ACCEPT_LANGUAGE = 'en-US,en;q=0.9'

      class << self
        ##
        # @return [Hash<String, String>] a mutable copy of the default header set
        def browser_defaults
          DEFAULT_HEADERS.dup
        end

        ##
        # Normalizes the provided headers while applying Html2rss defaults.
        #
        # @param headers [Hash, nil] caller provided headers; nil is treated as empty
        # @param channel_language [String, nil] language defined on the channel
        # @param url [String, nil] request URL used to infer the Host header
        # @return [Hash<String, String>] normalized HTTP headers
        def normalize(headers, channel_language:, url:)
          new(headers || {}, channel_language:, url:).to_h
        end
      end

      ##
      # @param headers [Hash] caller provided headers (String or Symbol keys)
      # @param channel_language [String, nil] language defined on the channel
      # @param url [String, nil] request URL used to infer the Host header
      def initialize(headers, channel_language:, url:)
        @headers = headers
        @channel_language = channel_language
        @url = url
      end

      ##
      # Builds the final header hash. Entries whose value is nil are dropped,
      # so the Host header is omitted when no URL is available.
      #
      # @return [Hash<String, String>] normalized HTTP headers
      def to_h
        defaults = DEFAULT_HEADERS.dup
        normalized = normalize_custom_headers(headers)

        # Accept is merged separately so the default media types are kept.
        accept_override = normalized.delete('Accept')
        defaults.merge!(normalized)

        defaults['Accept'] = normalize_accept(accept_override)
        # Only derive Accept-Language when the caller did not set one; an
        # explicit header is a caller override and must not be discarded.
        defaults['Accept-Language'] = build_accept_language if blank?(defaults['Accept-Language'])
        defaults['Host'] ||= request_host

        defaults.compact
      end

      private

      attr_reader :headers, :channel_language, :url

      # Rewrites every key into canonical header casing.
      def normalize_custom_headers(custom)
        custom.transform_keys { canonicalize(_1) }
      end

      # "user-agent" / :accept -> "User-Agent" / "Accept".
      def canonicalize(key)
        key.to_s.split('-').map!(&:capitalize).join('-')
      end

      # Places caller media types (in their given order) in front of the
      # defaults, skipping any value that is already present.
      def normalize_accept(override)
        return DEFAULT_ACCEPT if override.nil? || override.empty?

        values = accept_values(DEFAULT_ACCEPT)

        # Walk the override backwards so that unshift keeps the caller's order.
        accept_values(override).reverse_each do |value|
          next if values.include?(value)

          values.unshift(value)
        end

        values.join(',')
      end

      # Splits a comma separated header into trimmed, non-empty entries.
      def accept_values(header)
        header.split(',').map!(&:strip).reject(&:empty?)
      end

      # Derives an Accept-Language value from the channel language:
      # blank -> "en-US,en;q=0.9", "de" -> "de", "de_de" -> "de-DE,de;q=0.9".
      def build_accept_language
        language = channel_language.to_s.strip
        return DEFAULT_ACCEPT_LANGUAGE if language.empty?

        primary, region = language.tr('_', '-').split('-', 2)
        primary = primary.downcase
        return primary if region.nil?

        "#{primary}-#{region.upcase},#{primary};q=0.9"
      end

      # Host component of the request URL, or nil when no URL was given.
      def request_host
        return nil if url.nil? || url.empty?

        Html2rss::Url.from_absolute(url).host
      end

      # True for nil and for values that are empty after stripping whitespace.
      def blank?(value)
        value.to_s.strip.empty?
      end
    end
  end
end
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    ##
    # Produces the JSON Schema that is exported for configuration files.
    # The schema is derived from the runtime validator contracts and then
    # enriched with descriptive fragments (Components, Selectors).
    module Schema
      module_function

      SCHEMA_FILENAME = 'html2rss-config.schema.json'

      ##
      # Builds the configuration JSON Schema.
      #
      # @return [Hash<String, Object>] JSON Schema with string keys throughout
      def json_schema
        load_json_schema_extension!
        Builder.call
      end

      ##
      # Locates the packaged schema file by walking up from this file's
      # directory until a `schema/<SCHEMA_FILENAME>` entry exists. When the
      # filesystem root is reached without a match, the conventional location
      # three directories above this file is returned instead.
      #
      # @return [String] absolute path to the packaged JSON schema file
      def path
        directory = File.expand_path(__dir__)

        loop do
          packaged = File.join(directory, 'schema', SCHEMA_FILENAME)
          return packaged if File.exist?(packaged)

          parent = File.dirname(directory)
          # dirname of the root is the root itself: nothing left to search.
          return File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__) if parent == directory

          directory = parent
        end
      end

      # Loads the dry-schema extension that provides `json_schema`.
      def load_json_schema_extension!
        require 'dry/schema/extensions/json_schema'
        Dry::Schema.load_extensions(:json_schema)
      end

      ##
      # Assembles the final schema: validator output first, then top-level
      # keywords, then the hand-written property fragments.
      class Builder
        def self.call = new.call

        # @return [Hash<String, Object>] the stringified schema
        def call
          document = validator_schema
          apply_top_level(document)
          assign_properties(document.fetch(:properties))
          DeepStringifier.call(document)
        end

        private

        def validator_schema
          Html2rss::Config::Validator.new.schema.json_schema(loose: true)
        end

        # Adds the dialect marker and the "selectors or auto_source" rule.
        def apply_top_level(schema)
          schema['$schema'] = 'https://json-schema.org/draft/2020-12/schema'
          schema[:anyOf] = %w[selectors auto_source].map { |name| { 'required' => [name] } }
        end

        # Swaps in the documented fragments and hides the internal
        # dynamic_params_error key from the exported schema.
        def assign_properties(properties)
          properties[:headers] = Components.headers
          properties[:stylesheets] = Components.stylesheets
          properties[:auto_source] = Components.auto_source
          properties[:selectors] = Components.selectors
          properties.delete(:dynamic_params_error)
        end
      end

      ##
      # Schema fragments for the top-level configuration properties.
      module Components
        module_function

        # @return [Hash] schema for the `headers` property
        def headers
          {
            type: 'object',
            description: 'HTTP headers applied to every request.',
            additionalProperties: { type: 'string' }
          }
        end

        # @return [Hash] schema for the `stylesheets` property
        def stylesheets
          item_schema = Html2rss::Config::Validator::StylesheetConfig.json_schema(loose: true)

          {
            type: 'array',
            description: 'Collection of stylesheets to attach to the RSS feed.',
            items: item_schema
          }
        end

        # @return [Hash] auto_source schema annotated with its default config
        def auto_source
          Html2rss::AutoSource::Config.json_schema(loose: true).tap do |fragment|
            fragment[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
          end
        end

        # @return [Hash] schema for the `selectors` property
        def selectors
          Selectors.schema
        end
      end

      ##
      # Schema fragments describing the `selectors` configuration object.
      module Selectors
        module_function

        # Matches every key except the reserved selector names.
        RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'

        # @return [Hash] schema for the whole selectors object
        def schema
          {
            type: 'object',
            description: 'Selectors used to extract article attributes.',
            properties: selector_properties,
            patternProperties: pattern_properties,
            additionalProperties: true
          }
        end

        # rubocop:disable Layout/LineLength
        # @return [Hash] schemas for the reserved selector keys
        def selector_properties
          guid_description = 'List of selector keys used to build the GUID. Each entry must reference a sibling selector key; runtime validation enforces those references.'
          categories_description = 'List of selector keys whose values will be used as categories. Each entry must reference a sibling selector key; runtime validation enforces those references.'

          {
            items: items_schema,
            enclosure: enclosure_schema,
            guid: reference_array(guid_description),
            categories: reference_array(categories_description)
          }
        end
        # rubocop:enable Layout/LineLength

        # @return [Hash] schema applied to every non-reserved selector key
        def pattern_properties
          { RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
        end

        def dynamic_selector_schema
          contract_schema = Html2rss::Selectors::Config::Selector.new.schema.json_schema(loose: true)
          contract_schema.merge(description: 'Dynamic selector definition keyed by attribute name.')
        end

        def items_schema
          contract_schema = Html2rss::Selectors::Config::Items.new.schema.json_schema(loose: true)
          contract_schema.merge(description: 'Defines the items selector and optional enhancement settings.')
        end

        def enclosure_schema
          contract_schema = Html2rss::Selectors::Config::Enclosure.new.schema.json_schema(loose: true)
          contract_schema.merge(description: 'Describes enclosure extraction settings.')
        end

        # The schema only requires a non-empty array of strings; checking that
        # each entry names an existing sibling selector is left to runtime
        # validation.
        #
        # @param description [String] text placed in the fragment's description
        # @return [Hash] array-of-strings schema fragment
        def reference_array(description)
          entry = { type: 'string', description: 'Selector key defined elsewhere in this object.' }

          { type: 'array', description:, minItems: 1, items: entry }
        end
      end

      ##
      # Recursively converts hash keys to strings; other values pass through.
      module DeepStringifier
        module_function

        # @param object [Object] hash, array, or leaf value
        # @return [Object] copy of hashes/arrays with stringified keys
        def call(object)
          return stringify_hash(object) if object.is_a?(Hash)
          return object.map { call(_1) } if object.is_a?(Array)

          object
        end

        def stringify_hash(object)
          object.each_with_object({}) do |(key, value), stringified|
            stringified[key.to_s] = call(value)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry-validation'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  class Config
    # Dry::Validation contract for the feed configuration hash.
    # The configuration options adhere to the documented schema in README.md.
    #
    # Structural checks live in the schemas below; the rules add checks the
    # schemas cannot express (header value types, selector contract, channel
    # URL validity, and the "selectors or auto_source" requirement).
    class Validator < Dry::Validation::Contract
      URI_REGEXP = Url::URI_REGEXP
      STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
      # Lowercase two-letter language, optionally followed by "-" and an
      # uppercase two-letter region (e.g. "en" or "en-US").
      LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/

      ChannelConfig = Dry::Schema.Params do
        required(:url).filled(:string, format?: URI_REGEXP)
        optional(:title).maybe(:string)
        optional(:description).maybe(:string)
        optional(:language).maybe(:string, format?: LANGUAGE_FORMAT_REGEX)
        optional(:ttl).maybe(:integer, gt?: 0)
        optional(:time_zone).maybe(:string)
      end

      StylesheetConfig = Dry::Schema.Params do
        required(:href).filled(:string)
        required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
        optional(:media).maybe(:string)
      end

      BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
        required(:selector).filled(:string)
        optional(:max_clicks).filled(:integer, gt?: 0)
        optional(:wait_after_ms).filled(:integer, gteq?: 0)
      end

      BrowserlessPreloadScrollConfig = Dry::Schema.Params do
        optional(:iterations).filled(:integer, gt?: 0)
        optional(:wait_after_ms).filled(:integer, gteq?: 0)
      end

      BrowserlessPreloadConfig = Dry::Schema.Params do
        optional(:wait_after_ms).filled(:integer, gteq?: 0)
        optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
        optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
      end

      BrowserlessRequestConfig = Dry::Schema.Params do
        optional(:preload).hash(BrowserlessPreloadConfig)
      end

      RequestConfig = Dry::Schema.Params do
        optional(:max_redirects).filled(:integer, gteq?: 0)
        optional(:max_requests).filled(:integer, gt?: 0)
        optional(:browserless).hash(BrowserlessRequestConfig)
      end

      params do
        required(:strategy).filled(:symbol)
        required(:channel).hash(ChannelConfig)
        optional(:headers).hash
        optional(:stylesheets).array(StylesheetConfig)
        optional(:auto_source).hash(AutoSource::Config)
        optional(:selectors).hash
        optional(:dynamic_params_error).maybe(:string)
        optional(:request).hash(RequestConfig)
      end

      # Every header value must be a String; failures are reported per header.
      rule(:headers) do
        value&.each do |header_name, header_value|
          next if header_value.is_a?(String)

          key([:headers, header_name]).failure("must be a String, but got #{header_value.class}")
        end
      end

      # Any dynamic_params_error value present is reported as a base failure.
      rule(:dynamic_params_error) do
        next unless value

        base.failure(value)
      end

      # Ensure at least one of :selectors or :auto_source is present.
      rule(:selectors, :auto_source) do
        next if values.key?(:selectors) || values.key?(:auto_source)

        base.failure("Configuration must include at least 'selectors' or 'auto_source'")
      end

      # Delegates selector validation to the selectors contract and reports
      # each of its error messages under :selectors.
      rule(:selectors) do
        next unless value

        Html2rss::Selectors::Config.call(value).errors.each do |error|
          key(:selectors).failure(error.text)
        end
      end

      # URL validation delegated to Url class
      rule(:channel) do
        channel = values[:channel]
        next unless channel&.key?(:url)

        url_string = channel[:url]
        next if url_string.nil? || url_string.empty?

        begin
          Html2rss::Url.for_channel(url_string)
        rescue ArgumentError => error
          key(%i[channel url]).failure(error.message)
        end
      end
    end
  end
end
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -1,82 +1,133 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require '
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'yaml'
|
|
4
5
|
|
|
5
6
|
module Html2rss
|
|
6
7
|
##
|
|
7
|
-
# The
|
|
8
|
-
# provides
|
|
8
|
+
# The provided configuration is used to generate the RSS feed.
|
|
9
|
+
# This class provides methods to load and process configuration from a YAML file,
|
|
10
|
+
# supporting both single and multiple feed configurations.
|
|
11
|
+
#
|
|
12
|
+
# Configuration is validated during initialization.
|
|
9
13
|
class Config
|
|
10
|
-
|
|
14
|
+
class InvalidConfig < Html2rss::Error; end
|
|
15
|
+
extend ClassMethods
|
|
11
16
|
|
|
12
17
|
##
|
|
13
|
-
#
|
|
14
|
-
# were passed to Config.
|
|
15
|
-
class ParamsMissing < Html2rss::Error; end
|
|
16
|
-
|
|
17
|
-
##
|
|
18
|
-
# Thrown when the feed config does not contain a value at `:channel`.
|
|
19
|
-
class ChannelMissing < Html2rss::Error; end
|
|
20
|
-
|
|
21
|
-
def_delegator :@channel, :author
|
|
22
|
-
def_delegator :@channel, :ttl
|
|
23
|
-
def_delegator :@channel, :title
|
|
24
|
-
def_delegator :@channel, :language
|
|
25
|
-
def_delegator :@channel, :description
|
|
26
|
-
def_delegator :@channel, :url
|
|
27
|
-
def_delegator :@channel, :url, :link
|
|
28
|
-
def_delegator :@channel, :time_zone
|
|
29
|
-
def_delegator :@channel, :json?
|
|
30
|
-
def_delegator :@channel, :strategy
|
|
31
|
-
|
|
32
|
-
def_delegator :@selectors, :item_selector_names
|
|
33
|
-
def_delegator :@selectors, :selector?
|
|
34
|
-
def_delegator :@selectors, :category_selector_names
|
|
35
|
-
def_delegator :@selectors, :guid_selector_names
|
|
36
|
-
def_delegator :@selectors, :items_order
|
|
37
|
-
def_delegator :@selectors, :selector_string
|
|
38
|
-
|
|
39
|
-
##
|
|
40
|
-
# Initializes the Config object with feed configuration, global settings, and parameters.
|
|
18
|
+
# Initializes the configuration object.
|
|
41
19
|
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
# @param
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@
|
|
20
|
+
# Processes deprecated attributes, applies default values, and validates the configuration.
|
|
21
|
+
#
|
|
22
|
+
# @param config [Hash<Symbol, Object>] the configuration hash.
|
|
23
|
+
# @raise [InvalidConfig] if the configuration fails validation.
|
|
24
|
+
def initialize(config)
  # Capture the request controls from the raw input first, before defaults
  # are applied by the Preparer.
  @request_controls = RequestControls.from_config(config)

  validated = validated_config_for(Preparer.new.call(config))
  @config = validated.freeze

  # Re-derive the controls with the values that survived validation.
  effective_values = {
    strategy: validated[:strategy],
    max_redirects: validated.dig(:request, :max_redirects),
    max_requests: validated.dig(:request, :max_requests)
  }
  @request_controls = request_controls.with_effective_values(**effective_values)
end
|
|
53
36
|
|
|
37
|
+
# Request strategy as held by the request controls.
def strategy
  request_controls.strategy
end

# Redirect limit as held by the request controls.
def max_redirects
  request_controls.max_redirects
end

# Request limit as held by the request controls.
def max_requests
  request_controls.max_requests
end

# Value stored under :stylesheets in the validated config.
def stylesheets
  config[:stylesheets]
end
|
|
41
|
+
|
|
54
42
|
##
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# @return [Hash<Symbol, Object>] Merged attributes hash.
|
|
59
|
-
def selector_attributes_with_channel(name)
|
|
60
|
-
@selectors.selector(name).to_h.merge(channel: @channel)
|
|
43
|
+
# @return [Boolean] whether max_requests was explicitly configured by the caller
|
|
44
|
+
def explicit_max_requests? = request_controls.explicit?(:max_requests)
|
|
62
47
|
|
|
63
48
|
##
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def headers
|
|
68
|
-
|
|
49
|
+
# @return [Html2rss::RequestControls] request controls with provenance
|
|
50
|
+
attr_reader :request_controls
|
|
51
|
+
|
|
52
|
+
# Readers over the validated config hash.

def headers
  config[:headers]
end

def channel
  config[:channel]
end

# Channel URL, or nil when the channel has none.
def url
  config.dig(:channel, :url)
end

# Channel time zone, or nil when the channel has none.
def time_zone
  config.dig(:channel, :time_zone)
end

def request
  config[:request]
end

def selectors
  config[:selectors]
end

def auto_source
  config[:auto_source]
end
|
|
61
|
+
|
|
62
|
+
private
|
|
63
|
+
|
|
64
|
+
attr_reader :config
|
|
65
|
+
|
|
66
|
+
# Normalizes raw config input before validation.
|
|
67
|
+
class Preparer
  ##
  # Applies deprecation handling and default values to the raw config.
  # The input hash is never modified; all work happens on a shallow copy.
  #
  # @param config [Hash<Symbol, Object>] raw config input
  # @return [Hash<Symbol, Object>] config with defaults and deprecations applied
  def call(config)
    # Always copy: the deprecation handling below assigns top-level keys,
    # which would otherwise leak :strategy/:headers into the caller's hash
    # whenever that hash is not frozen.
    config = config.dup

    config = handle_deprecated_channel_attributes(config)
    config = apply_default_config(config)
    config = apply_default_selectors_config(config) if config[:selectors]
    config = apply_default_auto_source_config(config) if config[:auto_source]

    config
  end

  private

  # Moves `channel.strategy` / `channel.headers` to the top level (with a
  # deprecation warning) when no top-level value is set, then fills in the
  # default for any key that is still missing.
  def handle_deprecated_channel_attributes(config)
    { strategy: RequestService.default_strategy_name, headers: {} }.each_pair do |key, default_value|
      if !config[key] && (value = config.dig(:channel, key))
        Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
        config[key] = value
      end

      config[key] ||= default_value
    end

    config
  end

  def apply_default_config(config)
    deep_merge(Config.default_config, config)
  end

  def apply_default_selectors_config(config)
    deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
  end

  def apply_default_auto_source_config(config)
    deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
  end

  # Recursively merges hashes; on conflict the override wins unless both
  # sides are hashes, in which case they are merged in turn.
  def deep_merge(base_config, override_config)
    base_config.merge(override_config) do |_key, oldval, newval|
      oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
    end
  end
end
|
|
70
115
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
116
|
+
def validated_config_for(config)
  # Validator#call returns a result object, not the validator itself.
  result = Validator.new.call(config)

  if result.success?
    normalized_headers(result.to_h)
  else
    raise InvalidConfig, "Invalid configuration: #{result.errors.to_h}"
  end
end
|
|
78
123
|
|
|
79
|
-
|
|
80
|
-
|
|
124
|
+
def normalized_headers(validated_config)
  # Replaces :headers in place and hands back the same hash instance.
  validated_config.tap do |settings|
    language = settings.dig(:channel, :language)
    channel_url = settings.dig(:channel, :url)

    settings[:headers] = RequestHeaders.normalize(settings[:headers], channel_language: language, url: channel_url)
  end
end
|
|
81
132
|
end
|
|
82
133
|
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class HtmlExtractor
    # Finds the earliest timestamp among the `datetime` attributes inside an
    # article tag. Attribute values that cannot be parsed are ignored.
    class DateExtractor
      class << self
        # @param article_tag [#css] node queried with the `[datetime]` selector
        # @return [DateTime, nil] earliest parsed value, or nil when none parse
        def call(article_tag)
          article_tag.css('[datetime]').filter_map { |node| parse(node['datetime']) }.min
        end

        private

        # @return [DateTime, nil] nil when the raw value is not a parsable date
        def parse(raw)
          DateTime.parse(raw)
        rescue ArgumentError, TypeError
          nil
        end
      end
    end
  end
end
|