html2rss 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -6,6 +6,7 @@ module Html2rss
|
|
|
6
6
|
# Normalizes HTTP headers for outgoing requests.
|
|
7
7
|
# Ensures a browser-like baseline while respecting caller overrides.
|
|
8
8
|
class RequestHeaders
|
|
9
|
+
# Browser-like default `Accept` header value.
|
|
9
10
|
DEFAULT_ACCEPT = %w[
|
|
10
11
|
text/html
|
|
11
12
|
application/xhtml+xml
|
|
@@ -16,6 +17,7 @@ module Html2rss
|
|
|
16
17
|
*/*;q=0.8
|
|
17
18
|
].join(',')
|
|
18
19
|
|
|
20
|
+
# Browser-like default `User-Agent` header value.
|
|
19
21
|
DEFAULT_USER_AGENT = [
|
|
20
22
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
|
|
21
23
|
'AppleWebKit/537.36 (KHTML, like Gecko)',
|
|
@@ -23,6 +25,7 @@ module Html2rss
|
|
|
23
25
|
'Safari/537.36'
|
|
24
26
|
].join(' ')
|
|
25
27
|
|
|
28
|
+
# Baseline browser-like header set used for outbound requests.
|
|
26
29
|
DEFAULT_HEADERS = {
|
|
27
30
|
'Accept' => DEFAULT_ACCEPT,
|
|
28
31
|
'Cache-Control' => 'max-age=0',
|
|
@@ -37,7 +40,7 @@ module Html2rss
|
|
|
37
40
|
|
|
38
41
|
class << self
|
|
39
42
|
##
|
|
40
|
-
# @return [Hash
|
|
43
|
+
# @return [Hash{String => String}] the unmodified default header set
|
|
41
44
|
def browser_defaults
|
|
42
45
|
DEFAULT_HEADERS.dup
|
|
43
46
|
end
|
|
@@ -48,12 +51,15 @@ module Html2rss
|
|
|
48
51
|
# @param headers [Hash, nil] caller provided headers
|
|
49
52
|
# @param channel_language [String, nil] language defined on the channel
|
|
50
53
|
# @param url [String] request URL used to infer the Host header
|
|
51
|
-
# @return [Hash
|
|
54
|
+
# @return [Hash{String => String}] normalized HTTP headers
|
|
52
55
|
def normalize(headers, channel_language:, url:)
|
|
53
56
|
new(headers || {}, channel_language:, url:).to_h
|
|
54
57
|
end
|
|
55
58
|
end
|
|
56
59
|
|
|
60
|
+
# @param headers [Hash{String, Symbol => String}] caller-provided headers
|
|
61
|
+
# @param channel_language [String, nil] channel language hint for Accept-Language
|
|
62
|
+
# @param url [String, Html2rss::Url, nil] request URL used to infer Host
|
|
57
63
|
def initialize(headers, channel_language:, url:)
|
|
58
64
|
@headers = headers
|
|
59
65
|
@channel_language = channel_language
|
|
@@ -61,7 +67,7 @@ module Html2rss
|
|
|
61
67
|
end
|
|
62
68
|
|
|
63
69
|
##
|
|
64
|
-
# @return [Hash
|
|
70
|
+
# @return [Hash{String => String}] normalized HTTP headers
|
|
65
71
|
def to_h
|
|
66
72
|
defaults = DEFAULT_HEADERS.dup
|
|
67
73
|
normalized = normalize_custom_headers(headers)
|
|
@@ -7,12 +7,13 @@ module Html2rss
|
|
|
7
7
|
module Schema
|
|
8
8
|
module_function
|
|
9
9
|
|
|
10
|
+
# Canonical filename for the exported config JSON schema artifact.
|
|
10
11
|
SCHEMA_FILENAME = 'html2rss-config.schema.json'
|
|
11
12
|
|
|
12
13
|
##
|
|
13
14
|
# Returns the exported configuration JSON Schema.
|
|
14
15
|
#
|
|
15
|
-
# @return [Hash
|
|
16
|
+
# @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
|
|
16
17
|
def json_schema
|
|
17
18
|
load_json_schema_extension!
|
|
18
19
|
Builder.call
|
|
@@ -38,6 +39,7 @@ module Html2rss
|
|
|
38
39
|
File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__)
|
|
39
40
|
end
|
|
40
41
|
|
|
42
|
+
# @return [void]
|
|
41
43
|
def load_json_schema_extension!
|
|
42
44
|
require 'dry/schema/extensions/json_schema'
|
|
43
45
|
Dry::Schema.load_extensions(:json_schema)
|
|
@@ -48,11 +50,13 @@ module Html2rss
|
|
|
48
50
|
# client-facing overlays.
|
|
49
51
|
class Builder
|
|
50
52
|
class << self
|
|
53
|
+
# @return [Hash{String => Object}] fully assembled JSON schema hash
|
|
51
54
|
def call
|
|
52
55
|
new.call
|
|
53
56
|
end
|
|
54
57
|
end
|
|
55
58
|
|
|
59
|
+
# @return [Hash{String => Object}] fully assembled JSON schema hash
|
|
56
60
|
def call
|
|
57
61
|
schema = validator_schema
|
|
58
62
|
apply_top_level(schema)
|
|
@@ -76,6 +80,7 @@ module Html2rss
|
|
|
76
80
|
|
|
77
81
|
def assign_properties(properties)
|
|
78
82
|
properties.merge!(
|
|
83
|
+
strategy: Components.strategy,
|
|
79
84
|
headers: Components.headers,
|
|
80
85
|
stylesheets: Components.stylesheets,
|
|
81
86
|
auto_source: Components.auto_source,
|
|
@@ -90,6 +95,15 @@ module Html2rss
|
|
|
90
95
|
module Components
|
|
91
96
|
module_function
|
|
92
97
|
|
|
98
|
+
# @return [Hash{Symbol => Object}] schema fragment for strategy selection
|
|
99
|
+
def strategy
|
|
100
|
+
{
|
|
101
|
+
type: 'string',
|
|
102
|
+
not: { type: 'null' }
|
|
103
|
+
}
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# @return [Hash{Symbol => Object}] schema fragment for headers
|
|
93
107
|
def headers
|
|
94
108
|
{
|
|
95
109
|
type: 'object',
|
|
@@ -98,6 +112,7 @@ module Html2rss
|
|
|
98
112
|
}
|
|
99
113
|
end
|
|
100
114
|
|
|
115
|
+
# @return [Hash{Symbol => Object}] schema fragment for stylesheet definitions
|
|
101
116
|
def stylesheets
|
|
102
117
|
{
|
|
103
118
|
type: 'array',
|
|
@@ -106,12 +121,14 @@ module Html2rss
|
|
|
106
121
|
}
|
|
107
122
|
end
|
|
108
123
|
|
|
124
|
+
# @return [Hash{Symbol => Object}] schema fragment for auto_source configuration
|
|
109
125
|
def auto_source
|
|
110
126
|
schema = Html2rss::AutoSource::Config.json_schema(loose: true)
|
|
111
127
|
schema[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
|
|
112
128
|
schema
|
|
113
129
|
end
|
|
114
130
|
|
|
131
|
+
# @return [Hash{Symbol => Object}] schema fragment for selectors configuration
|
|
115
132
|
def selectors
|
|
116
133
|
Selectors.schema
|
|
117
134
|
end
|
|
@@ -122,8 +139,10 @@ module Html2rss
|
|
|
122
139
|
module Selectors
|
|
123
140
|
module_function
|
|
124
141
|
|
|
142
|
+
# Pattern used for dynamic selector keys excluding reserved selector names.
|
|
125
143
|
RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'
|
|
126
144
|
|
|
145
|
+
# @return [Hash{Symbol => Object}] schema fragment for selectors root object
|
|
127
146
|
def schema
|
|
128
147
|
{
|
|
129
148
|
type: 'object',
|
|
@@ -135,6 +154,7 @@ module Html2rss
|
|
|
135
154
|
end
|
|
136
155
|
|
|
137
156
|
# rubocop:disable Layout/LineLength
|
|
157
|
+
# @return [Hash{Symbol => Object}] schema map for reserved selector properties
|
|
138
158
|
def selector_properties
|
|
139
159
|
{
|
|
140
160
|
items: items_schema,
|
|
@@ -145,22 +165,26 @@ module Html2rss
|
|
|
145
165
|
end
|
|
146
166
|
# rubocop:enable Layout/LineLength
|
|
147
167
|
|
|
168
|
+
# @return [Hash{String => Object}] schema map for dynamic selector keys
|
|
148
169
|
def pattern_properties
|
|
149
170
|
{ RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
|
|
150
171
|
end
|
|
151
172
|
|
|
173
|
+
# @return [Hash{Symbol => Object}] schema fragment for dynamic selector entries
|
|
152
174
|
def dynamic_selector_schema
|
|
153
175
|
Html2rss::Selectors::Config::Selector.new.schema.json_schema(loose: true).merge(
|
|
154
176
|
description: 'Dynamic selector definition keyed by attribute name.'
|
|
155
177
|
)
|
|
156
178
|
end
|
|
157
179
|
|
|
180
|
+
# @return [Hash{Symbol => Object}] schema fragment for `items` selector configuration
|
|
158
181
|
def items_schema
|
|
159
182
|
Html2rss::Selectors::Config::Items.new.schema.json_schema(loose: true).merge(
|
|
160
183
|
description: 'Defines the items selector and optional enhancement settings.'
|
|
161
184
|
)
|
|
162
185
|
end
|
|
163
186
|
|
|
187
|
+
# @return [Hash{Symbol => Object}] schema fragment for `enclosure` selector configuration
|
|
164
188
|
def enclosure_schema
|
|
165
189
|
Html2rss::Selectors::Config::Enclosure.new.schema.json_schema(loose: true).merge(
|
|
166
190
|
description: 'Describes enclosure extraction settings.'
|
|
@@ -170,6 +194,8 @@ module Html2rss
|
|
|
170
194
|
# JSON Schema can enforce non-empty reference arrays, while runtime
|
|
171
195
|
# validation remains authoritative for checking that each entry points
|
|
172
196
|
# to an existing sibling selector key.
|
|
197
|
+
# @param description [String] human-readable description for the reference field
|
|
198
|
+
# @return [Hash{Symbol => Object}] JSON schema fragment for selector references
|
|
173
199
|
def reference_array(description)
|
|
174
200
|
{
|
|
175
201
|
type: 'array',
|
|
@@ -188,17 +214,23 @@ module Html2rss
|
|
|
188
214
|
module DeepStringifier
|
|
189
215
|
module_function
|
|
190
216
|
|
|
217
|
+
# @param object [Hash, Array, Object] nested data to normalize
|
|
218
|
+
# @return [Hash, Array, Object] deep copy with stringified hash keys
|
|
191
219
|
def call(object)
|
|
192
220
|
case object
|
|
193
221
|
when Hash
|
|
194
222
|
stringify_hash(object)
|
|
195
223
|
when Array
|
|
196
224
|
object.map { |value| call(value) }
|
|
225
|
+
when Symbol
|
|
226
|
+
object.to_s
|
|
197
227
|
else
|
|
198
228
|
object
|
|
199
229
|
end
|
|
200
230
|
end
|
|
201
231
|
|
|
232
|
+
# @param object [Hash{Object => Object}] hash whose keys should become strings
|
|
233
|
+
# @return [Hash{String => Object}] hash with recursively normalized values
|
|
202
234
|
def stringify_hash(object)
|
|
203
235
|
object.to_h { |key, value| [key.to_s, call(value)] }
|
|
204
236
|
end
|
|
@@ -6,11 +6,17 @@ module Html2rss
|
|
|
6
6
|
class Config
|
|
7
7
|
# Validates the configuration hash using Dry::Validation.
|
|
8
8
|
# The configuration options adhere to the documented schema in README.md.
|
|
9
|
-
class Validator < Dry::Validation::Contract
|
|
9
|
+
class Validator < Dry::Validation::Contract # rubocop:disable Metrics/ClassLength
|
|
10
|
+
# URI format used for channel URL validation.
|
|
10
11
|
URI_REGEXP = Url::URI_REGEXP
|
|
12
|
+
# Allowed stylesheet MIME types.
|
|
11
13
|
STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
|
|
14
|
+
# Optional language/region format (`en` or `en-US`).
|
|
12
15
|
LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/
|
|
16
|
+
# Baseline strategy enum exported in static schema artifacts.
|
|
17
|
+
BASE_STRATEGY_OPTIONS = ([:auto] + Html2rss::RequestService.strategy_names.map(&:to_sym)).uniq.freeze
|
|
13
18
|
|
|
19
|
+
# Contract for the top-level `channel` section.
|
|
14
20
|
ChannelConfig = Dry::Schema.Params do
|
|
15
21
|
required(:url).filled(:string, format?: URI_REGEXP)
|
|
16
22
|
optional(:title).maybe(:string)
|
|
@@ -20,41 +26,66 @@ module Html2rss
|
|
|
20
26
|
optional(:time_zone).maybe(:string)
|
|
21
27
|
end
|
|
22
28
|
|
|
29
|
+
# Contract for a stylesheet entry in `stylesheets`.
|
|
23
30
|
StylesheetConfig = Dry::Schema.Params do
|
|
24
31
|
required(:href).filled(:string)
|
|
25
32
|
required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
|
|
26
33
|
optional(:media).maybe(:string)
|
|
27
34
|
end
|
|
28
35
|
|
|
36
|
+
# Contract for Browserless click-preload options.
|
|
29
37
|
BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
|
|
30
38
|
required(:selector).filled(:string)
|
|
31
39
|
optional(:max_clicks).filled(:integer, gt?: 0)
|
|
32
40
|
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
33
41
|
end
|
|
34
42
|
|
|
43
|
+
# Contract for Browserless scroll-preload options.
|
|
35
44
|
BrowserlessPreloadScrollConfig = Dry::Schema.Params do
|
|
36
45
|
optional(:iterations).filled(:integer, gt?: 0)
|
|
37
46
|
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
38
47
|
end
|
|
39
48
|
|
|
49
|
+
# Contract for Browserless preload orchestration options.
|
|
40
50
|
BrowserlessPreloadConfig = Dry::Schema.Params do
|
|
41
51
|
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
42
52
|
optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
|
|
43
53
|
optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
|
|
44
54
|
end
|
|
45
55
|
|
|
56
|
+
# Contract for Browserless-specific request options.
|
|
46
57
|
BrowserlessRequestConfig = Dry::Schema.Params do
|
|
47
58
|
optional(:preload).hash(BrowserlessPreloadConfig)
|
|
48
59
|
end
|
|
49
60
|
|
|
61
|
+
# Contract for Botasaurus-specific request options.
|
|
62
|
+
BotasaurusRequestConfig = Dry::Schema.Params do
|
|
63
|
+
config.validate_keys = true
|
|
64
|
+
|
|
65
|
+
optional(:navigation_mode).filled(:string, included_in?: %w[auto get google_get google_get_bypass])
|
|
66
|
+
optional(:max_retries).filled(:integer, gteq?: 0, lteq?: 3)
|
|
67
|
+
optional(:wait_for_selector).maybe(:string)
|
|
68
|
+
optional(:wait_timeout_seconds).filled(:integer, gt?: 0)
|
|
69
|
+
optional(:block_images).filled(:bool)
|
|
70
|
+
optional(:block_images_and_css).filled(:bool)
|
|
71
|
+
optional(:wait_for_complete_page_load).filled(:bool)
|
|
72
|
+
optional(:headless).filled(:bool)
|
|
73
|
+
optional(:proxy).filled(:string)
|
|
74
|
+
optional(:user_agent).filled(:string)
|
|
75
|
+
optional(:window_size).value(:array, min_size?: 2, max_size?: 2).each(:integer, gt?: 0)
|
|
76
|
+
optional(:lang).filled(:string)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Contract for the top-level `request` section.
|
|
50
80
|
RequestConfig = Dry::Schema.Params do
|
|
51
81
|
optional(:max_redirects).filled(:integer, gteq?: 0)
|
|
52
82
|
optional(:max_requests).filled(:integer, gt?: 0)
|
|
53
83
|
optional(:browserless).hash(BrowserlessRequestConfig)
|
|
84
|
+
optional(:botasaurus).hash(BotasaurusRequestConfig)
|
|
54
85
|
end
|
|
55
86
|
|
|
56
87
|
params do
|
|
57
|
-
|
|
88
|
+
optional(:strategy).filled(:symbol)
|
|
58
89
|
required(:channel).hash(ChannelConfig)
|
|
59
90
|
optional(:headers).hash
|
|
60
91
|
optional(:stylesheets).array(StylesheetConfig)
|
|
@@ -76,6 +107,13 @@ module Html2rss
|
|
|
76
107
|
base.failure(value) if value
|
|
77
108
|
end
|
|
78
109
|
|
|
110
|
+
rule(:strategy) do
|
|
111
|
+
next if value.nil?
|
|
112
|
+
next if value == :auto || Html2rss::RequestService.strategy_registered?(value)
|
|
113
|
+
|
|
114
|
+
key.failure("must be one of: #{BASE_STRATEGY_OPTIONS.join(', ')}")
|
|
115
|
+
end
|
|
116
|
+
|
|
79
117
|
# Ensure at least one of :selectors or :auto_source is present.
|
|
80
118
|
rule(:selectors, :auto_source) do
|
|
81
119
|
unless values.key?(:selectors) || values.key?(:auto_source)
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -11,6 +11,7 @@ module Html2rss
|
|
|
11
11
|
#
|
|
12
12
|
# Configuration is validated during initialization.
|
|
13
13
|
class Config
|
|
14
|
+
# Raised when a configuration hash fails runtime validation.
|
|
14
15
|
class InvalidConfig < Html2rss::Error; end
|
|
15
16
|
extend ClassMethods
|
|
16
17
|
|
|
@@ -19,7 +20,7 @@ module Html2rss
|
|
|
19
20
|
#
|
|
20
21
|
# Processes deprecated attributes, applies default values, and validates the configuration.
|
|
21
22
|
#
|
|
22
|
-
# @param config [Hash
|
|
23
|
+
# @param config [Hash{Symbol => Object}] the configuration hash.
|
|
23
24
|
# @raise [InvalidConfig] if the configuration fails validation.
|
|
24
25
|
def initialize(config)
|
|
25
26
|
@request_controls = RequestControls.from_config(config)
|
|
@@ -34,9 +35,13 @@ module Html2rss
|
|
|
34
35
|
)
|
|
35
36
|
end
|
|
36
37
|
|
|
38
|
+
# @return [Symbol, nil] selected request strategy
|
|
37
39
|
def strategy = request_controls.strategy
|
|
40
|
+
# @return [Integer, nil] configured redirect budget
|
|
38
41
|
def max_redirects = request_controls.max_redirects
|
|
42
|
+
# @return [Integer, nil] configured request budget
|
|
39
43
|
def max_requests = request_controls.max_requests
|
|
44
|
+
# @return [Array<Hash>] stylesheet definitions
|
|
40
45
|
def stylesheets = config[:stylesheets]
|
|
41
46
|
|
|
42
47
|
##
|
|
@@ -49,14 +54,21 @@ module Html2rss
|
|
|
49
54
|
# @return [Html2rss::RequestControls] request controls with provenance
|
|
50
55
|
attr_reader :request_controls
|
|
51
56
|
|
|
57
|
+
# @return [Hash{String => String}] normalized HTTP headers
|
|
52
58
|
def headers = config[:headers]
|
|
59
|
+
# @return [Hash{Symbol => Object}] channel configuration
|
|
53
60
|
def channel = config[:channel]
|
|
61
|
+
# @return [String] source channel URL
|
|
54
62
|
def url = config.dig(:channel, :url)
|
|
63
|
+
# @return [String, nil] configured channel time zone
|
|
55
64
|
def time_zone = config.dig(:channel, :time_zone)
|
|
56
65
|
|
|
66
|
+
# @return [Hash{Symbol => Object}] request envelope configuration
|
|
57
67
|
def request = config[:request]
|
|
58
68
|
|
|
69
|
+
# @return [Hash{Symbol => Object}, nil] selectors configuration
|
|
59
70
|
def selectors = config[:selectors]
|
|
71
|
+
# @return [Hash{Symbol => Object}, nil] auto-source configuration
|
|
60
72
|
def auto_source = config[:auto_source]
|
|
61
73
|
|
|
62
74
|
private
|
|
@@ -66,8 +78,8 @@ module Html2rss
|
|
|
66
78
|
# Normalizes raw config input before validation.
|
|
67
79
|
class Preparer
|
|
68
80
|
##
|
|
69
|
-
# @param config [Hash
|
|
70
|
-
# @return [Hash
|
|
81
|
+
# @param config [Hash{Symbol => Object}] raw config input
|
|
82
|
+
# @return [Hash{Symbol => Object}] config with defaults and deprecations applied
|
|
71
83
|
def call(config)
|
|
72
84
|
config = config.dup if config.frozen?
|
|
73
85
|
|
|
@@ -82,7 +94,7 @@ module Html2rss
|
|
|
82
94
|
private
|
|
83
95
|
|
|
84
96
|
def handle_deprecated_channel_attributes(config)
|
|
85
|
-
{ strategy:
|
|
97
|
+
{ strategy: Config.default_strategy_name, headers: {} }.each_pair do |key, default_value|
|
|
86
98
|
if !config[key] && (value = config.dig(:channel, key))
|
|
87
99
|
Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
|
|
88
100
|
config[key] = value
|
|
@@ -95,21 +107,15 @@ module Html2rss
|
|
|
95
107
|
end
|
|
96
108
|
|
|
97
109
|
def apply_default_config(config)
|
|
98
|
-
deep_merge(Config.default_config, config)
|
|
110
|
+
HashUtil.deep_merge(Config.default_config, config)
|
|
99
111
|
end
|
|
100
112
|
|
|
101
113
|
def apply_default_selectors_config(config)
|
|
102
|
-
deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
|
|
114
|
+
HashUtil.deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
|
|
103
115
|
end
|
|
104
116
|
|
|
105
117
|
def apply_default_auto_source_config(config)
|
|
106
|
-
deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
|
|
107
|
-
end
|
|
108
|
-
|
|
109
|
-
def deep_merge(base_config, override_config)
|
|
110
|
-
base_config.merge(override_config) do |_key, oldval, newval|
|
|
111
|
-
oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
|
|
112
|
-
end
|
|
118
|
+
HashUtil.deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
|
|
113
119
|
end
|
|
114
120
|
end
|
|
115
121
|
|
data/lib/html2rss/error.rb
CHANGED
|
@@ -3,4 +3,29 @@
|
|
|
3
3
|
module Html2rss
|
|
4
4
|
# The Html2rss::Error base class.
|
|
5
5
|
class Error < StandardError; end
|
|
6
|
+
|
|
7
|
+
# Raised when auto fallback exhausts all concrete tiers and extractors find no feed items.
|
|
8
|
+
class NoFeedItemsExtracted < Error
|
|
9
|
+
##
|
|
10
|
+
# @param attempts [Array<Hash{Symbol => Object}>] tier attempt diagnostics
|
|
11
|
+
def initialize(attempts:)
|
|
12
|
+
@attempts = attempts
|
|
13
|
+
super(build_message)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# @return [Array<Hash{Symbol => Object}>] tier attempt diagnostics
|
|
17
|
+
attr_reader :attempts
|
|
18
|
+
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def build_message
|
|
22
|
+
summaries = attempts.map do |attempt|
|
|
23
|
+
details = attempt[:items_count].nil? ? "#{attempt[:error_class]} error" : "#{attempt[:items_count]} items"
|
|
24
|
+
"#{attempt[:strategy]} (#{details})"
|
|
25
|
+
end.join(', ')
|
|
26
|
+
|
|
27
|
+
"No feed items extracted after auto fallback across strategies: #{summaries}. " \
|
|
28
|
+
'Try a more specific listing URL or provide explicit selectors.'
|
|
29
|
+
end
|
|
30
|
+
end
|
|
6
31
|
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
##
|
|
4
|
+
# The Html2rss namespace.
|
|
5
|
+
module Html2rss
|
|
6
|
+
##
|
|
7
|
+
# Coordinates feed generation pipeline stages.
|
|
8
|
+
class FeedPipeline
|
|
9
|
+
# Retries feed extraction across concrete request strategies for :auto mode.
|
|
10
|
+
class AutoFallback
|
|
11
|
+
# Ordered list of concrete request strategies attempted by auto mode.
|
|
12
|
+
CHAIN = %i[faraday botasaurus browserless].freeze
|
|
13
|
+
|
|
14
|
+
# Error classes that should abort auto fallback immediately.
|
|
15
|
+
NON_FALLBACK_ERRORS = [
|
|
16
|
+
RequestService::UnknownStrategy,
|
|
17
|
+
RequestService::InvalidUrl,
|
|
18
|
+
RequestService::UnsupportedUrlScheme,
|
|
19
|
+
RequestService::UnsupportedResponseContentType,
|
|
20
|
+
RequestService::RequestBudgetExceeded,
|
|
21
|
+
RequestService::PrivateNetworkDenied,
|
|
22
|
+
RequestService::CrossOriginFollowUpDenied,
|
|
23
|
+
RequestService::ResponseTooLarge,
|
|
24
|
+
RequestService::BrowserlessConfigurationError
|
|
25
|
+
].freeze
|
|
26
|
+
|
|
27
|
+
##
|
|
28
|
+
# @param strategies [Array<Symbol>] ordered concrete strategies for fallback
|
|
29
|
+
# @param budget [RequestService::Budget] shared request budget across retries
|
|
30
|
+
# @param session_for [Proc] request session factory proc
|
|
31
|
+
# @param articles_for [Proc] article extraction proc
|
|
32
|
+
# @return [void]
|
|
33
|
+
def initialize(strategies:, budget:, session_for:, articles_for:)
|
|
34
|
+
@strategies = strategies
|
|
35
|
+
@budget = budget
|
|
36
|
+
@session_for = session_for
|
|
37
|
+
@articles_for = articles_for
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
##
|
|
41
|
+
# @return [Hash{Symbol => Object}] pipeline state containing :response and :articles
|
|
42
|
+
def call
|
|
43
|
+
state, attempts = run_attempts
|
|
44
|
+
return state if state
|
|
45
|
+
|
|
46
|
+
finalize_failure(attempts:)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
private
|
|
50
|
+
|
|
51
|
+
attr_reader :strategies, :budget, :session_for, :articles_for
|
|
52
|
+
|
|
53
|
+
def run_attempts
|
|
54
|
+
state = { result: nil, attempts: [] }
|
|
55
|
+
strategies.each_with_index do |strategy, index|
|
|
56
|
+
run_attempt_for(strategy:, next_strategy: strategies[index + 1], state:)
|
|
57
|
+
break if state.fetch(:result)
|
|
58
|
+
end
|
|
59
|
+
[state.fetch(:result), state.fetch(:attempts)]
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def run_attempt_for(strategy:, next_strategy:, state:)
|
|
63
|
+
result, attempts = attempt(
|
|
64
|
+
strategy:,
|
|
65
|
+
next_strategy:,
|
|
66
|
+
state: { attempts: state.fetch(:attempts) }
|
|
67
|
+
)
|
|
68
|
+
state[:result] = result
|
|
69
|
+
state[:attempts] = attempts
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def attempt(strategy:, next_strategy:, state:)
|
|
73
|
+
request_session = session_for.call(strategy:, budget:)
|
|
74
|
+
response, state = fetch_response(
|
|
75
|
+
request_session:,
|
|
76
|
+
strategy:,
|
|
77
|
+
next_strategy:,
|
|
78
|
+
state:
|
|
79
|
+
)
|
|
80
|
+
return [nil, state.fetch(:attempts)] unless response
|
|
81
|
+
|
|
82
|
+
process_response(response:, strategy:, next_strategy:, request_session:, state:)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def fetch_response(request_session:, strategy:, next_strategy:, state:)
|
|
86
|
+
[request_session.fetch_initial_response, state]
|
|
87
|
+
rescue *NON_FALLBACK_ERRORS
|
|
88
|
+
raise
|
|
89
|
+
rescue StandardError => error
|
|
90
|
+
state[:attempts] << { strategy:, items_count: nil, error_class: error.class.name }
|
|
91
|
+
log_warn_fallback_error(strategy:, next_strategy:, error:) if next_strategy
|
|
92
|
+
Log.debug("#{self.class}: strategy=#{strategy} error=#{error.class}: #{error.message}")
|
|
93
|
+
[nil, state]
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def process_response(response:, strategy:, next_strategy:, request_session:, state:)
|
|
97
|
+
articles = articles_for.call(response:, request_session:)
|
|
98
|
+
items_count = articles.size
|
|
99
|
+
state[:attempts] << { strategy:, items_count:, error_class: nil }
|
|
100
|
+
Log.debug("#{self.class}: strategy=#{strategy} items=#{items_count}")
|
|
101
|
+
return success_state(response:, strategy:, articles:, state:) if items_count.positive?
|
|
102
|
+
|
|
103
|
+
log_info_fallback_zero_items(strategy:, next_strategy:) if next_strategy
|
|
104
|
+
[nil, state.fetch(:attempts)]
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
def success_state(response:, strategy:, articles:, state:)
|
|
108
|
+
if state.fetch(:attempts).size > 1
|
|
109
|
+
Log.info("#{self.class}: auto selected strategy=#{strategy} after attempts=#{state.fetch(:attempts).size}")
|
|
110
|
+
end
|
|
111
|
+
[{ response:, articles: }, state.fetch(:attempts)]
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def finalize_failure(attempts:)
|
|
115
|
+
raise NoFeedItemsExtracted.new(attempts:)
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def log_warn_fallback_error(strategy:, next_strategy:, error:)
|
|
119
|
+
Log.warn("#{self.class}: auto fallback #{strategy} -> #{next_strategy} after error=#{error.class}")
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def log_info_fallback_zero_items(strategy:, next_strategy:)
|
|
123
|
+
Log.info("#{self.class}: auto fallback #{strategy} -> #{next_strategy} after zero extracted items")
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
127
|
+
end
|