html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class Config
|
|
5
|
+
##
|
|
6
|
+
# Builds the exported configuration JSON Schema from the runtime validators.
|
|
7
|
+
module Schema
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
# Canonical filename for the exported config JSON schema artifact.
|
|
11
|
+
SCHEMA_FILENAME = 'html2rss-config.schema.json'
|
|
12
|
+
|
|
13
|
+
##
# Builds the exported configuration JSON Schema on demand.
#
# The dry-schema JSON Schema extension is loaded first; assembly is then
# delegated to Builder.
#
# @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
def json_schema
  load_json_schema_extension!

  Builder.call
end
|
|
21
|
+
|
|
22
|
+
##
# Locates the packaged JSON schema file used by downstream tools.
#
# Starting at this file's directory, every ancestor directory is probed for
# `schema/<SCHEMA_FILENAME>` and the first existing file wins. Once the
# filesystem root has been probed without a match, the location three levels
# above this directory is returned without checking that it exists.
#
# @return [String] absolute path to the packaged JSON schema file
def path
  directory = File.expand_path(__dir__)
  previous = nil

  # `File.dirname` of the root returns the root itself, which ends the walk.
  until directory == previous
    schema_file = File.join(directory, 'schema', SCHEMA_FILENAME)
    return schema_file if File.exist?(schema_file)

    previous = directory
    directory = File.dirname(directory)
  end

  File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__)
end
|
|
41
|
+
|
|
42
|
+
# Requires dry-schema's JSON Schema extension at call time and activates it
# through `Dry::Schema.load_extensions`.
#
# @return [void]
def load_json_schema_extension!
  require 'dry/schema/extensions/json_schema'

  Dry::Schema.load_extensions(:json_schema)
end
|
|
47
|
+
|
|
48
|
+
##
# Assembles the exported schema: starts from the runtime validator's JSON
# schema, adds top-level keywords, overlays the client-facing property
# fragments and finally stringifies every key.
class Builder
  # Builds the schema with a fresh instance.
  #
  # @return [Hash{String => Object}] fully assembled JSON schema hash
  def self.call = new.call

  # @return [Hash{String => Object}] fully assembled JSON schema hash
  def call
    validator_schema.then do |document|
      apply_top_level(document)
      assign_properties(document.fetch(:properties))
      DeepStringifier.call(document)
    end
  end

  private

  # @return [Hash] loose JSON schema generated from the runtime validator
  def validator_schema
    Html2rss::Config::Validator.new.schema.json_schema(loose: true)
  end

  # Sets the schema dialect and requires `selectors` or `auto_source`.
  #
  # @param schema [Hash] schema hash, modified in place
  def apply_top_level(schema)
    schema['$schema'] = 'https://json-schema.org/draft/2020-12/schema'

    alternatives = [
      { 'required' => ['selectors'] },
      { 'required' => ['auto_source'] }
    ]
    schema[:anyOf] = alternatives
  end

  # Overlays the hand-written fragments and drops the internal
  # `dynamic_params_error` property from the exported schema.
  #
  # @param properties [Hash] top-level `properties` hash, modified in place
  def assign_properties(properties)
    overlays = {
      strategy: Components.strategy,
      headers: Components.headers,
      stylesheets: Components.stylesheets,
      auto_source: Components.auto_source,
      selectors: Components.selectors
    }

    properties.merge!(overlays)
    properties.delete(:dynamic_params_error)
  end
end
|
|
92
|
+
|
|
93
|
+
##
# Schema fragments used for the top-level configuration properties.
module Components
  module_function

  # @return [Hash{Symbol => Object}] schema fragment for strategy selection
  def strategy
    { type: 'string', not: { type: 'null' } }
  end

  # @return [Hash{Symbol => Object}] schema fragment for headers
  def headers
    header_value = { type: 'string' }

    {
      type: 'object',
      description: 'HTTP headers applied to every request.',
      additionalProperties: header_value
    }
  end

  # @return [Hash{Symbol => Object}] schema fragment for stylesheet definitions
  def stylesheets
    stylesheet = Html2rss::Config::Validator::StylesheetConfig.json_schema(loose: true)

    {
      type: 'array',
      description: 'Collection of stylesheets to attach to the RSS feed.',
      items: stylesheet
    }
  end

  # Generated auto_source schema with the stringified default config attached
  # under `:default`.
  #
  # @return [Hash{Symbol => Object}] schema fragment for auto_source configuration
  def auto_source
    Html2rss::AutoSource::Config.json_schema(loose: true).tap do |fragment|
      fragment[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
    end
  end

  # @return [Hash{Symbol => Object}] schema fragment for selectors configuration
  def selectors = Selectors.schema
end
|
|
136
|
+
|
|
137
|
+
##
# Schema fragments describing the `selectors` section of the configuration.
module Selectors
  module_function

  # Matches every selector key except the reserved names
  # (`items`, `enclosure`, `guid`, `categories`).
  RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'

  # @return [Hash{Symbol => Object}] schema fragment for selectors root object
  def schema
    {
      type: 'object',
      description: 'Selectors used to extract article attributes.',
      properties: selector_properties,
      patternProperties: pattern_properties,
      additionalProperties: true
    }
  end

  # Long descriptions are split across adjacent literals; the resulting
  # strings are unchanged.
  #
  # @return [Hash{Symbol => Object}] schema map for reserved selector properties
  def selector_properties
    {
      items: items_schema,
      enclosure: enclosure_schema,
      guid: reference_array(
        'List of selector keys used to build the GUID. ' \
        'Each entry must reference a sibling selector key; runtime validation enforces those references.'
      ),
      categories: reference_array(
        'List of selector keys whose values will be used as categories. ' \
        'Each entry must reference a sibling selector key; runtime validation enforces those references.'
      )
    }
  end

  # @return [Hash{String => Object}] schema map for dynamic selector keys
  def pattern_properties
    { RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
  end

  # @return [Hash{Symbol => Object}] schema fragment for dynamic selector entries
  def dynamic_selector_schema
    generated = Html2rss::Selectors::Config::Selector.new.schema.json_schema(loose: true)
    generated.merge(description: 'Dynamic selector definition keyed by attribute name.')
  end

  # @return [Hash{Symbol => Object}] schema fragment for `items` selector configuration
  def items_schema
    generated = Html2rss::Selectors::Config::Items.new.schema.json_schema(loose: true)
    generated.merge(description: 'Defines the items selector and optional enhancement settings.')
  end

  # @return [Hash{Symbol => Object}] schema fragment for `enclosure` selector configuration
  def enclosure_schema
    generated = Html2rss::Selectors::Config::Enclosure.new.schema.json_schema(loose: true)
    generated.merge(description: 'Describes enclosure extraction settings.')
  end

  # Describes a non-empty array of selector-key names. The schema only
  # enforces "at least one string"; that each entry names an existing
  # sibling selector is left to runtime validation.
  #
  # @param description [String] human-readable description for the reference field
  # @return [Hash{Symbol => Object}] JSON schema fragment for selector references
  def reference_array(description)
    entry = {
      type: 'string',
      description: 'Selector key defined elsewhere in this object.'
    }

    { type: 'array', description: description, minItems: 1, items: entry }
  end
end
|
|
211
|
+
|
|
212
|
+
##
# Recursively converts hash keys and Symbol values to strings so the schema
# serializes cleanly.
module DeepStringifier
  module_function

  # Hashes and arrays are rebuilt recursively, Symbols become Strings, and
  # every other object is returned as-is.
  #
  # @param object [Hash, Array, Object] nested data to normalize
  # @return [Hash, Array, Object] normalized copy with stringified hash keys
  def call(object)
    return stringify_hash(object) if object.is_a?(Hash)
    return object.map { |element| call(element) } if object.is_a?(Array)

    object.is_a?(Symbol) ? object.to_s : object
  end

  # @param object [Hash{Object => Object}] hash whose keys should become strings
  # @return [Hash{String => Object}] hash with recursively normalized values
  def stringify_hash(object)
    object.each_with_object({}) do |(name, nested), stringified|
      stringified[name.to_s] = call(nested)
    end
  end
end
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
end
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'dry-validation'
|
|
4
|
+
|
|
5
|
+
module Html2rss
|
|
6
|
+
class Config
|
|
7
|
+
# Validates the configuration hash using Dry::Validation.
|
|
8
|
+
# The configuration options adhere to the documented schema in README.md.
|
|
9
|
+
class Validator < Dry::Validation::Contract # rubocop:disable Metrics/ClassLength
|
|
10
|
+
# URI format used for channel URL validation.
|
|
11
|
+
URI_REGEXP = Url::URI_REGEXP
|
|
12
|
+
# Allowed stylesheet MIME types.
|
|
13
|
+
STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
|
|
14
|
+
# Optional language/region format (`en` or `en-US`).
|
|
15
|
+
LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/
|
|
16
|
+
# Baseline strategy enum exported in static schema artifacts.
|
|
17
|
+
BASE_STRATEGY_OPTIONS = ([:auto] + Html2rss::RequestService.strategy_names.map(&:to_sym)).uniq.freeze
|
|
18
|
+
|
|
19
|
+
# Contract for the top-level `channel` section.
|
|
20
|
+
ChannelConfig = Dry::Schema.Params do
|
|
21
|
+
required(:url).filled(:string, format?: URI_REGEXP)
|
|
22
|
+
optional(:title).maybe(:string)
|
|
23
|
+
optional(:description).maybe(:string)
|
|
24
|
+
optional(:language).maybe(:string, format?: LANGUAGE_FORMAT_REGEX)
|
|
25
|
+
optional(:ttl).maybe(:integer, gt?: 0)
|
|
26
|
+
optional(:time_zone).maybe(:string)
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Contract for a stylesheet entry in `stylesheets`.
|
|
30
|
+
StylesheetConfig = Dry::Schema.Params do
|
|
31
|
+
required(:href).filled(:string)
|
|
32
|
+
required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
|
|
33
|
+
optional(:media).maybe(:string)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# Contract for Browserless click-preload options.
|
|
37
|
+
BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
|
|
38
|
+
required(:selector).filled(:string)
|
|
39
|
+
optional(:max_clicks).filled(:integer, gt?: 0)
|
|
40
|
+
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
# Contract for Browserless scroll-preload options.
|
|
44
|
+
BrowserlessPreloadScrollConfig = Dry::Schema.Params do
|
|
45
|
+
optional(:iterations).filled(:integer, gt?: 0)
|
|
46
|
+
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# Contract for Browserless preload orchestration options.
|
|
50
|
+
BrowserlessPreloadConfig = Dry::Schema.Params do
|
|
51
|
+
optional(:wait_after_ms).filled(:integer, gteq?: 0)
|
|
52
|
+
optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
|
|
53
|
+
optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
# Contract for Browserless-specific request options.
|
|
57
|
+
BrowserlessRequestConfig = Dry::Schema.Params do
|
|
58
|
+
optional(:preload).hash(BrowserlessPreloadConfig)
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
# Contract for Botasaurus-specific request options.
|
|
62
|
+
BotasaurusRequestConfig = Dry::Schema.Params do
|
|
63
|
+
config.validate_keys = true
|
|
64
|
+
|
|
65
|
+
optional(:navigation_mode).filled(:string, included_in?: %w[auto get google_get google_get_bypass])
|
|
66
|
+
optional(:max_retries).filled(:integer, gteq?: 0, lteq?: 3)
|
|
67
|
+
optional(:wait_for_selector).maybe(:string)
|
|
68
|
+
optional(:wait_timeout_seconds).filled(:integer, gt?: 0)
|
|
69
|
+
optional(:block_images).filled(:bool)
|
|
70
|
+
optional(:block_images_and_css).filled(:bool)
|
|
71
|
+
optional(:wait_for_complete_page_load).filled(:bool)
|
|
72
|
+
optional(:headless).filled(:bool)
|
|
73
|
+
optional(:proxy).filled(:string)
|
|
74
|
+
optional(:user_agent).filled(:string)
|
|
75
|
+
optional(:window_size).value(:array, min_size?: 2, max_size?: 2).each(:integer, gt?: 0)
|
|
76
|
+
optional(:lang).filled(:string)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
# Contract for the top-level `request` section.
|
|
80
|
+
RequestConfig = Dry::Schema.Params do
|
|
81
|
+
optional(:max_redirects).filled(:integer, gteq?: 0)
|
|
82
|
+
optional(:max_requests).filled(:integer, gt?: 0)
|
|
83
|
+
optional(:browserless).hash(BrowserlessRequestConfig)
|
|
84
|
+
optional(:botasaurus).hash(BotasaurusRequestConfig)
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
params do
|
|
88
|
+
optional(:strategy).filled(:symbol)
|
|
89
|
+
required(:channel).hash(ChannelConfig)
|
|
90
|
+
optional(:headers).hash
|
|
91
|
+
optional(:stylesheets).array(StylesheetConfig)
|
|
92
|
+
optional(:auto_source).hash(AutoSource::Config)
|
|
93
|
+
optional(:selectors).hash
|
|
94
|
+
optional(:dynamic_params_error).maybe(:string)
|
|
95
|
+
optional(:request).hash(RequestConfig)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
# Every header value must be a String; each offending entry is reported
# under its own `[:headers, <name>]` path.
rule(:headers) do
  value&.each do |header_name, header_value|
    next if header_value.is_a?(String)

    key([:headers, header_name]).failure("must be a String, but got #{header_value.class}")
  end
end
|
|
105
|
+
|
|
106
|
+
# A present `dynamic_params_error` value is reported as a base failure,
# using the value itself as the message.
rule(:dynamic_params_error) do
  next unless value

  base.failure(value)
end
|
|
109
|
+
|
|
110
|
+
# Accepts a missing strategy, `:auto`, or any strategy currently registered
# with the request service; anything else fails with the baseline options.
rule(:strategy) do
  accepted = value.nil? || value == :auto || Html2rss::RequestService.strategy_registered?(value)

  key.failure("must be one of: #{BASE_STRATEGY_OPTIONS.join(', ')}") unless accepted
end
|
|
116
|
+
|
|
117
|
+
# The configuration needs at least one item source: the `selectors` key,
# the `auto_source` key, or both.
rule(:selectors, :auto_source) do
  next if %i[selectors auto_source].any? { |section| values.key?(section) }

  base.failure("Configuration must include at least 'selectors' or 'auto_source'")
end
|
|
123
|
+
|
|
124
|
+
# Runs the dedicated selectors contract and reports each of its error texts
# under the `selectors` key.
rule(:selectors) do
  next unless value

  Html2rss::Selectors::Config.call(value).errors.each do |error|
    key(:selectors).failure(error.text)
  end
end
|
|
130
|
+
|
|
131
|
+
# Delegates channel URL validation to Html2rss::Url. An ArgumentError raised
# there is reported as a failure on `channel.url`; absent or empty URLs are
# skipped here.
rule(:channel) do
  channel = values[:channel]
  next unless channel&.key?(:url)

  url_string = channel[:url]
  next if url_string.nil? || url_string.empty?

  begin
    Html2rss::Url.for_channel(url_string)
  rescue ArgumentError => error
    key(%i[channel url]).failure(error.message)
  end
end
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
end
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -1,82 +1,139 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require '
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'yaml'
|
|
4
5
|
|
|
5
6
|
module Html2rss
|
|
6
7
|
##
|
|
7
|
-
# The
|
|
8
|
-
# provides
|
|
8
|
+
# The provided configuration is used to generate the RSS feed.
|
|
9
|
+
# This class provides methods to load and process configuration from a YAML file,
|
|
10
|
+
# supporting both single and multiple feed configurations.
|
|
11
|
+
#
|
|
12
|
+
# Configuration is validated during initialization.
|
|
9
13
|
class Config
|
|
10
|
-
|
|
14
|
+
# Raised when a configuration hash fails runtime validation.
|
|
15
|
+
class InvalidConfig < Html2rss::Error; end
|
|
16
|
+
extend ClassMethods
|
|
11
17
|
|
|
12
18
|
##
|
|
13
|
-
#
|
|
14
|
-
# were passed to Config.
|
|
15
|
-
class ParamsMissing < Html2rss::Error; end
|
|
16
|
-
|
|
17
|
-
##
|
|
18
|
-
# Thrown when the feed config does not contain a value at `:channel`.
|
|
19
|
-
class ChannelMissing < Html2rss::Error; end
|
|
20
|
-
|
|
21
|
-
def_delegator :@channel, :author
|
|
22
|
-
def_delegator :@channel, :ttl
|
|
23
|
-
def_delegator :@channel, :title
|
|
24
|
-
def_delegator :@channel, :language
|
|
25
|
-
def_delegator :@channel, :description
|
|
26
|
-
def_delegator :@channel, :url
|
|
27
|
-
def_delegator :@channel, :url, :link
|
|
28
|
-
def_delegator :@channel, :time_zone
|
|
29
|
-
def_delegator :@channel, :json?
|
|
30
|
-
def_delegator :@channel, :strategy
|
|
31
|
-
|
|
32
|
-
def_delegator :@selectors, :item_selector_names
|
|
33
|
-
def_delegator :@selectors, :selector?
|
|
34
|
-
def_delegator :@selectors, :category_selector_names
|
|
35
|
-
def_delegator :@selectors, :guid_selector_names
|
|
36
|
-
def_delegator :@selectors, :items_order
|
|
37
|
-
def_delegator :@selectors, :selector_string
|
|
38
|
-
|
|
39
|
-
##
|
|
40
|
-
# Initializes the Config object with feed configuration, global settings, and parameters.
|
|
19
|
+
# Initializes the configuration object.
|
|
41
20
|
#
|
|
42
|
-
#
|
|
43
|
-
#
|
|
44
|
-
# @param
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@
|
|
21
|
+
# Processes deprecated attributes, applies default values, and validates the configuration.
|
|
22
|
+
#
|
|
23
|
+
# @param config [Hash{Symbol => Object}] the configuration hash.
|
|
24
|
+
# @raise [InvalidConfig] if the configuration fails validation.
|
|
25
|
+
def initialize(config)
  # Request controls are first derived from the raw input, before the
  # preparer applies defaults and deprecation handling.
  @request_controls = RequestControls.from_config(config)

  validated = validated_config_for(Preparer.new.call(config))
  @config = validated.freeze

  # Re-derive the controls with the values that survived validation.
  effective_values = {
    strategy: validated[:strategy],
    max_redirects: validated.dig(:request, :max_redirects),
    max_requests: validated.dig(:request, :max_requests)
  }
  @request_controls = request_controls.with_effective_values(**effective_values)
end
|
|
53
37
|
|
|
38
|
+
# @return [Symbol, nil] request strategy reported by the request controls
def strategy
  request_controls.strategy
end

# @return [Integer, nil] redirect budget reported by the request controls
def max_redirects
  request_controls.max_redirects
end

# @return [Integer, nil] request budget reported by the request controls
def max_requests
  request_controls.max_requests
end

# @return [Array<Hash>] stylesheet definitions stored under `:stylesheets`
def stylesheets
  config[:stylesheets]
end
|
|
46
|
+
|
|
54
47
|
##
|
|
55
|
-
#
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# @return [Hash<Symbol, Object>] Merged attributes hash.
|
|
59
|
-
def selector_attributes_with_channel(name)
|
|
60
|
-
@selectors.selector(name).to_h.merge(channel: @channel)
|
|
48
|
+
# @return [Boolean] whether the request controls report `max_requests` as explicitly set
def explicit_max_requests? = request_controls.explicit?(:max_requests)
|
|
62
52
|
|
|
63
53
|
##
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
54
|
+
# @return [Html2rss::RequestControls] request controls with provenance
|
|
55
|
+
attr_reader :request_controls
|
|
56
|
+
|
|
57
|
+
# @return [Hash{String => String}] normalized HTTP headers
def headers
  config[:headers]
end

# @return [Hash{Symbol => Object}] channel configuration
def channel
  config[:channel]
end

# @return [String] source channel URL
def url
  config.dig(:channel, :url)
end

# @return [String, nil] configured channel time zone
def time_zone
  config.dig(:channel, :time_zone)
end

# @return [Hash{Symbol => Object}] request envelope configuration
def request
  config[:request]
end

# @return [Hash{Symbol => Object}, nil] selectors configuration
def selectors
  config[:selectors]
end

# @return [Hash{Symbol => Object}, nil] auto-source configuration
def auto_source
  config[:auto_source]
end
|
|
73
|
+
|
|
74
|
+
private
|
|
75
|
+
|
|
76
|
+
attr_reader :config
|
|
77
|
+
|
|
78
|
+
# Normalizes raw config input before validation: migrates the deprecated
# `channel.strategy` / `channel.headers` keys to the top level and merges in
# the default configuration.
class Preparer
  ##
  # @param config [Hash{Symbol => Object}] raw config input; its top-level
  #   keys are never written to
  # @return [Hash{Symbol => Object}] config with defaults and deprecations applied
  def call(config)
    # Always work on a shallow copy. The deprecation step assigns top-level
    # keys, and writing them into the caller's hash would make injected
    # defaults look caller-supplied if that hash is reused.
    config = handle_deprecated_channel_attributes(config.dup)
    config = apply_default_config(config)
    config = apply_default_selectors_config(config) if config[:selectors]
    config = apply_default_auto_source_config(config) if config[:auto_source]

    config
  end

  private

  # Moves `channel.<key>` to the top level (with a deprecation warning) when
  # the top-level key is unset, then falls back to the default value.
  #
  # @param config [Hash{Symbol => Object}] working copy, modified in place
  # @return [Hash{Symbol => Object}] the same working copy
  def handle_deprecated_channel_attributes(config)
    { strategy: Config.default_strategy_name, headers: {} }.each_pair do |key, default_value|
      legacy_value = config.dig(:channel, key) unless config[key]

      if legacy_value
        Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
        config[key] = legacy_value
      end

      config[key] ||= default_value
    end

    config
  end

  # @return [Hash{Symbol => Object}] config merged over `Config.default_config`
  def apply_default_config(config)
    HashUtil.deep_merge(Config.default_config, config)
  end

  # @return [Hash{Symbol => Object}] config merged over the default selectors config
  def apply_default_selectors_config(config)
    HashUtil.deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
  end

  # @return [Hash{Symbol => Object}] config merged over the default auto_source config
  def apply_default_auto_source_config(config)
    HashUtil.deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
  end
end
|
|
70
121
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
122
|
+
# Runs the validator over the prepared config.
#
# @param config [Hash{Symbol => Object}] prepared configuration
# @return [Hash{Symbol => Object}] validated config with normalized headers
# @raise [InvalidConfig] when validation reports errors
def validated_config_for(config)
  result = Validator.new.call(config)
  return normalized_headers(result.to_h) if result.success?

  raise InvalidConfig, "Invalid configuration: #{result.errors.to_h}"
end
|
|
78
129
|
|
|
79
|
-
|
|
80
|
-
|
|
130
|
+
# Replaces `:headers` with the result of RequestHeaders.normalize, passing
# the channel language and URL along.
#
# @param validated_config [Hash{Symbol => Object}] validated config, modified in place
# @return [Hash{Symbol => Object}] the same hash with normalized headers
def normalized_headers(validated_config)
  channel_language = validated_config.dig(:channel, :language)
  url = validated_config.dig(:channel, :url)

  validated_config[:headers] = RequestHeaders.normalize(validated_config[:headers], channel_language:, url:)
  validated_config
end
|
|
81
138
|
end
|
|
82
139
|
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  # The Html2rss::Error base class.
  class Error < StandardError; end

  # Raised when auto fallback exhausts all concrete tiers and extractors find no feed items.
  class NoFeedItemsExtracted < Error
    ##
    # @param attempts [Array<Hash{Symbol => Object}>, nil] tier attempt
    #   diagnostics; each entry is read for `:strategy`, `:items_count` and
    #   `:error_class`. `nil` is treated as "no attempts" so that building
    #   the error never raises itself.
    def initialize(attempts:)
      @attempts = attempts || []
      super(build_message)
    end

    # @return [Array<Hash{Symbol => Object}>] tier attempt diagnostics
    attr_reader :attempts

    private

    # @return [String] summary of every attempt plus a remediation hint
    def build_message
      outcome =
        if attempts.empty?
          # Avoids the malformed "across strategies: . Try ..." message.
          ': no strategies were attempted'
        else
          " across strategies: #{attempts.map { |attempt| summarize(attempt) }.join(', ')}"
        end

      "No feed items extracted after auto fallback#{outcome}. " \
        'Try a more specific listing URL or provide explicit selectors.'
    end

    # An attempt without an item count is reported by its error class.
    #
    # @param attempt [Hash{Symbol => Object}] single tier diagnostic
    # @return [String] e.g. "faraday (0 items)"
    def summarize(attempt)
      details = attempt[:items_count].nil? ? "#{attempt[:error_class]} error" : "#{attempt[:items_count]} items"
      "#{attempt[:strategy]} (#{details})"
    end
  end
end
|