html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Shared anti-bot/interstitial signatures used by request and auto-source flows.
  #
  # This module centralizes signature matching so request-time guards and
  # auto-source surface classification stay consistent.
  module BlockedSurface
    # Known interstitial signatures. Each entry provides:
    # - :key         identifier of the signature
    # - :min_matches how many of the patterns must match (defaults to 1 when absent)
    # - :patterns    regular expressions tested against the normalized body
    # - :message     explanation with remediation hints
    INTERSTITIAL_SIGNATURES = [
      {
        key: :cloudflare_interstitial,
        min_matches: 2,
        patterns: [
          %r{<title>\s*just a moment\.\.\.\s*</title>}i,
          /checking your browser before accessing/i,
          /please (?:enable|turn on) javascript and cookies/i,
          %r{cdn-cgi/challenge-platform}i,
          /cloudflare ray id/i
        ],
        message: 'Blocked surface detected: Cloudflare anti-bot interstitial page. ' \
                 'Retry with --strategy browserless, try a more specific public listing URL, ' \
                 'or run from an environment that can complete anti-bot checks.'
      }
    ].freeze

    ##
    # Returns the first matching interstitial signature for the provided body.
    #
    # @param body [String, nil] response body candidate
    # @return [Hash, nil] signature hash when matched, otherwise nil
    def self.interstitial_signature_for(body)
      normalized_body = normalize_body(body)
      INTERSTITIAL_SIGNATURES.find { |signature| interstitial_signature_match?(normalized_body, signature) }
    end

    ##
    # @param body [String, nil] response body candidate
    # @return [Boolean] true when body matches a known interstitial signature
    def self.interstitial?(body)
      !interstitial_signature_for(body).nil?
    end

    ##
    # Counts pattern hits and stops as soon as the signature's threshold is reached.
    #
    # @param body [String] normalized response body
    # @param signature [Hash] entry of INTERSTITIAL_SIGNATURES
    # @return [Boolean] true when at least :min_matches patterns match
    def self.interstitial_signature_match?(body, signature)
      min_matches = signature.fetch(:min_matches, 1)
      matches = 0

      signature.fetch(:patterns).each do |pattern|
        matches += 1 if pattern.match?(body)
        return true if matches >= min_matches
      end

      false
    end
    private_class_method :interstitial_signature_match?

    ##
    # Converts the body into a UTF-8 string that is safe to run regular expressions on.
    # Bytes that cannot be represented are dropped.
    #
    # @param body [String, nil] response body candidate (nil becomes an empty string)
    # @return [String] UTF-8 text
    def self.normalize_body(body)
      text = body.to_s
      text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace, replace: '')
    rescue EncodingError
      # `to_s` returns the receiver itself for Strings, so relabel a copy:
      # force_encoding on the original would mutate the caller's body and
      # raise FrozenError when that body is frozen.
      text.dup.force_encoding(Encoding::UTF_8).scrub
    end
    private_class_method :normalize_body
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # CategoryExtractor is responsible for extracting categories from HTML elements
  # by looking for CSS class names containing common category-related terms.
  class CategoryExtractor
    # Common category-related terms to look for in class names
    CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze

    # CSS selectors to find elements with category-related class names
    CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze

    # Regex pattern for matching category-related attribute names
    CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i

    ##
    # Extracts categories from the given article tag by looking for elements
    # with class names containing common category-related terms.
    #
    # Values are stripped, blank values are dropped, and values that only
    # differed in surrounding whitespace are collapsed into one entry.
    #
    # @param article_tag [Nokogiri::XML::Element] The article element to extract categories from
    # @return [Array<String>] Array of unique category strings, empty if none found
    def self.call(article_tag)
      return [] unless article_tag

      extract_all_categories(article_tag)
        .map(&:strip)
        .reject(&:empty?)
        .uniq # the Set de-duplicated raw values only; stripping can create new duplicates
    end

    ##
    # Walks all descendants once and collects both text and attribute categories.
    #
    # @param article_tag [Nokogiri::XML::Element] The article element
    # @return [Set<String>] Set of category strings
    def self.extract_all_categories(article_tag)
      Set.new.tap do |categories|
        article_tag.css('*').each do |element|
          # Extract text categories from elements with category-related class names
          categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)

          # Extract attribute categories from all elements
          categories.merge(extract_element_data_categories(element))
        end
      end
    end

    ##
    # Extracts categories from the attributes of a single element.
    # Every attribute whose name matches CATEGORY_ATTR_PATTERN contributes its
    # stripped value; the check is on the attribute name and is not limited to
    # a `data-` prefix.
    #
    # @param element [Nokogiri::XML::Element] The element to process
    # @return [Set<String>] Set of category strings
    def self.extract_element_data_categories(element)
      Set.new.tap do |categories|
        element.attributes.each_value do |attr|
          next unless attr.name.match?(CATEGORY_ATTR_PATTERN)

          value = attr.value&.strip
          categories.add(value) if value && !value.empty?
        end
      end
    end

    ##
    # Extracts text-based categories from elements, splitting content into discrete values.
    # Anchor texts win when the element contains links; otherwise the element's
    # visible text is split on line breaks.
    #
    # @param element [Nokogiri::XML::Element] The element to process
    # @return [Set<String>] Set of category strings
    def self.extract_text_categories(element)
      anchor_values = element.css('a').filter_map do |node|
        HtmlExtractor.extract_visible_text(node)
      end
      return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?

      text = HtmlExtractor.extract_visible_text(element)
      return Set.new unless text

      Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
    end
  end
end
|
data/lib/html2rss/cli.rb
CHANGED
|
@@ -1,46 +1,193 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'json'
|
|
4
5
|
require 'thor'
|
|
5
6
|
|
|
6
7
|
##
|
|
7
8
|
# The Html2rss namespace / command line interface.
|
|
8
9
|
module Html2rss
|
|
9
|
-
Log = Logger.new($stderr)
|
|
10
|
-
|
|
11
10
|
##
|
|
12
11
|
# The Html2rss command line interface.
|
|
13
|
-
class CLI < Thor
|
|
12
|
+
class CLI < Thor # rubocop:disable Metrics/ClassLength
  check_unknown_options!

  # Hook consulted by Thor; returning true opts in to exiting when a command fails.
  def self.exit_on_failure?
    true
  end

  desc 'feed YAML_FILE [feed_name]', 'Print RSS built from the YAML_FILE file to stdout'
  method_option :params,
                type: :hash,
                optional: true,
                required: false,
                default: {}
  method_option :strategy,
                type: :string,
                desc: 'The strategy to request the URL',
                enum: %w[faraday browserless]
  method_option :max_redirects,
                type: :numeric,
                desc: 'Maximum redirects to follow per request'
  method_option :max_requests,
                type: :numeric,
                desc: 'Maximum requests to allow for this feed build'
  ##
  # Builds the feed described by the YAML file and prints it.
  #
  # @param yaml_file [String] the YAML config file
  # @param feed_name [String, nil] optional feed name for multi-feed files
  # @return [void]
  def feed(yaml_file, feed_name = nil)
    feed_config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
    feed_config[:params] = options[:params] || {}
    apply_runtime_request_overrides!(feed_config)

    output = execute_feed { Html2rss.feed(feed_config) }
    puts(output)
  end

  desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
  method_option :strategy,
                type: :string,
                desc: 'The strategy to request the URL',
                enum: %w[faraday browserless]
  method_option :format,
                type: :string,
                desc: 'Output format for the auto-sourced feed',
                enum: %w[rss jsonfeed],
                default: 'rss'
  method_option :items_selector, type: :string, desc: 'CSS selector for items (will be enhanced) (optional)'
  method_option :max_redirects,
                type: :numeric,
                desc: 'Maximum redirects to follow per request'
  method_option :max_requests,
                type: :numeric,
                desc: 'Maximum requests to allow for this feed build'
  ##
  # Auto-sources a feed from the URL and prints it in the requested format.
  #
  # @param url [String] the page to source the feed from
  # @return [void]
  def auto(url)
    json_feed = options.fetch(:format, 'rss') == 'jsonfeed'
    builder = Html2rss.method(json_feed ? :auto_json_feed : :auto_source)
    request_options = {
      strategy: current_strategy,
      items_selector: options[:items_selector],
      max_redirects: options[:max_redirects],
      max_requests: options[:max_requests]
    }

    result = execute_feed { builder.call(url, **request_options) }

    # The JSON feed result is serialized here; the RSS result is printed as returned.
    puts(json_feed ? JSON.pretty_generate(result) : result)
  end

  desc 'schema', 'Print the exported config JSON Schema'
  method_option :pretty,
                type: :boolean,
                desc: 'Pretty-print the schema JSON',
                default: true
  method_option :write,
                type: :string,
                desc: 'Write the schema JSON to the given file path'
  ##
  # Prints the exported configuration JSON Schema, or writes it to the
  # `--write` path (creating parent directories) and prints that path.
  #
  # @return [void]
  def schema
    schema_json = Html2rss::Config.json_schema_json(pretty: options.fetch(:pretty, true))
    target = options[:write]

    if target
      FileUtils.mkdir_p(File.dirname(target))
      File.write(target, "#{schema_json}\n")
      puts target
    else
      puts schema_json
    end
  end

  desc 'validate YAML_FILE [feed_name]', 'Validate a YAML config with the runtime validator'
  method_option :params,
                type: :hash,
                optional: true,
                required: false,
                default: {}
  ##
  # Validates a YAML config and prints the result.
  #
  # @param yaml_file [String] the YAML file to validate
  # @param feed_name [String, nil] optional feed name for multi-feed files
  # @return [void]
  # @raise [Thor::Error] when the configuration is invalid
  def validate(yaml_file, feed_name = nil)
    result = Html2rss::Config.validate_yaml(yaml_file, feed_name, params: options[:params] || {})

    unless result.success?
      raise Thor::Error, "Invalid configuration: #{result.errors.to_h}"
    end

    puts 'Configuration is valid'
  end

  private

  # Drops blank request settings from the config, then applies the CLI request controls.
  def apply_runtime_request_overrides!(config)
    clear_blank_request_overrides!(config)
    request_controls.apply_to(config)
  end

  # Removes nil-valued :strategy / :request entries so they are not treated as set.
  def clear_blank_request_overrides!(config)
    config.delete(:strategy) if config[:strategy].nil?

    request_section = config[:request]
    return unless request_section.is_a?(Hash)

    %i[max_redirects max_requests].each { |key| request_section.delete(key) if request_section[key].nil? }
    config.delete(:request) if request_section.empty?
  end

  # Request controls assembled from the command-line options.
  def request_controls
    Html2rss::RequestControls.new(
      strategy: options[:strategy]&.to_sym,
      max_redirects: options[:max_redirects],
      max_requests: options[:max_requests],
      explicit_keys: explicit_request_control_keys
    )
  end

  # Names of the request controls that were given on the command line.
  def explicit_request_control_keys
    [].tap do |given|
      given << :strategy if options[:strategy]
      given << :max_redirects unless options[:max_redirects].nil?
      given << :max_requests unless options[:max_requests].nil?
    end
  end

  # Strategy from the options, falling back to :faraday.
  def current_strategy
    options[:strategy]&.to_sym || :faraday
  end

  def current_max_redirects
    options.fetch(:max_redirects, Html2rss::RequestService::Policy::DEFAULTS[:max_redirects])
  end

  def current_max_requests
    options.fetch(:max_requests, Html2rss::RequestService::Policy::DEFAULTS[:max_requests])
  end

  # One more than the current limit; used in retry hints.
  def suggested_max_redirects
    current_max_redirects + 1
  end

  # One more than the current limit; used in retry hints.
  def suggested_max_requests
    current_max_requests + 1
  end

  # Runs the block and converts known request failures into Thor::Error
  # messages that carry a retry hint.
  def execute_feed # rubocop:disable Metrics/MethodLength
    yield
  rescue Faraday::FollowRedirects::RedirectLimitReached => e
    raise Thor::Error,
          "#{e.message}. retry with --max-redirects #{suggested_max_redirects} or use the final URL directly."
  rescue Html2rss::RequestService::RequestBudgetExceeded => e
    raise Thor::Error,
          "#{e.message}. retry with --max-requests #{suggested_max_requests} " \
          'or increase request.max_requests in the config.'
  rescue Html2rss::RequestService::BrowserlessConfigurationError,
         Html2rss::RequestService::BrowserlessConnectionFailed,
         Html2rss::RequestService::BlockedSurfaceDetected => e
    raise Thor::Error, e.message
  end
end
|
|
46
193
|
end
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    ##
    # Public class-level helpers for loading, validating, and exporting config.
    module ClassMethods
      # Sentinel default for the `params:` keywords; handled like nil when params are resolved.
      UNSET = Object.new.freeze

      ##
      # Returns the exported JSON Schema for html2rss configuration.
      #
      # @return [Hash<String, Object>] JSON Schema represented as a Ruby hash
      def json_schema
        Schema.json_schema
      end

      ##
      # Returns the exported JSON Schema as JSON.
      #
      # @param pretty [Boolean] whether to pretty-print the JSON output
      # @return [String] serialized JSON Schema
      def json_schema_json(pretty: true)
        pretty ? JSON.pretty_generate(json_schema) : JSON.generate(json_schema)
      end

      ##
      # Validates a configuration hash with the runtime validator.
      #
      # When a dynamic parameter is missing, the unresolved config is validated
      # instead and the error message is passed along under :dynamic_params_error.
      #
      # @param config [Hash<Symbol, Object>] the configuration hash
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate(config, params: UNSET)
        prepared_config = prepare_for_validation(resolve_effective_config(config, params:))

        Validator.new.call(prepared_config)
      rescue DynamicParams::ParamsMissing => error
        prepared_config = prepare_for_validation(deep_dup(config))
        prepared_config[:dynamic_params_error] = error.message

        Validator.new.call(prepared_config)
      end

      ##
      # Returns the packaged JSON Schema file path.
      #
      # @return [String] absolute path to the packaged JSON Schema file
      def schema_path
        Schema.path
      end

      ##
      # Loads and validates a YAML configuration file.
      #
      # @param file [String] the YAML file to load
      # @param feed_name [String, nil] optional feed name for multi-feed files
      # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
        validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
      end

      ##
      # Loads the feed configuration from a YAML file.
      #
      # Supports multiple feeds defined under the specified key (default :feeds).
      #
      # @param file [String] the YAML file to load.
      # @param feed_name [String, nil] the feed name when using multiple feeds.
      # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
      # @return [Hash<Symbol, Object>] the configuration hash.
      # @raise [ArgumentError] if the file doesn't exist, holds no mapping, or the feed is not found.
      # rubocop:disable Metrics/MethodLength
      def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
        raise ArgumentError, "File '#{file}' does not exist" unless File.exist?(file)
        raise ArgumentError, "`#{multiple_feeds_key}` is a reserved feed name" if reserved_feed_name?(feed_name,
                                                                                                      multiple_feeds_key)

        yaml = load_yaml_hash(file)

        return yaml unless yaml.key?(multiple_feeds_key)

        unless feed_name
          available_feeds = yaml.fetch(multiple_feeds_key).keys.join(', ')
          raise ArgumentError,
                "Feed name is required under `#{multiple_feeds_key}`. Available feeds: #{available_feeds}"
        end

        config = yaml.dig(multiple_feeds_key, feed_name.to_sym)
        raise ArgumentError, "Feed '#{feed_name}' not found under `#{multiple_feeds_key}` key." unless config

        MultipleFeedsConfig.to_single_feed(config, yaml, multiple_feeds_key:)
      end
      # rubocop:enable Metrics/MethodLength

      ##
      # Processes the provided configuration hash, applying dynamic parameters if given,
      # and returns a new configuration object.
      #
      # @param config [Hash<Symbol, Object>] the configuration hash.
      # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting.
      # @return [Html2rss::Config] the configuration object.
      def from_hash(config, params: UNSET)
        new(resolve_effective_config(config, params:))
      end

      ##
      # Builds a top-level auto-source feed config for the public shortcut APIs.
      #
      # @param url [String] source page URL
      # @param items_selector [String, nil] optional selector hint for item extraction
      # @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
      # @return [Hash<Symbol, Object>] feed config hash ready for {from_hash}
      def auto_source_config(url:, items_selector: nil, request_controls: nil)
        config = {
          channel: default_config[:channel].merge(url:),
          auto_source: AutoSource::DEFAULT_CONFIG
        }

        request_controls ||= Html2rss::RequestControls.new
        request_controls.apply_to(config)

        config[:selectors] = { items: { selector: items_selector, enhance: true } } if items_selector
        config
      end

      ##
      # Provides a default configuration.
      #
      # @return [Hash<Symbol, Object>] a hash with default configuration values.
      def default_config
        {
          strategy: RequestService.default_strategy_name,
          request: {
            max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
            max_requests: RequestService::Policy::DEFAULTS[:max_requests]
          },
          channel: { time_zone: 'UTC' },
          headers: RequestHeaders.browser_defaults,
          stylesheets: []
        }
      end

      private

      # True when the requested feed name collides with the multi-feed key.
      # Compared as strings because the feed name is documented as a String
      # while the key is a Symbol; `==` between the two is always false.
      def reserved_feed_name?(feed_name, multiple_feeds_key)
        !feed_name.nil? && feed_name.to_s == multiple_feeds_key.to_s
      end

      # Parses the YAML file and insists on a mapping at the top level, so an
      # empty or scalar-only file fails with ArgumentError instead of NoMethodError.
      def load_yaml_hash(file)
        yaml = YAML.safe_load_file(file, symbolize_names: true)
        raise ArgumentError, "File '#{file}' does not contain a configuration hash" unless yaml.is_a?(Hash)

        yaml
      end

      # Copies the config and formats the :headers and :channel sections in place
      # with the parameter defaults overlaid by the explicitly given params.
      def resolve_effective_config(config, params:)
        effective_config = deep_dup(config)
        resolved_params = parameter_defaults(effective_config)
        # Keys are symbolized so that a String-keyed param replaces the
        # Symbol-keyed default of the same name instead of sitting beside it.
        resolved_params.merge!(symbolize_param_keys(params)) unless params.equal?(UNSET) || params.nil?

        DynamicParams.call(effective_config[:headers], resolved_params)
        DynamicParams.call(effective_config[:channel], resolved_params)

        effective_config
      end

      # Collects `default:` values declared under the config's :parameters section.
      def parameter_defaults(config)
        defaults = config.fetch(:parameters, {})
                         .filter_map do |name, definition|
                           [name, definition[:default]] if definition.is_a?(Hash) && definition.key?(:default)
                         end
                         .to_h
        symbolize_param_keys(defaults)
      end

      # Returns a copy of the params with every key that supports it turned into a Symbol.
      def symbolize_param_keys(params)
        params.to_hash.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
      end

      def prepare_for_validation(config)
        Config::Preparer.new.call(deep_dup(config))
      end

      # Recursively copies hashes and arrays; leaves are duplicated when possible.
      # rubocop:disable Metrics/MethodLength
      def deep_dup(object)
        case object
        when Hash
          object.transform_values do |value|
            deep_dup(value)
          end
        when Array
          object.map { |value| deep_dup(value) }
        else
          begin
            object.dup
          rescue TypeError
            object
          end
        end
      end
      # rubocop:enable Metrics/MethodLength
    end
  end
end
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    # Substitutes `%{name}` / `%<name>` placeholders inside configuration values.
    class DynamicParams
      class ParamsMissing < Html2rss::Error; end

      class << self
        # Walks the value and formats every string that contains placeholders.
        # Hashes and other enumerables are updated in place; strings come back
        # as the formatted result.
        #
        # @param value [String, Hash, Enumerable, Object] The value to process.
        # @param params [Hash] The parameters for substitution.
        # @param getter [Proc, nil] Optional proc to retrieve a key's value.
        # @param replace_missing_with [Object, nil] Value to substitute if a key is missing.
        # @return [Object] The processed value.
        def call(value, params = {}, getter: nil, replace_missing_with: nil)
          lookup = { getter:, replace_missing_with: }

          case value
          when String then from_string(value, params, **lookup)
          when Hash then from_hash(value, params, **lookup)
          when Enumerable then from_enumerable(value, params, **lookup)
          else value
          end
        end

        private

        # Builds the lookup hash handed to `format`: values are resolved lazily
        # through the getter (when given) or from params by symbol, then string key.
        def format_params(params, getter:, replace_missing_with:)
          Hash.new do |cache, key|
            found = getter ? getter.call(key) : params.fetch(key.to_sym) { params[key.to_s] }
            found = replace_missing_with if found.nil? && !replace_missing_with.nil?
            cache[key] = found
          end
        end

        # Formats a single string; strings without placeholders are returned untouched.
        def from_string(string, params, getter:, replace_missing_with:)
          return string unless string.match?(/%\{[^{}]*\}|%<[^<>]*>/)

          format(string, format_params(params, getter:, replace_missing_with:))
        rescue KeyError => e
          raise ParamsMissing, "Missing parameter for formatting: #{e.message}" if replace_missing_with.nil?

          string
        end

        # Symbolizes the keys and processes every value, mutating the hash itself.
        def from_hash(hash, params, getter:, replace_missing_with:)
          hash.transform_keys!(&:to_sym)
              .transform_values! { |entry| call(entry, params, getter:, replace_missing_with:) }
        end

        # Processes every element, mutating the collection itself.
        def from_enumerable(enumerable, params, getter:, replace_missing_with:)
          enumerable.map! { |entry| call(entry, params, getter:, replace_missing_with:) }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    # Handles multiple feeds within a single configuration hash.
    # Individual feed configurations should be placed under the :feeds key,
    # where each feed name is the key for its feed configuration.
    # All global configuration keys (outside :feeds) are merged into each feed's settings.
    class MultipleFeedsConfig
      CONFIG_KEY_FEEDS = :feeds

      class << self
        # Merges global configuration into the given feed's configuration.
        # The feed hash is updated in place and returned.
        #
        # @param config [Hash] The feed-specific configuration.
        # @param yaml [Hash] The full YAML configuration.
        # @param multiple_feeds_key [Symbol] The key under which multiple feeds are defined.
        # @return [Hash] The merged configuration.
        def to_single_feed(config, yaml, multiple_feeds_key: CONFIG_KEY_FEEDS)
          global_keys = yaml.keys - [multiple_feeds_key]
          global_keys.each do |key|
            config[key] = merge_key(config, yaml, key)
          end
          config
        end

        private

        # Merges a specific global key from the YAML configuration into the feed configuration.
        #
        # Precedence rules:
        # - feed value missing (nil): the global value is inherited
        # - Hash: shallow merge, feed entries win over global ones
        # - Array: global entries followed by feed entries
        # - any other feed value (String, Integer, false, ...): the feed value wins,
        #   matching the precedence the Hash merge gives feed entries
        #
        # @param config [Hash] The feed-specific configuration.
        # @param yaml [Hash] The full YAML configuration.
        # @param key [Symbol] The global configuration key to merge.
        # @return [Object] The merged value for the key.
        def merge_key(config, yaml, key)
          global_value = yaml.fetch(key, nil)
          local_value = config[key]

          case local_value
          when nil
            global_value
          when Hash
            global_value.is_a?(Hash) ? global_value.merge(local_value) : local_value
          when Array
            global_value.is_a?(Array) ? global_value + local_value : local_value
          else
            local_value
          end
        end
      end
    end
  end
end
|