html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -6,13 +6,22 @@ require 'forwardable'
|
|
|
6
6
|
module Html2rss
|
|
7
7
|
##
|
|
8
8
|
# Requests website URLs to retrieve their HTML for further processing.
|
|
9
|
-
# Provides strategies,
|
|
9
|
+
# Provides strategies, e.g. integrating Browserless.io.
|
|
10
10
|
class RequestService
|
|
11
11
|
include Singleton
|
|
12
12
|
|
|
13
13
|
class UnknownStrategy < Html2rss::Error; end
|
|
14
14
|
class InvalidUrl < Html2rss::Error; end
|
|
15
15
|
class UnsupportedUrlScheme < Html2rss::Error; end
|
|
16
|
+
class UnsupportedResponseContentType < Html2rss::Error; end
|
|
17
|
+
class RequestBudgetExceeded < Html2rss::Error; end
|
|
18
|
+
class PrivateNetworkDenied < Html2rss::Error; end
|
|
19
|
+
class CrossOriginFollowUpDenied < Html2rss::Error; end
|
|
20
|
+
class ResponseTooLarge < Html2rss::Error; end
|
|
21
|
+
class BlockedSurfaceDetected < Html2rss::Error; end
|
|
22
|
+
class RequestTimedOut < Html2rss::Error; end
|
|
23
|
+
class BrowserlessConfigurationError < Html2rss::Error; end
|
|
24
|
+
class BrowserlessConnectionFailed < Html2rss::Error; end
|
|
16
25
|
|
|
17
26
|
class << self
|
|
18
27
|
extend Forwardable
|
|
@@ -55,9 +64,12 @@ module Html2rss
|
|
|
55
64
|
##
|
|
56
65
|
# Registers a new strategy.
|
|
57
66
|
# @param name [Symbol] the name of the strategy
|
|
58
|
-
# @param strategy_class [Class] the class
|
|
67
|
+
# @param strategy_class [Class] the class implementing the strategy
|
|
68
|
+
# @raise [ArgumentError] if strategy_class is not a Class
|
|
59
69
|
def register_strategy(name, strategy_class)
|
|
60
|
-
|
|
70
|
+
unless strategy_class.is_a?(Class)
|
|
71
|
+
raise ArgumentError, "Expected a Class for strategy, got #{strategy_class.class}"
|
|
72
|
+
end
|
|
61
73
|
|
|
62
74
|
@strategies[name.to_sym] = strategy_class
|
|
63
75
|
end
|
|
@@ -65,7 +77,7 @@ module Html2rss
|
|
|
65
77
|
##
|
|
66
78
|
# Checks if a strategy is registered.
|
|
67
79
|
# @param name [Symbol] the name of the strategy
|
|
68
|
-
# @return [Boolean] true if the strategy is registered, false otherwise
|
|
80
|
+
# @return [Boolean] true if the strategy is registered, false otherwise.
|
|
69
81
|
def strategy_registered?(name)
|
|
70
82
|
@strategies.key?(name.to_sym)
|
|
71
83
|
end
|
|
@@ -73,24 +85,28 @@ module Html2rss
|
|
|
73
85
|
##
|
|
74
86
|
# Unregisters a strategy.
|
|
75
87
|
# @param name [Symbol] the name of the strategy
|
|
76
|
-
# @return [Boolean] true if the strategy was unregistered, false otherwise
|
|
77
|
-
|
|
78
|
-
|
|
88
|
+
# @return [Boolean] true if the strategy was unregistered, false otherwise.
|
|
89
|
+
# @raise [ArgumentError] if attempting to unregister the default strategy.
|
|
90
|
+
def unregister_strategy(name) # rubocop:disable Naming/PredicateMethod
|
|
91
|
+
name_sym = name.to_sym
|
|
92
|
+
raise ArgumentError, 'Cannot unregister the default strategy.' if name_sym == @default_strategy_name
|
|
79
93
|
|
|
80
|
-
!!@strategies.delete(
|
|
94
|
+
!!@strategies.delete(name_sym)
|
|
81
95
|
end
|
|
82
96
|
|
|
83
97
|
##
|
|
84
|
-
# Executes the request.
|
|
85
|
-
# @param ctx [Context] the context for the request
|
|
86
|
-
# @param strategy [Symbol] the strategy to use
|
|
87
|
-
# @return [Response] the response from the strategy
|
|
88
|
-
# @raise [
|
|
98
|
+
# Executes the request using the specified strategy.
|
|
99
|
+
# @param ctx [Context] the context for the request.
|
|
100
|
+
# @param strategy [Symbol] the strategy to use (defaults to the default strategy).
|
|
101
|
+
# @return [Response] the response from the executed strategy.
|
|
102
|
+
# @raise [ArgumentError] if the context is nil.
|
|
103
|
+
# @raise [UnknownStrategy] if the strategy is not registered.
|
|
89
104
|
def execute(ctx, strategy: default_strategy_name)
|
|
90
|
-
strategy_class = @strategies.fetch(strategy) do
|
|
105
|
+
strategy_class = @strategies.fetch(strategy.to_sym) do
|
|
91
106
|
raise UnknownStrategy,
|
|
92
|
-
"The strategy '#{strategy}' is not known. Available strategies
|
|
107
|
+
"The strategy '#{strategy}' is not known. Available strategies: #{strategy_names.join(', ')}"
|
|
93
108
|
end
|
|
109
|
+
|
|
94
110
|
strategy_class.new(ctx).execute
|
|
95
111
|
end
|
|
96
112
|
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Traverses a rel=next pagination chain for selector-driven extraction.
|
|
7
|
+
class RelNextPager
  include Enumerable

  ##
  # @param session [RequestSession] session that performs the follow-up requests
  # @param initial_response [RequestService::Response] response of the first page
  # @param max_pages [Integer] requested page budget; the initial page counts towards it
  # @param logger [Logger] receives a warning when pagination stops on an exhausted request budget
  def initialize(session:, initial_response:, max_pages:, logger: Html2rss::Log)
    @logger = logger
    @max_pages = max_pages
    @initial_response = initial_response
    @session = session
  end

  ##
  # Yields the initial response, then every response reachable through rel=next
  # links until the page budget is used up, no unvisited next link exists, or a
  # follow-up request could not be made.
  #
  # @yield [RequestService::Response] each page response
  # @return [Enumerator] enumerator when no block is given
  def each
    return to_enum(:each) unless block_given?

    page = initial_response
    yield page

    # The initial page was already yielded, so one fewer follow-up is allowed.
    remaining_follow_ups = session.effective_page_budget(max_pages) - 1
    remaining_follow_ups.times do
      candidate_url = next_page_url(page)
      break unless follow_up_allowed?(candidate_url)

      page = fetch_follow_up_response_or_stop(candidate_url, page.url)
      break unless page

      yield page
    end
  end

  private

  attr_reader :session, :initial_response, :max_pages, :logger

  # Resolves the href of the first <link>/<a> carrying rel=next against the page url.
  # Returns nil when the page has no such element or its href is empty.
  def next_page_url(page_response)
    next_link = page_response.parsed_body.at_css('link[rel~="next"][href], a[rel~="next"][href]')
    href = next_link && next_link['href']
    return nil if href.nil? || href.empty?

    Html2rss::Url.from_relative(href, page_response.url)
  end

  # A follow-up is only made for a present url the session has not visited yet.
  def follow_up_allowed?(next_url)
    return next_url unless next_url

    !session.visited?(next_url)
  end

  # Requests the next page through the session. An exhausted request budget is
  # logged as a warning and turned into nil so that iteration stops.
  def fetch_follow_up_response_or_stop(next_url, origin_url)
    session.follow_up(url: next_url, relation: :pagination, origin_url:)
  rescue RequestService::RequestBudgetExceeded => error
    stop_reason = "#{self.class}: pagination stopped at #{next_url} - #{error.message}. "
    retry_hint = "Retry with --max-requests #{session.max_requests + 1} or increase request.max_requests in the config."
    logger.warn(stop_reason + retry_hint)
    nil
  end
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Carries the runtime request inputs needed to build a RequestSession.
|
|
7
|
+
class RuntimeInput
  class << self
    ##
    # Reads url, headers, request options and strategy from the config and
    # pairs them with the policy built by RuntimePolicy.
    #
    # @param config [Html2rss::Config] validated feed config
    # @return [RuntimeInput] runtime request inputs derived from the config
    def from_config(config)
      copied = %i[url headers request strategy].to_h { |name| [name, config.public_send(name)] }

      new(**copied, request_policy: RuntimePolicy.from_config(config))
    end
  end

  ##
  # The url is converted with Html2rss::Url.from_absolute; headers and request
  # are frozen in place, and the instance itself is frozen afterwards.
  #
  # @param url [String, Html2rss::Url] initial request URL
  # @param headers [Hash] normalized request headers
  # @param request [Hash] validated request options for strategies
  # @param strategy [Symbol] request strategy to use for the session
  # @param request_policy [RequestService::Policy] request policy for the session
  def initialize(url:, headers:, request:, strategy:, request_policy:)
    @url = Html2rss::Url.from_absolute(url)
    @headers = headers.freeze
    @request = request.freeze
    @strategy = strategy
    @request_policy = request_policy

    freeze
  end

  ##
  # @!attribute [r] url
  #   @return [Html2rss::Url] initial request URL
  # @!attribute [r] headers
  #   @return [Hash] normalized request headers
  # @!attribute [r] request
  #   @return [Hash] validated request options for strategies
  # @!attribute [r] strategy
  #   @return [Symbol] request strategy to use for the session
  # @!attribute [r] request_policy
  #   @return [RequestService::Policy] policy derived from the runtime request inputs
  attr_reader :url, :headers, :request, :strategy, :request_policy
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Builds the runtime request policy for a feed run.
|
|
7
|
+
class RuntimePolicy
  class << self
    ##
    # @param config [Html2rss::Config] validated feed config
    # @return [Html2rss::RequestService::Policy] request policy derived from runtime config
    def from_config(config)
      RequestService::Policy.new(
        max_requests: effective_max_requests_for(config),
        max_redirects: config.max_redirects
      )
    end

    private

    # An explicitly configured max_requests is used as-is. Otherwise the larger
    # of the configured value and the computed baseline wins.
    def effective_max_requests_for(config)
      return config.max_requests if config.explicit_max_requests?

      [baseline_request_budget_for(config), config.max_requests].max
    end

    # One request for the initial page, plus the follow-ups that pagination,
    # the WordPress API scraper and Browserless preload actions are budgeted for.
    def baseline_request_budget_for(config)
      follow_up_budgets = [
        pagination_follow_up_budget_for(config),
        known_auto_source_follow_up_budget_for(config),
        browserless_preload_budget_for(config)
      ]

      1 + follow_up_budgets.sum
    end

    # Pages beyond the first one; never negative, 0 when pagination is not configured.
    def pagination_follow_up_budget_for(config)
      additional_pages = config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1

      additional_pages.positive? ? additional_pages : 0
    end

    # One extra request when the WordPress API scraper is enabled.
    def known_auto_source_follow_up_budget_for(config)
      return 0 unless config.auto_source&.dig(:scraper, :wordpress_api, :enabled)

      1
    end

    # Sum of the budgets for all configured Browserless preload actions.
    def browserless_preload_budget_for(config)
      preload = config.request.dig(:browserless, :preload)
      return 0 unless preload

      [
        top_level_preload_wait_budget(preload),
        click_selector_preload_budget(preload),
        scroll_preload_budget(preload)
      ].sum
    end

    # A top-level wait_after_ms is budgeted with two requests.
    def top_level_preload_wait_budget(preload)
      return 0 unless preload[:wait_after_ms]

      2
    end

    # Adds up the budget of every configured click selector.
    def click_selector_preload_budget(preload)
      preload.fetch(:click_selectors, []).sum { |click| preload_action_budget(click, :max_clicks) }
    end

    # Budget for the scroll_down action, 0 when it is not configured.
    def scroll_preload_budget(preload)
      scroll = preload[:scroll_down]

      scroll ? preload_action_budget(scroll, :iterations) : 0
    end

    # An action counts once per repetition (default 1); the count is doubled
    # when the action also sets wait_after_ms.
    def preload_action_budget(config, count_key)
      repetitions = config.fetch(count_key, 1)

      config[:wait_after_ms] ? repetitions * 2 : repetitions
    end
  end
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
|
|
5
|
+
# Coordinates multi-request feed builds on top of RequestService.
|
|
6
|
+
class RequestSession
  ##
  # Builds a request session from translated runtime request inputs.
  #
  # @param runtime_input [RuntimeInput] translated runtime request inputs
  # @param logger [Logger] logger used for operational warnings
  # @return [RequestSession] configured request session
  def self.from_runtime_input(runtime_input, logger: Html2rss::Log)
    initial_context = RequestService::Context.new(
      url: runtime_input.url,
      headers: runtime_input.headers,
      request: runtime_input.request,
      policy: runtime_input.request_policy
    )

    new(context: initial_context, strategy: runtime_input.strategy, logger:)
  end

  ##
  # @param context [RequestService::Context] initial request context
  # @param strategy [Symbol] request strategy to use for all requests in the session
  # @param logger [Logger] logger used for operational warnings
  def initialize(context:, strategy:, logger: Html2rss::Log)
    @visited_urls = Set.new
    @logger = logger
    @strategy = strategy
    @context = context
  end

  ##
  # Executes the initial request and records the response url as visited.
  #
  # @return [RequestService::Response] initial response
  def fetch_initial_response
    response = execute(context)
    remember!(response.url)
    response
  end

  ##
  # Executes a follow-up request derived from the session context and records
  # the response url as visited.
  #
  # @param url [String, Html2rss::Url] follow-up request url
  # @param relation [Symbol] why the follow-up is being made
  # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
  # @return [RequestService::Response] follow-up response
  def follow_up(url:, relation:, origin_url:)
    response = execute(context.follow_up(url:, relation:, origin_url:))
    remember!(response.url)
    response
  end

  ##
  # Caps the requested page budget at the policy's max_requests and logs a
  # warning whenever the cap changes the value.
  #
  # @param requested_pages [Integer] configured page budget
  # @return [Integer] effective page budget for the session
  def effective_page_budget(requested_pages)
    effective_pages = [requested_pages, context.policy.max_requests].min

    unless effective_pages == requested_pages
      logger.warn(
        "#{self.class}: pagination max_pages=#{requested_pages} " \
        "exceeds system ceiling=#{context.policy.max_requests}; " \
        "clamping to #{effective_pages}"
      )
    end

    effective_pages
  end

  ##
  # @return [Integer] maximum requests allowed for the feed build
  def max_requests = context.policy.max_requests

  ##
  # @param url [String, Html2rss::Url] url to query
  # @return [Boolean] whether the url was already visited in this session
  def visited?(url) = visited_urls.include?(normalize_url(url))

  ##
  # Records a visited url in the session.
  #
  # @param url [String, Html2rss::Url] url to track
  # @return [Set<Html2rss::Url>] visited urls
  def remember!(url) = visited_urls.add(normalize_url(url))

  private

  attr_reader :context, :strategy, :logger, :visited_urls

  # Runs the request with the session strategy and writes one debug line
  # describing the request and its response.
  def execute(request_context)
    response = RequestService.execute(request_context, strategy:)

    details = [
      "#{self.class}: relation=#{request_context.relation}",
      "request_url=#{request_context.url} final_url=#{response.url}",
      "status=#{response.status || 'unknown'} content_type=#{response.content_type.inspect}",
      "bytes=#{response.body.bytesize}"
    ]
    logger.debug(details.join(' '))

    response
  end

  # Urls are compared and stored in the form Html2rss::Url.from_absolute returns.
  def normalize_url(url) = Html2rss::Url.from_absolute(url)
end
|
|
118
|
+
end
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'zlib'
|
|
4
|
+
require 'sanitize'
|
|
5
|
+
require 'nokogiri'
|
|
6
|
+
|
|
7
|
+
module Html2rss
|
|
8
|
+
class RssBuilder
|
|
9
|
+
##
|
|
10
|
+
# Article is a simple data object representing an article extracted from a page.
|
|
11
|
+
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
|
12
|
+
class Article
|
|
13
|
+
include Enumerable
|
|
14
|
+
include Comparable
|
|
15
|
+
|
|
16
|
+
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
|
|
17
|
+
DEDUP_FINGERPRINT_SEPARATOR = '#!/'
|
|
18
|
+
|
|
19
|
+
# @param options [Hash<Symbol, String>]
|
|
20
|
+
def initialize(**options)
|
|
21
|
+
@to_h = {}
|
|
22
|
+
options.each_pair { |key, value| @to_h[key] = value.freeze if value }
|
|
23
|
+
@to_h.freeze
|
|
24
|
+
|
|
25
|
+
return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
|
|
26
|
+
|
|
27
|
+
Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Checks if the article is valid based on the presence of URL, ID, and either title or description.
|
|
31
|
+
# @return [Boolean] True if the article is valid, otherwise false.
|
|
32
|
+
def valid?
|
|
33
|
+
!url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
# @yield [key, value]
|
|
37
|
+
# @return [Enumerator] if no block is given
|
|
38
|
+
def each
|
|
39
|
+
return enum_for(:each) unless block_given?
|
|
40
|
+
|
|
41
|
+
PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def id = blank_string_to_nil(@to_h[:id])
|
|
45
|
+
|
|
46
|
+
def title = blank_string_to_nil(@to_h[:title])
|
|
47
|
+
|
|
48
|
+
def description
|
|
49
|
+
@description ||= Rendering::DescriptionBuilder.new(
|
|
50
|
+
base: @to_h[:description],
|
|
51
|
+
title:,
|
|
52
|
+
url:,
|
|
53
|
+
enclosures:,
|
|
54
|
+
image:
|
|
55
|
+
).call
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# @return [Url, nil]
|
|
59
|
+
def url
|
|
60
|
+
@url ||= Url.sanitize(@to_h[:url])
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
# @return [Url, nil]
|
|
64
|
+
def image
|
|
65
|
+
@image ||= Url.sanitize(@to_h[:image])
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# @return [String, nil]
|
|
69
|
+
def author = blank_string_to_nil(@to_h[:author])
|
|
70
|
+
|
|
71
|
+
# Generates a unique identifier based on the URL and ID using CRC32.
|
|
72
|
+
# @return [String]
|
|
73
|
+
def guid
|
|
74
|
+
@guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
##
|
|
78
|
+
# Returns a deterministic fingerprint used to detect duplicate articles.
|
|
79
|
+
#
|
|
80
|
+
# @return [String, Integer]
|
|
81
|
+
def deduplication_fingerprint
|
|
82
|
+
dedup_from_url || dedup_from_id || dedup_from_guid || hash
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
def enclosures
|
|
86
|
+
@enclosures ||= Array(@to_h[:enclosures])
|
|
87
|
+
.map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# @return [Html2rss::RssBuilder::Enclosure, nil]
|
|
91
|
+
def enclosure
|
|
92
|
+
return @enclosure if defined?(@enclosure)
|
|
93
|
+
|
|
94
|
+
case (object = @to_h[:enclosures]&.first)
|
|
95
|
+
when Hash
|
|
96
|
+
@enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
|
|
97
|
+
when nil
|
|
98
|
+
@enclosure = Html2rss::RssBuilder::Enclosure.new(url: image) if image
|
|
99
|
+
else
|
|
100
|
+
Log.warn "Article: unknown enclosure type: #{object.class}"
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def categories
|
|
105
|
+
@categories ||= @to_h[:categories].dup.to_a.tap do |categories|
|
|
106
|
+
categories.map! { |category| category.to_s.strip }
|
|
107
|
+
categories.reject!(&:empty?)
|
|
108
|
+
categories.uniq!
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
# Parses and returns the published_at time.
|
|
113
|
+
# @return [DateTime, nil]
|
|
114
|
+
def published_at
|
|
115
|
+
return if (string = @to_h[:published_at].to_s.strip).empty?
|
|
116
|
+
|
|
117
|
+
@published_at ||= DateTime.parse(string)
|
|
118
|
+
rescue ArgumentError
|
|
119
|
+
nil
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def scraper
|
|
123
|
+
@to_h[:scraper]
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def <=>(other)
|
|
127
|
+
return nil unless other.is_a?(Article)
|
|
128
|
+
|
|
129
|
+
0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
private
|
|
133
|
+
|
|
134
|
+
def dedup_from_url
|
|
135
|
+
return unless (value = url)
|
|
136
|
+
|
|
137
|
+
[value.to_s, id].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def dedup_from_id
|
|
141
|
+
return if id.to_s.empty?
|
|
142
|
+
|
|
143
|
+
id
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def dedup_from_guid
|
|
147
|
+
value = guid
|
|
148
|
+
return if value.to_s.empty?
|
|
149
|
+
|
|
150
|
+
[value, title, description].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
def fetch_guid
|
|
154
|
+
guid = @to_h[:guid].map { |s| s.to_s.strip }.reject(&:empty?).join if @to_h[:guid].is_a?(Array)
|
|
155
|
+
|
|
156
|
+
guid || [url, id].join('#!/')
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def blank_string_to_nil(value)
|
|
160
|
+
return if value.is_a?(String) && value.strip.empty?
|
|
161
|
+
|
|
162
|
+
value
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
end
|
|
@@ -1,20 +1,105 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
module Html2rss
|
|
4
|
-
|
|
4
|
+
class RssBuilder
|
|
5
5
|
##
|
|
6
|
-
#
|
|
6
|
+
# Extracts channel information from
|
|
7
|
+
# 1. the HTML document's <head>.
|
|
8
|
+
# 2. the HTTP response
|
|
7
9
|
class Channel
  DEFAULT_TTL_IN_MINUTES = 360
  DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'

  ##
  #
  # @param response [Html2rss::RequestService::Response]
  # @param overrides [Hash<Symbol, Object>] optional values that take precedence over
  #   extracted ones (read for :title, :description, :ttl, :language, :author, :image)
  def initialize(response, overrides: {})
    @response = response
    @overrides = overrides
  end

  # @return [String] override, else the document title, else a title derived from the url
  def title
    @title ||= fetch_title
  end

  # @return [Html2rss::Url] the response url
  def url = @url ||= Html2rss::Url.from_absolute(@response.url)

  # @return [String] override, else the meta description of an HTML response,
  #   else DEFAULT_DESCRIPTION_TEMPLATE filled with the url
  def description
    override = overrides[:description]
    return override unless override.to_s.empty?

    from_meta = parsed_body.at_css('meta[name="description"]')&.[]('content') if html_response?
    return format(DEFAULT_DESCRIPTION_TEMPLATE, url:) if from_meta.to_s.empty?

    from_meta
  end

  # @return [Integer] override, else the Cache-Control max-age converted from
  #   seconds to minutes (rounded up), else DEFAULT_TTL_IN_MINUTES
  def ttl
    return overrides[:ttl] if overrides[:ttl]

    if (max_age = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1))
      return max_age.to_i.fdiv(60).ceil
    end

    DEFAULT_TTL_IN_MINUTES
  end

  # @return [String, nil] override, else the first two lowercase letters of the
  #   Content-Language header, else the lang attribute found in an HTML response
  def language
    return overrides[:language] if overrides[:language]

    if (language_code = headers['content-language']&.match(/^([a-z]{2})/))
      return language_code[0]
    end

    return unless html_response?

    parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
  end

  # @return [String, nil] override, else the meta author of an HTML response
  def author
    return overrides[:author] if overrides[:author]

    return unless html_response?

    parsed_body.at_css('meta[name="author"]')&.[]('content')
  end

  # @return [Object] the Last-Modified header value as provided by the response
  #   headers, or the current Time when the header is absent
  def last_build_date = headers['last-modified'] || Time.now

  # @return [Object, nil] override, else the sanitized og:image url of an HTML response
  def image
    return overrides[:image] if overrides[:image]

    return unless html_response?

    if (image_url = parsed_body.at_css('meta[property="og:image"]')&.[]('content'))
      Url.sanitize(image_url)
    end
  end

  private

  attr_reader :overrides

  def parsed_body = @parsed_body ||= @response.parsed_body
  def headers = @headers ||= @response.headers

  # Asks the response once; `||=` would ask again after every false answer.
  def html_response?
    return @html_response if defined?(@html_response)

    @html_response = @response.html_response?
  end

  def fetch_title
    overrides[:title] || parsed_title || url.channel_titleized
  end

  # The <title> of an HTML response with whitespace runs collapsed.
  # Returns nil when nothing but whitespace remains, so that the caller
  # falls back to the url-derived title instead of using an empty one.
  def parsed_title
    return unless html_response?

    title = parsed_body.at_css('head > title')&.text.to_s.gsub(/\s+/, ' ').strip
    title unless title.empty?
  end
end
|
|
20
105
|
end
|