html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -6,13 +6,22 @@ require 'forwardable'
6
6
  module Html2rss
7
7
  ##
8
8
  # Requests website URLs to retrieve their HTML for further processing.
9
- # Provides strategies, i.e. to integrate Browserless.io.
9
+ # Provides strategies, e.g. integrating Browserless.io.
10
10
  class RequestService
11
11
  include Singleton
12
12
 
13
13
  class UnknownStrategy < Html2rss::Error; end
14
14
  class InvalidUrl < Html2rss::Error; end
15
15
  class UnsupportedUrlScheme < Html2rss::Error; end
16
+ class UnsupportedResponseContentType < Html2rss::Error; end
17
+ class RequestBudgetExceeded < Html2rss::Error; end
18
+ class PrivateNetworkDenied < Html2rss::Error; end
19
+ class CrossOriginFollowUpDenied < Html2rss::Error; end
20
+ class ResponseTooLarge < Html2rss::Error; end
21
+ class BlockedSurfaceDetected < Html2rss::Error; end
22
+ class RequestTimedOut < Html2rss::Error; end
23
+ class BrowserlessConfigurationError < Html2rss::Error; end
24
+ class BrowserlessConnectionFailed < Html2rss::Error; end
16
25
 
17
26
  class << self
18
27
  extend Forwardable
@@ -55,9 +64,12 @@ module Html2rss
55
64
  ##
56
65
  # Registers a new strategy.
57
66
  # @param name [Symbol] the name of the strategy
58
- # @param strategy_class [Class] the class of the strategy
67
+ # @param strategy_class [Class] the class implementing the strategy
68
+ # @raise [ArgumentError] if strategy_class is not a Class
59
69
  def register_strategy(name, strategy_class)
60
- raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
70
+ unless strategy_class.is_a?(Class)
71
+ raise ArgumentError, "Expected a Class for strategy, got #{strategy_class.class}"
72
+ end
61
73
 
62
74
  @strategies[name.to_sym] = strategy_class
63
75
  end
@@ -65,7 +77,7 @@ module Html2rss
65
77
  ##
66
78
  # Checks if a strategy is registered.
67
79
  # @param name [Symbol] the name of the strategy
68
- # @return [Boolean] true if the strategy is registered, false otherwise
80
+ # @return [Boolean] true if the strategy is registered, false otherwise.
69
81
  def strategy_registered?(name)
70
82
  @strategies.key?(name.to_sym)
71
83
  end
@@ -73,24 +85,28 @@ module Html2rss
73
85
  ##
74
86
  # Unregisters a strategy.
75
87
  # @param name [Symbol] the name of the strategy
76
- # @return [Boolean] true if the strategy was unregistered, false otherwise
77
- def unregister_strategy(name)
78
- raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
88
+ # @return [Boolean] true if the strategy was unregistered, false otherwise.
89
+ # @raise [ArgumentError] if attempting to unregister the default strategy.
90
+ def unregister_strategy(name) # rubocop:disable Naming/PredicateMethod
91
+ name_sym = name.to_sym
92
+ raise ArgumentError, 'Cannot unregister the default strategy.' if name_sym == @default_strategy_name
79
93
 
80
- !!@strategies.delete(name.to_sym)
94
+ !!@strategies.delete(name_sym)
81
95
  end
82
96
 
83
97
  ##
84
- # Executes the request.
85
- # @param ctx [Context] the context for the request
86
- # @param strategy [Symbol] the strategy to use
87
- # @return [Response] the response from the strategy
88
- # @raise [UnknownStrategy] if the strategy is not known
98
+ # Executes the request using the specified strategy.
99
+ # @param ctx [Context] the context for the request.
100
+ # @param strategy [Symbol] the strategy to use (defaults to the default strategy).
101
+ # @return [Response] the response from the executed strategy.
102
+ # @raise [ArgumentError] if the context is nil.
103
+ # @raise [UnknownStrategy] if the strategy is not registered.
89
104
  def execute(ctx, strategy: default_strategy_name)
90
- strategy_class = @strategies.fetch(strategy) do
105
+ strategy_class = @strategies.fetch(strategy.to_sym) do
91
106
  raise UnknownStrategy,
92
- "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
107
+ "The strategy '#{strategy}' is not known. Available strategies: #{strategy_names.join(', ')}"
93
108
  end
109
+
94
110
  strategy_class.new(ctx).execute
95
111
  end
96
112
  end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class RequestSession
    ##
    # Walks a chain of rel="next" links, starting from an already fetched page.
    #
    # Follow-up pages are requested through the owning RequestSession, so they
    # go through the same session object that tracks visited URLs and exposes
    # the request budget.
    class RelNextPager
      include Enumerable

      # CSS selector matching <link> and <a> elements that carry rel~="next" and an href.
      NEXT_LINK_SELECTOR = 'link[rel~="next"][href], a[rel~="next"][href]'
      private_constant :NEXT_LINK_SELECTOR

      ##
      # @param session [RequestSession] session that executes the follow-up requests
      # @param initial_response [RequestService::Response] response of the first page
      # @param max_pages [Integer] configured page budget; the initial page counts towards it
      # @param logger [Logger] receives the reason when pagination stops early
      def initialize(session:, initial_response:, max_pages:, logger: Html2rss::Log)
        @session = session
        @initial_response = initial_response
        @max_pages = max_pages
        @logger = logger
      end

      ##
      # Yields the initial response, then every page reachable via rel="next".
      #
      # Iteration ends when the page budget is used up, when a page has no
      # usable next link, when the next URL was already visited in this
      # session, or when the session's request budget is exceeded.
      #
      # @yield [RequestService::Response] each page response
      # @return [Enumerator] enumerator when no block is given
      def each
        return to_enum(:each) unless block_given?

        yield initial_response

        page = initial_response
        # The initial page already consumed one slot of the page budget.
        follow_up_budget = session.effective_page_budget(max_pages).pred
        follow_up_budget.times do
          candidate_url = next_page_url(page)
          break unless follow_up_allowed?(candidate_url)

          page = fetch_follow_up_response_or_stop(candidate_url, page.url)
          break unless page

          yield page
        end
      end

      private

      attr_reader :session, :initial_response, :max_pages, :logger

      # Resolves the first rel="next" href of the page against the page URL.
      # Returns nil when the page has no such link or its href is empty.
      def next_page_url(page_response)
        next_link = page_response.parsed_body.at_css(NEXT_LINK_SELECTOR)
        href = next_link&.[]('href')
        return nil if href.nil? || href.empty?

        Html2rss::Url.from_relative(href, page_response.url)
      end

      # A follow-up needs a next URL that the session has not visited yet.
      def follow_up_allowed?(next_url)
        return unless next_url

        !session.visited?(next_url)
      end

      # Requests the next page. An exceeded request budget is logged and
      # reported as nil so the caller stops paginating instead of failing.
      def fetch_follow_up_response_or_stop(next_url, origin_url)
        begin
          session.follow_up(url: next_url, relation: :pagination, origin_url:)
        rescue RequestService::RequestBudgetExceeded => error
          logger.warn(
            "#{self.class}: pagination stopped at #{next_url} - #{error.message}. " \
            "Retry with --max-requests #{session.max_requests + 1} or increase request.max_requests in the config."
          )
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class RequestSession
    ##
    # Immutable value object carrying the runtime request inputs needed to
    # build a RequestSession.
    class RuntimeInput
      ##
      # Builds the runtime inputs from a feed config, reading its url, headers,
      # request options and strategy, and deriving the request policy via
      # RuntimePolicy.from_config.
      #
      # @param config [Html2rss::Config] validated feed config
      # @return [RuntimeInput] runtime request inputs derived from the config
      def self.from_config(config)
        new(
          url: config.url,
          headers: config.headers,
          request: config.request,
          strategy: config.strategy,
          request_policy: RuntimePolicy.from_config(config)
        )
      end

      ##
      # The headers and request hashes are stored as frozen shallow copies, so
      # the objects handed in by the caller are left untouched (nested values
      # are shared, not copied). The instance itself is frozen afterwards.
      #
      # @param url [String, Html2rss::Url] initial request URL
      # @param headers [Hash] normalized request headers
      # @param request [Hash] validated request options for strategies
      # @param strategy [Symbol] request strategy to use for the session
      # @param request_policy [RequestService::Policy] request policy for the session
      def initialize(url:, headers:, request:, strategy:, request_policy:)
        @url = Html2rss::Url.from_absolute(url)
        # Copy before freezing: freezing the argument itself would make the
        # caller's hash immutable as a side effect of building this object.
        @headers = headers.dup.freeze
        @request = request.dup.freeze
        @strategy = strategy
        @request_policy = request_policy
        freeze
      end

      ##
      # @return [Html2rss::Url] initial request URL
      attr_reader :url

      ##
      # @return [Hash] frozen copy of the normalized request headers
      attr_reader :headers

      ##
      # @return [Hash] frozen copy of the validated request options for strategies
      attr_reader :request

      ##
      # @return [Symbol] request strategy to use for the session
      attr_reader :strategy

      ##
      # @return [RequestService::Policy] policy derived from the runtime request inputs
      attr_reader :request_policy
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class RequestSession
    ##
    # Derives the request policy (request and redirect limits) for one feed run.
    class RuntimePolicy
      class << self
        ##
        # @param config [Html2rss::Config] validated feed config
        # @return [Html2rss::RequestService::Policy] request policy derived from runtime config
        def from_config(config)
          max_requests = effective_max_requests_for(config)

          RequestService::Policy.new(max_requests:, max_redirects: config.max_redirects)
        end

        private

        # An explicitly configured limit is used as is. Otherwise the limit is
        # the larger of the configured value and the computed baseline.
        def effective_max_requests_for(config)
          if config.explicit_max_requests?
            config.max_requests
          else
            baseline = baseline_request_budget_for(config)
            [baseline, config.max_requests].max
          end
        end

        # Budget for the initial request plus the follow-ups that can be
        # predicted from the config: pagination, the WordPress API scraper
        # and Browserless preload actions.
        def baseline_request_budget_for(config)
          [
            1,
            pagination_follow_up_budget_for(config),
            known_auto_source_follow_up_budget_for(config),
            browserless_preload_budget_for(config)
          ].sum
        end

        # max_pages includes the initial page, so one less follow-up is needed;
        # never negative.
        def pagination_follow_up_budget_for(config)
          follow_ups = config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1
          [follow_ups, 0].max
        end

        # One extra request when the WordPress API scraper is enabled.
        def known_auto_source_follow_up_budget_for(config)
          return 1 if config.auto_source&.dig(:scraper, :wordpress_api, :enabled)

          0
        end

        # Sums the budgets of all configured Browserless preload steps.
        def browserless_preload_budget_for(config)
          preload = config.request.dig(:browserless, :preload)
          return 0 unless preload

          [
            top_level_preload_wait_budget(preload),
            click_selector_preload_budget(preload),
            scroll_preload_budget(preload)
          ].sum
        end

        # A top-level wait_after_ms is budgeted with two requests.
        def top_level_preload_wait_budget(preload)
          return 2 if preload[:wait_after_ms]

          0
        end

        # Each click selector entry is budgeted by its max_clicks.
        def click_selector_preload_budget(preload)
          click_selectors = preload.fetch(:click_selectors, [])
          click_selectors.sum { |click_selector| preload_action_budget(click_selector, :max_clicks) }
        end

        # Scrolling is budgeted by its iterations, when configured at all.
        def scroll_preload_budget(preload)
          scroll = preload[:scroll_down]
          scroll ? preload_action_budget(scroll, :iterations) : 0
        end

        # One unit per action (default: a single action), doubled when the
        # action is configured with wait_after_ms.
        def preload_action_budget(config, count_key)
          action_count = config.fetch(count_key, 1)
          wait_budget = 0
          wait_budget = action_count if config[:wait_after_ms]

          action_count + wait_budget
        end
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Coordinates the requests of one feed build on top of RequestService:
  # executes the initial and follow-up requests with one strategy and keeps
  # track of the URLs that were visited.
  class RequestSession
    class << self
      ##
      # Builds a request session from translated runtime request inputs.
      #
      # @param runtime_input [RuntimeInput] translated runtime request inputs
      # @param logger [Logger] logger used for operational warnings
      # @return [RequestSession] configured request session
      def from_runtime_input(runtime_input, logger: Html2rss::Log)
        initial_context = RequestService::Context.new(
          url: runtime_input.url,
          headers: runtime_input.headers,
          request: runtime_input.request,
          policy: runtime_input.request_policy
        )

        new(context: initial_context, strategy: runtime_input.strategy, logger:)
      end
    end

    ##
    # @param context [RequestService::Context] initial request context
    # @param strategy [Symbol] request strategy to use for all requests in the session
    # @param logger [Logger] logger used for operational warnings
    def initialize(context:, strategy:, logger: Html2rss::Log)
      @context = context
      @strategy = strategy
      @logger = logger
      @visited_urls = Set.new
    end

    ##
    # Executes the initial request and records the response URL as visited.
    #
    # @return [RequestService::Response] initial response
    def fetch_initial_response
      response = execute(context)
      remember!(response.url)
      response
    end

    ##
    # Executes a follow-up request derived from the session context and
    # records the response URL as visited.
    #
    # @param url [String, Html2rss::Url] follow-up request url
    # @param relation [Symbol] why the follow-up is being made
    # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
    # @return [RequestService::Response] follow-up response
    def follow_up(url:, relation:, origin_url:)
      follow_up_context = context.follow_up(url:, relation:, origin_url:)
      response = execute(follow_up_context)
      remember!(response.url)
      response
    end

    ##
    # Limits the requested page budget to the policy's max_requests and logs
    # a warning whenever the requested value had to be reduced.
    #
    # @param requested_pages [Integer] configured page budget
    # @return [Integer] effective page budget for the session
    def effective_page_budget(requested_pages)
      ceiling = context.policy.max_requests
      effective_pages = [requested_pages, ceiling].min

      unless effective_pages == requested_pages
        logger.warn(
          "#{self.class}: pagination max_pages=#{requested_pages} " \
          "exceeds system ceiling=#{ceiling}; " \
          "clamping to #{effective_pages}"
        )
      end

      effective_pages
    end

    ##
    # Returns the configured request budget for the session.
    #
    # @return [Integer] maximum requests allowed for the feed build
    def max_requests = context.policy.max_requests

    ##
    # @param url [String, Html2rss::Url] url to query
    # @return [Boolean] whether the url was already visited in this session
    def visited?(url)
      visited_urls.include?(normalize_url(url))
    end

    ##
    # Records a visited url in the session.
    #
    # @param url [String, Html2rss::Url] url to track
    # @return [Set<Html2rss::Url>] visited urls
    def remember!(url)
      visited_urls.add(normalize_url(url))
    end

    private

    attr_reader :context, :strategy, :logger, :visited_urls

    # Runs the request with the session's strategy and writes one debug line
    # describing the request and its response.
    def execute(request_context)
      response = RequestService.execute(request_context, strategy:)
      logger.debug(
        "#{self.class}: relation=#{request_context.relation} " \
        "request_url=#{request_context.url} final_url=#{response.url} " \
        "status=#{response.status || 'unknown'} content_type=#{response.content_type.inspect} " \
        "bytes=#{response.body.bytesize}"
      )
      response
    end

    # Visited URLs are stored and looked up in the form produced by
    # Html2rss::Url.from_absolute.
    def normalize_url(url) = Html2rss::Url.from_absolute(url)
  end
end
@@ -0,0 +1,166 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zlib'
4
+ require 'sanitize'
5
+ require 'nokogiri'
6
+
7
+ module Html2rss
8
+ class RssBuilder
9
+ ##
10
+ # Article is a simple data object representing an article extracted from a page.
11
+ # It is enumerable and responds to all keys specified in PROVIDED_KEYS.
12
+ class Article
13
+ include Enumerable
14
+ include Comparable
15
+
16
+ PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
17
+ DEDUP_FINGERPRINT_SEPARATOR = '#!/'
18
+
19
+ # @param options [Hash<Symbol, String>]
20
+ def initialize(**options)
21
+ @to_h = {}
22
+ options.each_pair { |key, value| @to_h[key] = value.freeze if value }
23
+ @to_h.freeze
24
+
25
+ return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
26
+
27
+ Log.warn "Article: unknown keys found: #{unknown_keys.join(', ')}"
28
+ end
29
+
30
+ # Checks if the article is valid based on the presence of URL, ID, and either title or description.
31
+ # @return [Boolean] True if the article is valid, otherwise false.
32
+ def valid?
33
+ !url.to_s.empty? && (!title.to_s.empty? || !description.to_s.empty?) && !id.to_s.empty?
34
+ end
35
+
36
+ # @yield [key, value]
37
+ # @return [Enumerator] if no block is given
38
+ def each
39
+ return enum_for(:each) unless block_given?
40
+
41
+ PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
42
+ end
43
+
44
+ def id = blank_string_to_nil(@to_h[:id])
45
+
46
+ def title = blank_string_to_nil(@to_h[:title])
47
+
48
+ def description
49
+ @description ||= Rendering::DescriptionBuilder.new(
50
+ base: @to_h[:description],
51
+ title:,
52
+ url:,
53
+ enclosures:,
54
+ image:
55
+ ).call
56
+ end
57
+
58
+ # @return [Url, nil]
59
+ def url
60
+ @url ||= Url.sanitize(@to_h[:url])
61
+ end
62
+
63
+ # @return [Url, nil]
64
+ def image
65
+ @image ||= Url.sanitize(@to_h[:image])
66
+ end
67
+
68
+ # @return [String, nil]
69
+ def author = blank_string_to_nil(@to_h[:author])
70
+
71
+ # Generates a unique identifier based on the URL and ID using CRC32.
72
+ # @return [String]
73
+ def guid
74
+ @guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
75
+ end
76
+
77
+ ##
78
+ # Returns a deterministic fingerprint used to detect duplicate articles.
79
+ #
80
+ # @return [String, Integer]
81
+ def deduplication_fingerprint
82
+ dedup_from_url || dedup_from_id || dedup_from_guid || hash
83
+ end
84
+
85
+ def enclosures
86
+ @enclosures ||= Array(@to_h[:enclosures])
87
+ .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
88
+ end
89
+
90
+ # @return [Html2rss::RssBuilder::Enclosure, nil]
91
+ def enclosure
92
+ return @enclosure if defined?(@enclosure)
93
+
94
+ case (object = @to_h[:enclosures]&.first)
95
+ when Hash
96
+ @enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
97
+ when nil
98
+ @enclosure = Html2rss::RssBuilder::Enclosure.new(url: image) if image
99
+ else
100
+ Log.warn "Article: unknown enclosure type: #{object.class}"
101
+ end
102
+ end
103
+
104
+ def categories
105
+ @categories ||= @to_h[:categories].dup.to_a.tap do |categories|
106
+ categories.map! { |category| category.to_s.strip }
107
+ categories.reject!(&:empty?)
108
+ categories.uniq!
109
+ end
110
+ end
111
+
112
+ # Parses and returns the published_at time.
113
+ # @return [DateTime, nil]
114
+ def published_at
115
+ return if (string = @to_h[:published_at].to_s.strip).empty?
116
+
117
+ @published_at ||= DateTime.parse(string)
118
+ rescue ArgumentError
119
+ nil
120
+ end
121
+
122
+ def scraper
123
+ @to_h[:scraper]
124
+ end
125
+
126
+ def <=>(other)
127
+ return nil unless other.is_a?(Article)
128
+
129
+ 0 if other.all? { |key, value| value == public_send(key) ? public_send(key) <=> value : false }
130
+ end
131
+
132
+ private
133
+
134
+ def dedup_from_url
135
+ return unless (value = url)
136
+
137
+ [value.to_s, id].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
138
+ end
139
+
140
+ def dedup_from_id
141
+ return if id.to_s.empty?
142
+
143
+ id
144
+ end
145
+
146
+ def dedup_from_guid
147
+ value = guid
148
+ return if value.to_s.empty?
149
+
150
+ [value, title, description].compact.join(DEDUP_FINGERPRINT_SEPARATOR)
151
+ end
152
+
153
+ def fetch_guid
154
+ guid = @to_h[:guid].map { |s| s.to_s.strip }.reject(&:empty?).join if @to_h[:guid].is_a?(Array)
155
+
156
+ guid || [url, id].join('#!/')
157
+ end
158
+
159
+ def blank_string_to_nil(value)
160
+ return if value.is_a?(String) && value.strip.empty?
161
+
162
+ value
163
+ end
164
+ end
165
+ end
166
+ end
@@ -1,20 +1,105 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Html2rss
4
- module RssBuilder
4
+ class RssBuilder
5
5
  ##
6
- # Builds the <channel> tag (with the provided maker).
6
+ # Extracts channel information from
7
+ # 1. the HTML document's <head>.
8
+ # 2. the HTTP response
7
9
  class Channel
10
+ DEFAULT_TTL_IN_MINUTES = 360
11
+ DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'
12
+
8
13
  ##
9
- # @param maker [RSS::Maker::RSS20::Channel]
10
- # @param config [Html2rss::Config]
11
- # @param tags [Set<Symbol>]
12
- # @return nil
13
- def self.add(maker, config, tags)
14
- tags.each { |tag| maker.public_send(:"#{tag}=", config.public_send(tag)) }
15
-
16
- maker.generator = "html2rss V. #{::Html2rss::VERSION}"
17
- maker.lastBuildDate = Time.now
14
+ #
15
+ # @param response [Html2rss::RequestService::Response]
16
+ # @param overrides [Hash<Symbol, String>] - Optional, overrides for any channel attribute
17
+ def initialize(response, overrides: {})
18
+ @response = response
19
+ @overrides = overrides
20
+ end
21
+
22
+ def title
23
+ @title ||= fetch_title
24
+ end
25
+
26
+ def url = @url ||= Html2rss::Url.from_absolute(@response.url)
27
+
28
+ def description
29
+ return overrides[:description] unless overrides[:description].to_s.empty?
30
+
31
+ description = parsed_body.at_css('meta[name="description"]')&.[]('content') if html_response?
32
+
33
+ return format(DEFAULT_DESCRIPTION_TEMPLATE, url:) if description.to_s.empty?
34
+
35
+ description
36
+ end
37
+
38
+ def ttl
39
+ return overrides[:ttl] if overrides[:ttl]
40
+
41
+ if (ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1))
42
+ return ttl.to_i.fdiv(60).ceil
43
+ end
44
+
45
+ DEFAULT_TTL_IN_MINUTES
46
+ end
47
+
48
+ def language
49
+ return overrides[:language] if overrides[:language]
50
+
51
+ if (language_code = headers['content-language']&.match(/^([a-z]{2})/))
52
+ return language_code[0]
53
+ end
54
+
55
+ return unless html_response?
56
+
57
+ parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
58
+ end
59
+
60
+ def author
61
+ return overrides[:author] if overrides[:author]
62
+
63
+ return unless html_response?
64
+
65
+ parsed_body.at_css('meta[name="author"]')&.[]('content')
66
+ end
67
+
68
+ def last_build_date = headers['last-modified'] || Time.now
69
+
70
+ def image
71
+ return overrides[:image] if overrides[:image]
72
+
73
+ return unless html_response?
74
+
75
+ if (image_url = parsed_body.at_css('meta[property="og:image"]')&.[]('content'))
76
+ Url.sanitize(image_url)
77
+ end
78
+ end
79
+
80
+ private
81
+
82
+ attr_reader :overrides
83
+
84
+ def parsed_body = @parsed_body ||= @response.parsed_body
85
+ def headers = @headers ||= @response.headers
86
+ def html_response? = @html_response ||= @response.html_response?
87
+
88
+ def fetch_title
89
+ override_title = overrides[:title]
90
+ return override_title if override_title
91
+ return parsed_title if parsed_title
92
+
93
+ url.channel_titleized
94
+ end
95
+
96
+ def parsed_title
97
+ return unless html_response?
98
+
99
+ title = parsed_body.at_css('head > title')&.text.to_s
100
+ return if title.empty?
101
+
102
+ title.gsub(/\s+/, ' ').strip
18
103
  end
19
104
  end
20
105
  end