html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

##
# The Html2rss namespace.
module Html2rss
  ##
  # Coordinates feed generation pipeline stages.
  class FeedPipeline
    # Retries feed extraction across concrete request strategies for :auto mode.
    #
    # Each strategy in the chain is tried in order; a strategy "wins" as soon
    # as it yields at least one extracted article. Errors that indicate a
    # configuration or policy problem abort the chain immediately.
    class AutoFallback
      # Ordered list of concrete request strategies attempted by auto mode.
      CHAIN = %i[faraday botasaurus browserless].freeze

      # Error classes that should abort auto fallback immediately.
      NON_FALLBACK_ERRORS = [
        RequestService::UnknownStrategy,
        RequestService::InvalidUrl,
        RequestService::UnsupportedUrlScheme,
        RequestService::UnsupportedResponseContentType,
        RequestService::RequestBudgetExceeded,
        RequestService::PrivateNetworkDenied,
        RequestService::CrossOriginFollowUpDenied,
        RequestService::ResponseTooLarge,
        RequestService::BrowserlessConfigurationError
      ].freeze

      ##
      # @param strategies [Array<Symbol>] ordered concrete strategies for fallback
      # @param budget [RequestService::Budget] shared request budget across retries
      # @param session_for [Proc] request session factory proc
      # @param articles_for [Proc] article extraction proc
      # @return [void]
      def initialize(strategies:, budget:, session_for:, articles_for:)
        @strategies = strategies
        @budget = budget
        @session_for = session_for
        @articles_for = articles_for
      end

      ##
      # Tries each strategy in order and returns the first successful state.
      #
      # @return [Hash{Symbol => Object}] pipeline state containing :response and :articles
      # @raise [NoFeedItemsExtracted] when every strategy fails or yields zero items
      def call
        attempts = []

        strategies.each_with_index do |strategy, index|
          outcome = try_strategy(strategy:, fallback: strategies[index + 1], attempts:)
          return outcome if outcome
        end

        raise NoFeedItemsExtracted.new(attempts:)
      end

      private

      attr_reader :strategies, :budget, :session_for, :articles_for

      # Runs one strategy end-to-end; returns the success state or nil.
      def try_strategy(strategy:, fallback:, attempts:)
        session = session_for.call(strategy:, budget:)
        response = fetch(session:, strategy:, fallback:, attempts:)
        return nil unless response

        evaluate(response:, session:, strategy:, fallback:, attempts:)
      end

      # Fetches the initial response, recording (and surviving) recoverable errors.
      # Non-fallback errors are re-raised to abort the whole chain.
      def fetch(session:, strategy:, fallback:, attempts:)
        session.fetch_initial_response
      rescue *NON_FALLBACK_ERRORS
        raise
      rescue StandardError => error
        attempts << { strategy:, items_count: nil, error_class: error.class.name }
        Log.warn("#{self.class}: auto fallback #{strategy} -> #{fallback} after error=#{error.class}") if fallback
        Log.debug("#{self.class}: strategy=#{strategy} error=#{error.class}: #{error.message}")
        nil
      end

      # Extracts articles from the response; returns the success state when at
      # least one item was found, otherwise nil (triggering the next strategy).
      def evaluate(response:, session:, strategy:, fallback:, attempts:)
        articles = articles_for.call(response:, request_session: session)
        attempts << { strategy:, items_count: articles.size, error_class: nil }
        Log.debug("#{self.class}: strategy=#{strategy} items=#{articles.size}")

        if articles.size.positive?
          if attempts.size > 1
            Log.info("#{self.class}: auto selected strategy=#{strategy} after attempts=#{attempts.size}")
          end
          { response:, articles: }
        else
          Log.info("#{self.class}: auto fallback #{strategy} -> #{fallback} after zero extracted items") if fallback
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

module Html2rss
  ##
  # Builds feeds from validated config through request, extraction, and rendering stages.
  class FeedPipeline
    ##
    # @param raw_config [Hash{Symbol => Object}] user-provided feed config
    def initialize(raw_config)
      @raw_config = raw_config
    end

    ##
    # @return [RSS::Rss] generated RSS feed
    def to_rss
      run do |response:, config:, articles:|
        RssBuilder.new(channel: build_channel(response, config),
                       articles:,
                       stylesheets: config.stylesheets).call
      end
    end

    ##
    # @return [Hash] generated JSONFeed 1.1 payload
    def to_json_feed
      run do |response:, config:, articles:|
        JsonFeedBuilder.new(channel: build_channel(response, config), articles:).call
      end
    end

    private

    attr_reader :raw_config

    # Builds the channel metadata object shared by RSS and JSON feed rendering.
    def build_channel(response, config)
      RssBuilder::Channel.new(response, overrides: config.channel)
    end

    # Validates the raw config, runs the pipeline, and yields its final state.
    def run
      config = Config.from_hash(raw_config, params: raw_config[:params])
      state = pipeline_state_for(config)
      yield response: state.fetch(:response), config:, articles: state.fetch(:articles)
    end

    # Dispatches to the auto-fallback pipeline or to one concrete strategy.
    def pipeline_state_for(config)
      return run_auto_pipeline(config) if config.strategy == :auto

      run_pipeline_for_strategy(config, strategy: config.strategy)
    end

    def run_pipeline_for_strategy(config, strategy:, budget: nil)
      session = request_session_for(config, strategy:, budget:)
      response = session.fetch_initial_response
      { response:, articles: deduplicated_articles(response:, config:, request_session: session) }
    end

    def request_session_for(config, strategy:, budget: nil)
      RequestSession.from_runtime_input(runtime_input_for(config, strategy:), budget:)
    end

    def runtime_input_for(config, strategy:)
      RequestSession::RuntimeInput.new(
        url: config.url,
        headers: config.headers,
        request: config.request,
        strategy:,
        request_policy: RequestSession::RuntimePolicy.from_config(config)
      )
    end

    # Removes duplicate articles gathered from selectors and auto-source scrapers.
    def deduplicated_articles(response:, config:, request_session:)
      articles = collect_articles(response:, config:, request_session:)
      Articles::Deduplicator.new(articles).call
    end

    def run_auto_pipeline(config)
      auto_fallback_for(config).call
    end

    # Wires the strategy chain, shared budget, and factory procs into AutoFallback.
    def auto_fallback_for(config)
      AutoFallback.new(
        strategies: AutoFallback::CHAIN,
        budget: auto_pipeline_budget(config),
        session_for: ->(strategy:, budget:) { request_session_for(config, strategy:, budget:) },
        articles_for: ->(response:, request_session:) { deduplicated_articles(response:, config:, request_session:) }
      )
    end

    def auto_pipeline_budget(config)
      RequestService::Budget.new(max_requests: RequestSession::RuntimePolicy.from_config(config).max_requests)
    end

    def collect_articles(response:, config:, request_session:)
      selector_articles(response:, config:, request_session:) +
        auto_source_articles(response:, config:, request_session:)
    end

    # Extracts articles via configured CSS selectors, following rel=next
    # pagination when a max_pages limit is configured.
    def selector_articles(response:, config:, request_session:)
      selectors = config.selectors
      return [] unless selectors

      paged_responses(response:, selectors:, request_session:).flat_map do |page|
        Selectors.new(page, selectors:, time_zone: config.time_zone).articles
      end
    end

    def paged_responses(response:, selectors:, request_session:)
      max_pages = selectors.dig(:items, :pagination, :max_pages)
      return [response] unless max_pages

      RequestSession::RelNextPager.new(session: request_session, initial_response: response, max_pages:).to_a
    end

    def auto_source_articles(response:, config:, request_session:)
      auto_source = config.auto_source
      return [] unless auto_source

      AutoSource.new(response, auto_source, request_session:).articles
    end
  end
end
@@ -0,0 +1,101 @@
1
# frozen_string_literal: true

module Html2rss
  # Shared helpers for hash normalization and structural operations.
  module HashUtil
    module_function

    # Deeply duplicates nested arrays and hashes.
    #
    # @param object [Object] nested value from configuration or runtime state
    # @return [Object] deep duplicated object
    def deep_dup(object)
      case object
      in Hash
        object.transform_values { deep_dup(_1) }
      in Array
        object.map { deep_dup(_1) }
      else
        begin
          object.dup
        rescue StandardError
          # Un-dup-able objects (e.g. singleton classes raise TypeError) are
          # returned as-is. BUGFIX: the previous `object.dup rescue StandardError`
          # modifier returned the StandardError *class* instead of the object.
          object
        end
      end
    end

    # Deeply merges nested hashes while replacing non-hash values from override.
    #
    # @param base [Hash] base hash
    # @param override [Hash] override hash
    # @return [Hash] merged hash
    def deep_merge(base, override)
      base.merge(override) do |_key, old_val, new_val|
        case [old_val, new_val]
        in [Hash, Hash]
          deep_merge(old_val, new_val)
        else
          new_val
        end
      end
    end

    # Converts string-keyed hashes to symbol-keyed hashes recursively.
    #
    # @param object [Object] value to normalize
    # @param context [String] error context used in raised messages
    # @return [Object] normalized value
    # @raise [ArgumentError] when a key is neither a String nor a Symbol
    def deep_symbolize_keys(object, context: 'hash')
      case object
      in Hash
        object.each_with_object({}) do |(k, v), memo|
          memo[symbol_key(k, context:)] = deep_symbolize_keys(v, context:)
        end
      in Array
        object.map { deep_symbolize_keys(_1, context:) }
      else
        object
      end
    end

    # Validates that hash keys are symbols.
    #
    # @param value [Object] candidate hash container whose keys must be symbols
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] when a non-symbol key is found
    def assert_symbol_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(Symbol)
        invalid_key = value.keys.find { _1.class != Symbol }
        raise ArgumentError, "#{context} must use symbol keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_symbol_keys!(_1, context:, deep:) } if deep
    end

    # Validates that hash keys are strings.
    #
    # @param value [Object] candidate hash container whose keys must be strings
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] when a non-string key is found
    def assert_string_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(String)
        invalid_key = value.keys.find { _1.class != String }
        raise ArgumentError, "#{context} must use string keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_string_keys!(_1, context:, deep:) } if deep
    end

    # Normalizes a single hash key to a Symbol.
    #
    # @param key [String, Symbol] key to normalize
    # @param context [String] error context
    # @return [Symbol]
    # @raise [ArgumentError] for any other key class
    def symbol_key(key, context:)
      case key
      in Symbol then key
      in String then key.to_sym
      else
        raise ArgumentError, "#{context} must use string or symbol keys (found #{key.inspect})"
      end
    end
    private_class_method :symbol_key
  end
end
@@ -0,0 +1,20 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    # Extracts the earliest date from an article_tag.
    class DateExtractor
      # Scans all descendants carrying a [datetime] attribute and returns the
      # earliest value that parses; unparseable values are ignored.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [DateTime, nil]
      def self.call(article_tag)
        article_tag
          .css('[datetime]')
          .filter_map { |tag| parse_datetime(tag['datetime']) }
          .min
      end

      # @param value [String, nil] raw datetime attribute value
      # @return [DateTime, nil] parsed datetime, or nil on bad/missing input
      def self.parse_datetime(value)
        DateTime.parse(value)
      rescue ArgumentError, TypeError
        nil
      end
      private_class_method :parse_datetime
    end
  end
end
@@ -0,0 +1,120 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # Extracts enclosures from HTML tags using various strategies.
    class EnclosureExtractor
      # Runs every extraction strategy and concatenates their results.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @param base_url [String, Html2rss::Url] base URL for relative enclosure links
      # @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
      def self.call(article_tag, base_url)
        [
          Extractors::Image,
          Extractors::Media,
          Extractors::Pdf,
          Extractors::Iframe,
          Extractors::Archive
        ].flat_map { |strategy| strategy.call(article_tag, base_url:) }
      end
    end

    # Extraction strategies for enclosure-like media/link tags.
    module Extractors
      # Extracts image enclosures from HTML tags.
      # Finds all image sources and returns them in a format suitable for RSS.
      class Image
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative image sources
        # @return [Array<Hash{Symbol => Object}>] image enclosure hashes
        def self.call(article_tag, base_url:)
          # Skips data: URIs via the CSS selector; empty src values are dropped below.
          article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
            src = img['src'].to_s
            next if src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
            }
          end
        end
      end

      # Extracts media enclosures (video/audio) from HTML tags.
      class Media
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative media sources
        # @return [Array<Hash{Symbol => Object}>] media enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
            src = element['src'].to_s
            next if src.empty?

            {
              url: Url.from_relative(src, base_url),
              # NOTE(review): :type is nil when the tag lacks a type attribute —
              # confirm downstream enclosure building tolerates nil types.
              type: element['type']
            }
          end
        end
      end

      # Extracts PDF enclosures from HTML tags.
      class Pdf
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative PDF links
        # @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('a[href$=".pdf"]').filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            abs_url = Url.from_relative(href, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
            }
          end
        end
      end

      # Extracts iframe enclosures from HTML tags.
      class Iframe
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative iframe links
        # @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('iframe[src]').filter_map do |iframe|
            src = iframe['src']
            next if src.nil? || src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
            }
          end
        end
      end

      # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
      class Archive
        # Maps archive link suffixes to their IANA media types.
        # BUGFIX: previously every archive (including .tar.gz/.tgz) was labeled
        # 'application/zip'; gzip tarballs are 'application/gzip' (RFC 6713).
        # Longer suffixes are listed first so '.tar.gz' wins over a bare match.
        CONTENT_TYPES = {
          '.tar.gz' => 'application/gzip',
          '.tgz' => 'application/gzip',
          '.zip' => 'application/zip'
        }.freeze

        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative archive links
        # @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            {
              url: Url.from_relative(href, base_url),
              type: content_type_for(href)
            }
          end
        end

        # @param href [String] anchor href ending in a known archive suffix
        # @return [String] media type for the archive link
        def self.content_type_for(href)
          suffix = CONTENT_TYPES.keys.find { |ext| href.end_with?(ext) }
          CONTENT_TYPES.fetch(suffix, 'application/zip')
        end
        private_class_method :content_type_for
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # Image is responsible for extracting image URLs the article_tag.
    class ImageExtractor
      # Picks the best candidate image for the article container, preferring
      # srcset candidates, then plain <img src>, then CSS background images.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @param base_url [String, Html2rss::Url] base URL for relative image URLs
      # @return [Html2rss::Url, nil] best candidate image URL
      def self.call(article_tag, base_url:)
        candidate = from_source(article_tag) || from_img(article_tag) || from_style(article_tag)
        return unless candidate

        Url.from_relative(candidate, base_url)
      end

      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] src attribute from first matching image tag
      def self.from_img(article_tag)
        article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
      end

      ##
      # Extracts the largest image source from the srcset attribute
      # of an img tag or a source tag inside a picture tag.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] largest srcset URL candidate
      # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
      # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
      # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
      def self.from_source(article_tag)
        by_width = article_tag.css('img[srcset], picture > source[srcset]').flat_map do |node|
          node['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)[\s,]?/).filter_map do |url, descriptor|
            next if url.nil? || url.start_with?('data:')

            # descriptor is e.g. "480w"; String#to_i reads its leading digits.
            [descriptor.to_i, url.strip]
          end
        end.to_h

        # Duplicate widths keep the last occurrence; empty hash yields nil.
        by_width[by_width.keys.max]
      end

      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] best style-based background image URL
      def self.from_style(article_tag)
        urls = article_tag.css('[style*="url"]').filter_map do |tag|
          tag['style'][/url\(['"]?(.*?)['"]?\)/, 1]
        end
        urls.reject { |src| src.start_with?('data:') }.max_by(&:size)
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
# frozen_string_literal: true

module Html2rss
  ##
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
  # from an article_tag.
  class HtmlExtractor
    # Tags ignored when extracting visible text content from article containers.
    INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
    # Heading tags used to prioritize title extraction.
    HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
    # Selector used to derive non-headline description nodes.
    # NOTE(review): this is a frozen Array (":not(h1)"… plus bare tag names),
    # passed directly to Nokogiri's #css in extract_description below —
    # confirm Nokogiri accepts an Array argument there; the mix of :not()
    # pseudo-selectors and bare invisible-tag names also looks intentional
    # only if css treats multiple rules as a union.
    NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze

    # Anchor selector used to identify the canonical article link element.
    # Built once at load time: matches a[href] while excluding empty hrefs and
    # fragment/scheme prefixes that cannot be article links.
    MAIN_ANCHOR_SELECTOR = begin
      buf = +'a[href]:not([href=""])'
      %w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
        buf << %[:not([href^="#{prefix}"])]
      end
      buf.freeze
    end

    class << self
      ##
      # Extracts visible text from a given node and its children.
      #
      # Recurses depth-first; leaf nodes contribute their stripped #text,
      # non-leaf nodes their recursively extracted text. Blank fragments are
      # dropped, and the joined result has runs of spaces squeezed.
      #
      # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
      # @param separator [String] separator used to join text fragments (default is a space)
      # @return [String, nil] the concatenated visible text, or nil if none is found
      def extract_visible_text(tag, separator: ' ')
        parts = tag.children.filter_map do |child|
          next unless visible_child?(child)

          raw_text = child.children.empty? ? child.text : extract_visible_text(child)
          text = raw_text&.strip
          text unless text.to_s.empty?
        end

        parts.join(separator).squeeze(' ').strip unless parts.empty?
      end

      private

      # A child is visible unless it is a non-rendered tag (svg/script/…) or a
      # pure fragment link (<a href="#…">), which is skipped as navigation chrome.
      def visible_child?(node)
        !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
          !(node.name == 'a' && node['href']&.start_with?('#'))
      end
    end

    ##
    # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
    # @param base_url [String, Html2rss::Url] base url used to resolve relative links
    # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
    # @raise [ArgumentError] when article_tag is nil
    def initialize(article_tag, base_url:, selected_anchor:)
      raise ArgumentError, 'article_tag is required' unless article_tag

      @article_tag = article_tag
      @base_url = base_url
      @selected_anchor = selected_anchor
    end

    # Runs every extractor and collects the results into one attributes hash.
    #
    # @return [Hash{Symbol => Object}] extracted article attributes
    def call
      {
        title: extract_title,
        url: extract_url,
        image: extract_image,
        description: extract_description,
        id: generate_id,
        published_at: extract_published_at,
        enclosures: extract_enclosures,
        categories: extract_categories
      }
    end

    private

    attr_reader :article_tag, :base_url, :selected_anchor

    # NOTE(review): this second `class << self` block sits after `private`, but
    # `private` only affects instance methods — main_anchor_for remains a
    # public singleton method. Confirm that visibility is intended.
    class << self
      ##
      # @param article_tag [Nokogiri::XML::Node] article-like container to search within
      # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
      def main_anchor_for(article_tag)
        # The container itself may be the anchor.
        return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)

        article_tag.at_css(MAIN_ANCHOR_SELECTOR)
      end
    end

    # Resolves the selected anchor's href (fragment stripped) against base_url.
    # NOTE(review): `||=` does not cache a nil result — when the href is empty
    # this recomputes on every access; harmless but worth confirming.
    def extract_url
      @extract_url ||= begin
        href = selected_anchor&.[]('href').to_s

        Url.from_relative(href.split('#').first.strip, base_url) unless href.empty?
      end
    end

    # Prefers a heading's visible text; falls back to the anchor's own text.
    def extract_title
      title_source = heading || selected_anchor
      self.class.extract_visible_text(title_source) if title_source
    end

    # Picks the highest-ranked heading level present (h1 < h2 < … by string
    # comparison of tag names), then the longest-text heading at that level.
    def heading
      @heading ||= begin
        heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
        smallest_heading = heading_tags.keys.min
        if smallest_heading
          heading_tags[smallest_heading]&.max_by do |tag|
            # nil text ranks lowest via nil.to_i == 0.
            self.class.extract_visible_text(tag)&.size.to_i
          end
        end
      end
    end

    # Prefers text from non-headline nodes (joined with <br>); falls back to
    # the whole container's visible text.
    def extract_description
      text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
      return text if text && !text.empty?

      description = self.class.extract_visible_text(article_tag)
      return nil if description.nil? || description.strip.empty?

      description.strip
    end

    # First non-empty candidate among: the container's own id, the first
    # descendant id, the URL path, then the URL query string.
    def generate_id
      [
        article_tag['id'],
        article_tag.at_css('[id]')&.attr('id'),
        extract_url&.path,
        extract_url&.query
      ].compact.reject(&:empty?).first
    end

    def extract_image = ImageExtractor.call(article_tag, base_url:)
    def extract_published_at = DateExtractor.call(article_tag)
    def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
    def extract_categories = CategoryExtractor.call(article_tag)
  end
end