html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
data/exe/html2rss CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # frozen_string_literal: true
3
3
 
4
- require 'html2rss/cli'
4
+ require 'html2rss'
5
5
 
6
6
  Html2rss::CLI.start(ARGV)
data/html2rss.gemspec CHANGED
@@ -26,15 +26,18 @@ Gem::Specification.new do |spec|
26
26
  end
27
27
 
28
28
  spec.files = `git ls-files -z`.split("\x0").select do |f|
29
- f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
29
+ f.match(%r{^(lib/|exe/|schema/|README.md|LICENSE|html2rss.gemspec)})
30
30
  end
31
31
  spec.bindir = 'exe'
32
32
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
33
33
  spec.require_paths = ['lib']
34
34
 
35
35
  spec.add_dependency 'addressable', '~> 2.7'
36
+ spec.add_dependency 'brotli'
37
+ spec.add_dependency 'dry-validation'
36
38
  spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
37
39
  spec.add_dependency 'faraday-follow_redirects'
40
+ spec.add_dependency 'faraday-gzip', '~> 3'
38
41
  spec.add_dependency 'kramdown'
39
42
  spec.add_dependency 'mime-types', '> 3.0'
40
43
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
@@ -43,7 +46,7 @@ Gem::Specification.new do |spec|
43
46
  spec.add_dependency 'regexp_parser'
44
47
  spec.add_dependency 'reverse_markdown', '~> 3.0'
45
48
  spec.add_dependency 'rss'
46
- spec.add_dependency 'sanitize', '~> 6.0'
49
+ spec.add_dependency 'sanitize'
47
50
  spec.add_dependency 'thor'
48
51
  spec.add_dependency 'tzinfo'
49
52
  spec.add_dependency 'zeitwerk'
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set' # rubocop:disable Lint/RedundantRequireStatement
4
+
5
+ module Html2rss
6
+ module Articles
7
+ ##
8
+ # Deduplicates a list of articles while preserving their original order.
9
+ #
10
+ # The deduplicator prefers each article's URL (combined with its ID when
11
+ # available) to determine uniqueness. When no URL is present, it falls
12
+ # back to the article ID, then to the GUID enriched with title and
13
+ # description metadata. If none of these identifiers are available it
14
+ # defaults to the article object's hash to preserve the original entry.
15
+ class Deduplicator
16
+ ##
17
+ # @param articles [Array<Html2rss::RssBuilder::Article>]
18
+ # @raise [ArgumentError] if articles are not provided
19
+ def initialize(articles)
20
+ raise ArgumentError, 'articles must be provided' unless articles
21
+
22
+ @articles = articles
23
+ end
24
+
25
+ ##
26
+ # Returns the list of unique articles, preserving the order of the
27
+ # original collection and keeping the first occurrence of a duplicate.
28
+ # @return [Array<Html2rss::RssBuilder::Article>]
29
+ def call
30
+ seen = Set.new
31
+
32
+ articles.filter do |article|
33
+ fingerprint = deduplication_fingerprint_for(article) || article.hash
34
+ seen.add?(fingerprint)
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ attr_reader :articles
41
+
42
+ def deduplication_fingerprint_for(article)
43
+ return unless article.respond_to?(:deduplication_fingerprint)
44
+
45
+ article.deduplication_fingerprint
46
+ end
47
+ end
48
+ end
49
+ end
@@ -7,8 +7,15 @@ module Html2rss
7
7
  # :reek:MissingSafeMethod { enabled: false }
8
8
  # It applies various strategies to filter and refine the article list.
9
9
  class Cleanup
10
+ DEFAULT_CONFIG = {
11
+ keep_different_domain: false,
12
+ min_words_title: 3
13
+ }.freeze
14
+
15
+ VALID_SCHEMES = %w[http https].to_set.freeze
16
+
10
17
  class << self
11
- def call(articles, url:, keep_different_domain: false)
18
+ def call(articles, url:, keep_different_domain:, min_words_title:)
12
19
  Log.debug "Cleanup: start with #{articles.size} articles"
13
20
 
14
21
  articles.select!(&:valid?)
@@ -17,13 +24,12 @@ module Html2rss
17
24
 
18
25
  keep_only_http_urls!(articles)
19
26
  reject_different_domain!(articles, url) unless keep_different_domain
27
+ keep_only_with_min_words_title!(articles, min_words_title:)
20
28
 
21
29
  Log.debug "Cleanup: end with #{articles.size} articles"
22
30
  articles
23
31
  end
24
32
 
25
- private
26
-
27
33
  ##
28
34
  # Deduplicates articles by a given key.
29
35
  #
@@ -42,18 +48,40 @@ module Html2rss
42
48
  #
43
49
  # @param articles [Array<Article>] The list of articles to process.
44
50
  def keep_only_http_urls!(articles)
45
- articles.select! { |article| %w[http https].include?(article.url&.scheme) }
51
+ articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
46
52
  end
47
53
 
48
54
  ##
49
55
  # Rejects articles that have a URL not on the same domain as the source.
50
56
  #
51
57
  # @param articles [Array<Article>] The list of articles to process.
52
- # @param base_url [Addressable::URI] The source URL to compare against.
58
+ # @param base_url [Html2rss::Url] The source URL to compare against.
53
59
  def reject_different_domain!(articles, base_url)
54
60
  base_host = base_url.host
55
61
  articles.select! { |article| article.url&.host == base_host }
56
62
  end
63
+
64
+ ##
65
+ # Keeps only articles with a title that is present and has at least `min_words_title` words.
66
+ #
67
+ # @param articles [Array<Article>] The list of articles to process.
68
+ # @param min_words_title [Integer] The minimum number of words in the title.
69
+ def keep_only_with_min_words_title!(articles, min_words_title:)
70
+ articles.select! do |article|
71
+ article.title ? word_count_at_least?(article.title, min_words_title) : true
72
+ end
73
+ end
74
+
75
+ private
76
+
77
+ def word_count_at_least?(str, min_words)
78
+ count = 0
79
+ str.to_s.scan(/\p{Alnum}+/) do
80
+ count += 1
81
+ return true if count >= min_words
82
+ end
83
+ false
84
+ end
57
85
  end
58
86
  end
59
87
  end
@@ -6,34 +6,58 @@ module Html2rss
6
6
  class AutoSource
7
7
  module Scraper
8
8
  ##
9
- # Scrapes articles from HTML pages by
10
- # finding similar structures around anchor tags in the parsed_body.
9
+ # Scrapes article-like blocks from plain HTML by looking for repeated link
10
+ # structures when richer structured data is unavailable.
11
+ #
12
+ # The approach is intentionally heuristic:
13
+ # 1. collect repeated anchor paths
14
+ # 2. walk upward to a shared container shape
15
+ # 3. extract the best anchor found inside each container
16
+ #
17
+ # This scraper is broader and noisier than `SemanticHtml`, so it acts as a
18
+ # fallback for pages without stronger semantic signals.
11
19
  class Html
12
20
  include Enumerable
13
21
 
14
- TAGS_TO_IGNORE = /(nav|footer|header)/i
22
+ TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
15
23
 
16
- def self.articles?(parsed_body)
17
- new(parsed_body, url: '').any?
18
- end
24
+ DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
25
+ DEFAULT_USE_TOP_SELECTORS = 5
19
26
 
20
- def self.parent_until_condition(node, condition)
21
- return nil if !node || node.document? || node.parent.name == 'html'
22
- return node if condition.call(node)
27
+ ##
28
+ # @return [Symbol] config key used to enable or configure this scraper
29
+ def self.options_key = :html
23
30
 
24
- parent_until_condition(node.parent, condition)
31
+ ##
32
+ # Probes whether the document appears to contain repeated anchor
33
+ # structures that this fallback scraper can cluster into article-like
34
+ # containers.
35
+ #
36
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
37
+ # @return [Boolean] true when the scraper can likely extract articles
38
+ def self.articles?(parsed_body)
39
+ new(parsed_body, url: '').any?
25
40
  end
26
41
 
27
42
  ##
28
43
  # Simplify an XPath selector by removing the index notation.
44
+ # This keeps repeated anchor paths comparable across sibling blocks.
45
+ #
46
+ # @param xpath [String] original XPath
47
+ # @return [String] XPath without positional indexes
29
48
  def self.simplify_xpath(xpath)
30
49
  xpath.gsub(/\[\d+\]/, '')
31
50
  end
32
51
 
33
- def initialize(parsed_body, url:)
52
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
53
+ # @param url [String] The base URL.
54
+ # @param extractor [Class] The extractor class to handle article extraction.
55
+ # @param opts [Hash] Additional options.
56
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
34
57
  @parsed_body = parsed_body
35
58
  @url = url
36
- @selectors = Hash.new(0)
59
+ @extractor = extractor
60
+ @opts = opts
37
61
  end
38
62
 
39
63
  attr_reader :parsed_body
@@ -44,51 +68,102 @@ module Html2rss
44
68
  def each
45
69
  return enum_for(:each) unless block_given?
46
70
 
47
- return if frequent_selectors.empty?
71
+ each_article_tag do |article_tag|
72
+ article_hash = extract_article(article_tag)
73
+ yield article_hash if article_hash
74
+ end
75
+ end
48
76
 
49
- frequent_selectors.each do |selector|
50
- parsed_body.xpath(selector).each do |selected_tag|
51
- article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
77
+ ##
78
+ # Decides whether a traversed node has reached a useful article-like
79
+ # boundary for the generic HTML scraper.
80
+ #
81
+ # The predicate prefers containers that add surrounding link context,
82
+ # which helps the scraper move from a leaf anchor toward a repeated
83
+ # teaser/card wrapper.
84
+ #
85
+ # @param node [Nokogiri::XML::Node] candidate boundary node
86
+ # @return [Boolean] true when the node is a good extraction boundary
87
+ def article_tag_condition?(node)
88
+ # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
89
+ return false if node.path.match?(TAGS_TO_IGNORE)
90
+ return true if %w[body html].include?(node.name)
91
+ return false unless (parent = node.parent)
52
92
 
53
- if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
54
- yield article_hash
55
- end
56
- end
93
+ anchor_count(parent) > anchor_count(node)
94
+ end
95
+
96
+ private
97
+
98
+ ##
99
+ # Find relevant anchors in root.
100
+ # @return [Set<String>] The set of XPath selectors
101
+ def selectors
102
+ @selectors ||= Hash.new(0).tap do |selectors|
103
+ each_relevant_anchor { |node| increment_selector_count(selectors, node) }
57
104
  end
58
105
  end
59
106
 
60
107
  ##
61
- # Find all the anchors in root.
62
- # @param root [Nokogiri::XML::Node] The root node to search for anchors
63
- # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
64
- def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
65
- @frequent_selectors ||= begin
66
- root.traverse do |node|
67
- next if !node.element? || node.name != 'a'
68
-
69
- @selectors[self.class.simplify_xpath(node.path)] += 1
70
- end
108
+ # Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
109
+ # @return [Array<String>] The filtered selectors
110
+ def filtered_selectors
111
+ selectors.select { |_selector, count| count >= minimum_selector_frequency }
112
+ .max_by(use_top_selectors, &:last)
113
+ .map(&:first)
114
+ end
115
+
116
+ def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
117
+ def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
118
+
119
+ def anchor_count(node)
120
+ @anchor_counts ||= {}
121
+ @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
122
+ end
123
+
124
+ def each_relevant_anchor
125
+ return enum_for(:each_relevant_anchor) unless block_given?
71
126
 
72
- @selectors.keys
73
- .select { |selector| (@selectors[selector]).to_i >= min_frequency }
74
- .to_set
127
+ traversal_root&.traverse do |node|
128
+ yield node if relevant_anchor?(node)
75
129
  end
76
130
  end
77
131
 
78
- def article_condition(node)
79
- # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
80
- return false if node.path.match?(TAGS_TO_IGNORE)
132
+ def relevant_anchor?(node)
133
+ node.element? && node.name == 'a' && !String(node['href']).empty?
134
+ end
81
135
 
82
- # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
83
- return false if self.class.parent_until_condition(node, proc do |current_node|
84
- current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
85
- end)
136
+ def increment_selector_count(selectors, node)
137
+ path = self.class.simplify_xpath(node.path)
138
+ selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
139
+ end
86
140
 
87
- return true if %w[body html].include?(node.name)
141
+ def traversal_root
142
+ parsed_body.at_css('body, html') || parsed_body.root
143
+ end
144
+
145
+ def each_article_tag
146
+ return enum_for(:each_article_tag) unless block_given?
147
+
148
+ filtered_selectors.each do |selector|
149
+ parsed_body.xpath(selector).each do |selected_tag|
150
+ article_tag = article_tag_for(selected_tag)
151
+ yield article_tag if article_tag
152
+ end
153
+ end
154
+ end
155
+
156
+ def article_tag_for(selected_tag)
157
+ return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
158
+
159
+ HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
160
+ end
88
161
 
89
- return true if node.parent.css('a').size > 1
162
+ def extract_article(article_tag)
163
+ selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
164
+ return unless selected_anchor
90
165
 
91
- false
166
+ @extractor.new(article_tag, base_url: @url, selected_anchor:).call
92
167
  end
93
168
  end
94
169
  end