html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
data/exe/html2rss CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # frozen_string_literal: true
3
3
 
4
- require 'html2rss/cli'
4
+ require 'html2rss'
5
5
 
6
6
  Html2rss::CLI.start(ARGV)
data/html2rss.gemspec CHANGED
@@ -26,15 +26,18 @@ Gem::Specification.new do |spec|
26
26
  end
27
27
 
28
28
  spec.files = `git ls-files -z`.split("\x0").select do |f|
29
- f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
29
+ f.match(%r{^(lib/|exe/|schema/|README.md|LICENSE|html2rss.gemspec)})
30
30
  end
31
31
  spec.bindir = 'exe'
32
32
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
33
33
  spec.require_paths = ['lib']
34
34
 
35
35
  spec.add_dependency 'addressable', '~> 2.7'
36
+ spec.add_dependency 'brotli'
37
+ spec.add_dependency 'dry-validation'
36
38
  spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
37
39
  spec.add_dependency 'faraday-follow_redirects'
40
+ spec.add_dependency 'faraday-gzip', '~> 3'
38
41
  spec.add_dependency 'kramdown'
39
42
  spec.add_dependency 'mime-types', '> 3.0'
40
43
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
@@ -43,7 +46,7 @@ Gem::Specification.new do |spec|
43
46
  spec.add_dependency 'regexp_parser'
44
47
  spec.add_dependency 'reverse_markdown', '~> 3.0'
45
48
  spec.add_dependency 'rss'
46
- spec.add_dependency 'sanitize', '~> 6.0'
49
+ spec.add_dependency 'sanitize'
47
50
  spec.add_dependency 'thor'
48
51
  spec.add_dependency 'tzinfo'
49
52
  spec.add_dependency 'zeitwerk'
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set' # rubocop:disable Lint/RedundantRequireStatement
4
+
5
+ module Html2rss
6
+ # Shared helpers that operate on `RssBuilder::Article` collections.
7
+ module Articles
8
+ ##
9
+ # Deduplicates a list of articles while preserving their original order.
10
+ #
11
+ # The deduplicator prefers each article's URL (combined with its ID when
12
+ # available) to determine uniqueness. When no URL is present, it falls
13
+ # back to the article ID, then to the GUID enriched with title and
14
+ # description metadata. If none of these identifiers are available it
15
+ # defaults to the article object's hash to preserve the original entry.
16
+ class Deduplicator
17
+ ##
18
+ # @param articles [Array<Html2rss::RssBuilder::Article>]
19
+ # @raise [ArgumentError] if articles are not provided
20
+ def initialize(articles)
21
+ raise ArgumentError, 'articles must be provided' unless articles
22
+
23
+ @articles = articles
24
+ end
25
+
26
+ ##
27
+ # Returns the list of unique articles, preserving the order of the
28
+ # original collection and keeping the first occurrence of a duplicate.
29
+ # @return [Array<Html2rss::RssBuilder::Article>]
30
+ def call
31
+ seen = Set.new
32
+
33
+ articles.filter do |article|
34
+ fingerprint = deduplication_fingerprint_for(article) || article.hash
35
+ seen.add?(fingerprint)
36
+ end
37
+ end
38
+
39
+ private
40
+
41
+ attr_reader :articles
42
+
43
+ def deduplication_fingerprint_for(article)
44
+ return unless article.respond_to?(:deduplication_fingerprint)
45
+
46
+ article.deduplication_fingerprint
47
+ end
48
+ end
49
+ end
50
+ end
@@ -7,8 +7,22 @@ module Html2rss
7
7
  # :reek:MissingSafeMethod { enabled: false }
8
8
  # It applies various strategies to filter and refine the article list.
9
9
  class Cleanup
10
+ # Default cleanup behavior for auto-sourced article lists.
11
+ DEFAULT_CONFIG = {
12
+ keep_different_domain: false,
13
+ min_words_title: 3
14
+ }.freeze
15
+
16
+ # Allowed URL schemes for article filtering.
17
+ VALID_SCHEMES = %w[http https].to_set.freeze
18
+
10
19
  class << self
11
- def call(articles, url:, keep_different_domain: false)
20
+ # @param articles [Array<Article>] extracted article candidates
21
+ # @param url [Html2rss::Url] feed source URL used for same-host filtering
22
+ # @param keep_different_domain [Boolean] whether to keep off-domain entries
23
+ # @param min_words_title [Integer] minimum word count for title filtering
24
+ # @return [Array<Article>] cleaned article list
25
+ def call(articles, url:, keep_different_domain:, min_words_title:)
12
26
  Log.debug "Cleanup: start with #{articles.size} articles"
13
27
 
14
28
  articles.select!(&:valid?)
@@ -17,18 +31,18 @@ module Html2rss
17
31
 
18
32
  keep_only_http_urls!(articles)
19
33
  reject_different_domain!(articles, url) unless keep_different_domain
34
+ keep_only_with_min_words_title!(articles, min_words_title:)
20
35
 
21
36
  Log.debug "Cleanup: end with #{articles.size} articles"
22
37
  articles
23
38
  end
24
39
 
25
- private
26
-
27
40
  ##
28
41
  # Deduplicates articles by a given key.
29
42
  #
30
43
  # @param articles [Array<Article>] The list of articles to process.
31
44
  # @param key [Symbol] The key to deduplicate by.
45
+ # @return [Array<Article>] the mutated articles array
32
46
  def deduplicate_by!(articles, key)
33
47
  seen = {}
34
48
  articles.reject! do |article|
@@ -41,19 +55,44 @@ module Html2rss
41
55
  # Keeps only articles with HTTP or HTTPS URLs.
42
56
  #
43
57
  # @param articles [Array<Article>] The list of articles to process.
58
+ # @return [Array<Article>] the mutated articles array
44
59
  def keep_only_http_urls!(articles)
45
- articles.select! { |article| %w[http https].include?(article.url&.scheme) }
60
+ articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
46
61
  end
47
62
 
48
63
  ##
49
64
  # Rejects articles that have a URL not on the same domain as the source.
50
65
  #
51
66
  # @param articles [Array<Article>] The list of articles to process.
52
- # @param base_url [Addressable::URI] The source URL to compare against.
67
+ # @param base_url [Html2rss::Url] The source URL to compare against.
68
+ # @return [Array<Article>] the mutated articles array
53
69
  def reject_different_domain!(articles, base_url)
54
70
  base_host = base_url.host
55
71
  articles.select! { |article| article.url&.host == base_host }
56
72
  end
73
+
74
+ ##
75
+ # Keeps articles whose title has at least `min_words_title` words; articles without a title are kept unchanged.
76
+ #
77
+ # @param articles [Array<Article>] The list of articles to process.
78
+ # @param min_words_title [Integer] The minimum number of words in the title.
79
+ # @return [Array<Article>] the mutated articles array
80
+ def keep_only_with_min_words_title!(articles, min_words_title:)
81
+ articles.select! do |article|
82
+ article.title ? word_count_at_least?(article.title, min_words_title) : true
83
+ end
84
+ end
85
+
86
+ private
87
+
88
+ def word_count_at_least?(str, min_words)
89
+ count = 0
90
+ str.to_s.scan(/\p{Alnum}+/) do
91
+ count += 1
92
+ return true if count >= min_words
93
+ end
94
+ false
95
+ end
57
96
  end
58
97
  end
59
98
  end
@@ -6,34 +6,63 @@ module Html2rss
6
6
  class AutoSource
7
7
  module Scraper
8
8
  ##
9
- # Scrapes articles from HTML pages by
10
- # finding similar structures around anchor tags in the parsed_body.
9
+ # Scrapes article-like blocks from plain HTML by looking for repeated link
10
+ # structures when richer structured data is unavailable.
11
+ #
12
+ # The approach is intentionally heuristic:
13
+ # 1. collect repeated anchor paths
14
+ # 2. walk upward to a shared container shape
15
+ # 3. extract the best anchor found inside each container
16
+ #
17
+ # This scraper is broader and noisier than `SemanticHtml`, so it acts as a
18
+ # fallback for pages without stronger semantic signals.
11
19
  class Html
12
20
  include Enumerable
13
21
 
14
- TAGS_TO_IGNORE = /(nav|footer|header)/i
22
+ # Elements ignored when traversing potential article containers.
23
+ TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
15
24
 
16
- def self.articles?(parsed_body)
17
- new(parsed_body, url: '').any?
18
- end
25
+ # Minimum selector frequency required to treat a path as a stable list signal.
26
+ DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
27
+ # Number of most frequent selectors kept for container extraction.
28
+ DEFAULT_USE_TOP_SELECTORS = 5
19
29
 
20
- def self.parent_until_condition(node, condition)
21
- return nil if !node || node.document? || node.parent.name == 'html'
22
- return node if condition.call(node)
30
+ ##
31
+ # @return [Symbol] config key used to enable or configure this scraper
32
+ def self.options_key = :html
23
33
 
24
- parent_until_condition(node.parent, condition)
34
+ ##
35
+ # Probes whether the document appears to contain repeated anchor
36
+ # structures that this fallback scraper can cluster into article-like
37
+ # containers.
38
+ #
39
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
40
+ # @return [Boolean] true when the scraper can likely extract articles
41
+ def self.articles?(parsed_body)
42
+ new(parsed_body, url: '').any?
25
43
  end
26
44
 
27
45
  ##
28
46
  # Simplify an XPath selector by removing the index notation.
47
+ # This keeps repeated anchor paths comparable across sibling blocks.
48
+ #
49
+ # @param xpath [String] original XPath
50
+ # @return [String] XPath without positional indexes
29
51
  def self.simplify_xpath(xpath)
30
52
  xpath.gsub(/\[\d+\]/, '')
31
53
  end
32
54
 
33
- def initialize(parsed_body, url:)
55
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
56
+ # @param url [String] The base URL.
57
+ # @param extractor [Class] The extractor class to handle article extraction.
58
+ # @param opts [Hash] Additional options.
59
+ # @option opts [Integer] :minimum_selector_frequency minimum count before a selector is considered stable
60
+ # @option opts [Integer] :use_top_selectors number of top selectors to keep
61
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
34
62
  @parsed_body = parsed_body
35
63
  @url = url
36
- @selectors = Hash.new(0)
64
+ @extractor = extractor
65
+ @opts = opts
37
66
  end
38
67
 
39
68
  attr_reader :parsed_body
@@ -44,51 +73,102 @@ module Html2rss
44
73
  def each
45
74
  return enum_for(:each) unless block_given?
46
75
 
47
- return if frequent_selectors.empty?
76
+ each_article_tag do |article_tag|
77
+ article_hash = extract_article(article_tag)
78
+ yield article_hash if article_hash
79
+ end
80
+ end
48
81
 
49
- frequent_selectors.each do |selector|
50
- parsed_body.xpath(selector).each do |selected_tag|
51
- article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
82
+ ##
83
+ # Decides whether a traversed node has reached a useful article-like
84
+ # boundary for the generic HTML scraper.
85
+ #
86
+ # The predicate prefers containers that add surrounding link context,
87
+ # which helps the scraper move from a leaf anchor toward a repeated
88
+ # teaser/card wrapper.
89
+ #
90
+ # @param node [Nokogiri::XML::Node] candidate boundary node
91
+ # @return [Boolean] true when the node is a good extraction boundary
92
+ def article_tag_condition?(node)
93
+ # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
94
+ return false if node.path.match?(TAGS_TO_IGNORE)
95
+ return true if %w[body html].include?(node.name)
96
+ return false unless (parent = node.parent)
52
97
 
53
- if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
54
- yield article_hash
55
- end
56
- end
98
+ anchor_count(parent) > anchor_count(node)
99
+ end
100
+
101
+ private
102
+
103
+ ##
104
+ # Counts how often each simplified anchor XPath occurs below the traversal root.
105
+ # @return [Hash{String => Integer}] occurrence count per simplified XPath selector
106
+ def selectors
107
+ @selectors ||= Hash.new(0).tap do |selectors|
108
+ each_relevant_anchor { |node| increment_selector_count(selectors, node) }
57
109
  end
58
110
  end
59
111
 
60
112
  ##
61
- # Find all the anchors in root.
62
- # @param root [Nokogiri::XML::Node] The root node to search for anchors
63
- # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
64
- def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
65
- @frequent_selectors ||= begin
66
- root.traverse do |node|
67
- next if !node.element? || node.name != 'a'
68
-
69
- @selectors[self.class.simplify_xpath(node.path)] += 1
70
- end
113
+ # Keeps selectors seen at least `minimum_selector_frequency` times and returns the `use_top_selectors` most frequent ones.
114
+ # @return [Array<String>] The filtered selectors
115
+ def filtered_selectors
116
+ selectors.select { |_selector, count| count >= minimum_selector_frequency }
117
+ .max_by(use_top_selectors, &:last)
118
+ .map(&:first)
119
+ end
120
+
121
+ def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
122
+ def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
123
+
124
+ def anchor_count(node)
125
+ @anchor_counts ||= {}
126
+ @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
127
+ end
128
+
129
+ def each_relevant_anchor
130
+ return enum_for(:each_relevant_anchor) unless block_given?
71
131
 
72
- @selectors.keys
73
- .select { |selector| (@selectors[selector]).to_i >= min_frequency }
74
- .to_set
132
+ traversal_root&.traverse do |node|
133
+ yield node if relevant_anchor?(node)
75
134
  end
76
135
  end
77
136
 
78
- def article_condition(node)
79
- # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
80
- return false if node.path.match?(TAGS_TO_IGNORE)
137
+ def relevant_anchor?(node)
138
+ node.element? && node.name == 'a' && !String(node['href']).empty?
139
+ end
81
140
 
82
- # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
83
- return false if self.class.parent_until_condition(node, proc do |current_node|
84
- current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
85
- end)
141
+ def increment_selector_count(selectors, node)
142
+ path = self.class.simplify_xpath(node.path)
143
+ selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
144
+ end
86
145
 
87
- return true if %w[body html].include?(node.name)
146
+ def traversal_root
147
+ parsed_body.at_css('body, html') || parsed_body.root
148
+ end
149
+
150
+ def each_article_tag
151
+ return enum_for(:each_article_tag) unless block_given?
152
+
153
+ filtered_selectors.each do |selector|
154
+ parsed_body.xpath(selector).each do |selected_tag|
155
+ article_tag = article_tag_for(selected_tag)
156
+ yield article_tag if article_tag
157
+ end
158
+ end
159
+ end
160
+
161
+ def article_tag_for(selected_tag)
162
+ return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
163
+
164
+ HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
165
+ end
88
166
 
89
- return true if node.parent.css('a').size > 1
167
+ def extract_article(article_tag)
168
+ selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
169
+ return unless selected_anchor
90
170
 
91
- false
171
+ @extractor.new(article_tag, base_url: @url, selected_anchor:).call
92
172
  end
93
173
  end
94
174
  end