html2rss 0.14.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
4
- data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
3
+ metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
4
+ data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
5
5
  SHA512:
6
- metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
7
- data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
6
+ metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
7
+ data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062
data/lib/html2rss/auto_source/scraper/html.rb ADDED
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'set'
5
+
6
+ module Html2rss
7
+ class AutoSource
8
+ module Scraper
9
+ ##
10
+ # Scrapes articles from HTML pages by
11
+ # finding similar structures around anchor tags in the parsed_body.
12
+ class Html
13
+ include Enumerable
14
+
15
+ def self.articles?(parsed_body)
16
+ new(parsed_body, url: '').any?
17
+ end
18
+
19
+ def self.parent_until_condition(node, condition)
20
+ return nil if !node || node.parent.name == 'html'
21
+ return node if condition.call(node)
22
+
23
+ parent_until_condition(node.parent, condition)
24
+ end
25
+
26
+ ##
27
+ # Simplify an XPath selector by removing the index notation.
28
+ def self.simplify_xpath(xpath)
29
+ xpath.gsub(/\[\d+\]/, '')
30
+ end
31
+
32
+ def initialize(parsed_body, url:)
33
+ @parsed_body = parsed_body
34
+ @url = url
35
+ @css_selectors = Hash.new(0)
36
+ end
37
+
38
+ attr_reader :parsed_body
39
+
40
+ ##
41
+ # @yieldparam [Hash] The scraped article hash
42
+ # @return [Enumerator] Enumerator for the scraped articles
43
+ def each
44
+ return enum_for(:each) unless block_given?
45
+
46
+ return if frequent_selectors.empty?
47
+
48
+ frequent_selectors.each do |selector|
49
+ parsed_body.xpath(selector).each do |selected_tag|
50
+ article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
51
+ article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
52
+
53
+ yield article_hash if article_hash
54
+ end
55
+ end
56
+ end
57
+
58
+ ##
59
+ # Find all the anchors in root.
60
+ # @param root [Nokogiri::XML::Node] The root node to search for anchors
61
+ # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
62
+ def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
63
+ @frequent_selectors ||= begin
64
+ root.traverse do |node|
65
+ next if !node.element? || node.name != 'a'
66
+
67
+ @css_selectors[self.class.simplify_xpath(node.path)] += 1
68
+ end
69
+
70
+ @css_selectors.keys
71
+ .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
72
+ .to_set
73
+ end
74
+ end
75
+
76
+ private
77
+
78
+ def article_condition(node)
79
+ return true if %w[body html].include?(node.name)
80
+ return true if node.parent.css('a').size > 1
81
+
82
+ false
83
+ end
84
+ end
85
+ end
86
+ end
87
+ end
@@ -35,13 +35,13 @@ module Html2rss
35
35
  def initialize(article_tag, url:)
36
36
  @article_tag = article_tag
37
37
  @url = url
38
- @heading = find_heading
39
- @extract_url = find_url
40
38
  end
41
39
 
42
40
  # @return [Hash, nil] The scraped article or nil.
43
41
  def call
44
- return unless heading
42
+ @heading = find_heading || closest_anchor || return
43
+
44
+ @extract_url = find_url
45
45
 
46
46
  {
47
47
  title: extract_title,
@@ -71,14 +71,20 @@ module Html2rss
71
71
  times.min
72
72
  end
73
73
 
74
+ ##
75
+ # Find the heading of the article.
76
+ # @return [Nokogiri::XML::Node, nil]
74
77
  def find_heading
75
78
  heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
79
+
80
+ return if heading_tags.empty?
81
+
76
82
  smallest_heading = heading_tags.keys.min
77
83
  heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
78
84
  end
79
85
 
80
86
  def extract_title
81
- @extract_title ||= if heading.children.empty? && heading.text
87
+ @extract_title ||= if heading && (heading.children.empty? || heading.text)
82
88
  visible_text_from_tag(heading)
83
89
  else
84
90
  visible_text_from_tag(
@@ -101,9 +107,12 @@ module Html2rss
101
107
  description.empty? ? nil : description
102
108
  end
103
109
 
110
+ def closest_anchor
111
+ SemanticHtml.find_closest_selector(heading || article_tag,
112
+ selector: 'a[href]:not([href=""])')
113
+ end
114
+
104
115
  def find_url
105
- closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
106
- selector: 'a[href]:not([href=""])')
107
116
  href = closest_anchor&.[]('href')&.split('#')&.first&.strip
108
117
  Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
109
118
  end
@@ -113,8 +122,12 @@ module Html2rss
113
122
  end
114
123
 
115
124
  def generate_id
116
- [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
117
- extract_url&.path].compact.reject(&:empty?).first
125
+ [
126
+ article_tag['id'],
127
+ article_tag.at_css('[id]')&.attr('id'),
128
+ extract_url&.path,
129
+ extract_url&.query
130
+ ].compact.reject(&:empty?).first
118
131
  end
119
132
  end
120
133
  end
data/lib/html2rss/auto_source/scraper.rb CHANGED
@@ -10,6 +10,7 @@ module Html2rss
10
10
  #
11
11
  module Scraper
12
12
  SCRAPERS = [
13
+ Html,
13
14
  Schema,
14
15
  SemanticHtml
15
16
  ].freeze
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.14.0'
6
+ VERSION = '0.15.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -5,8 +5,9 @@ require 'zeitwerk'
5
5
  loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
- require 'yaml'
8
+ require 'addressable'
9
9
  require 'logger'
10
+ require 'yaml'
10
11
 
11
12
  ##
12
13
  # The Html2rss namespace.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.0
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-08 00:00:00.000000000 Z
11
+ date: 2024-10-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -251,6 +251,7 @@ files:
251
251
  - lib/html2rss/auto_source/reducer.rb
252
252
  - lib/html2rss/auto_source/rss_builder.rb
253
253
  - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/html.rb
254
255
  - lib/html2rss/auto_source/scraper/schema.rb
255
256
  - lib/html2rss/auto_source/scraper/schema/base.rb
256
257
  - lib/html2rss/auto_source/scraper/semantic_html.rb
@@ -279,7 +280,7 @@ licenses:
279
280
  - MIT
280
281
  metadata:
281
282
  allowed_push_host: https://rubygems.org
282
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
283
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
283
284
  rubygems_mfa_required: 'true'
284
285
  post_install_message:
285
286
  rdoc_options: []