html2rss 0.14.0 → 0.15.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
4
- data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
3
+ metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
4
+ data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
5
5
  SHA512:
6
- metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
7
- data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
6
+ metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
7
+ data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'nokogiri'
4
+ require 'set'
5
+
6
module Html2rss
  class AutoSource
    module Scraper
      ##
      # Scrapes articles from HTML pages by
      # finding similar structures around anchor tags in the parsed_body.
      #
      # Every anchor's XPath is reduced to an index-free form; paths that occur
      # repeatedly are used to select anchors again, and for each selected anchor
      # an ancestor is picked and handed to SemanticHtml::Extractor.
      class Html
        include Enumerable

        ##
        # @param parsed_body [#at_css, #xpath] the parsed document to probe
        # @return [Boolean] whether at least one article can be scraped
        def self.articles?(parsed_body)
          new(parsed_body, url: '').any?
        end

        ##
        # Walks from node towards the root and returns the first node for which
        # the condition holds.
        #
        # The walk stops (returning nil) once the node's parent is the <html>
        # element, or when the node has no parent at all (e.g. the document
        # itself or a detached node) instead of raising NoMethodError.
        #
        # @param node [#parent, #name, nil] the node to start from
        # @param condition [#call] receives a node, returns truthy on a match
        # @return [Object, nil] the matching node, or nil when none was found
        def self.parent_until_condition(node, condition)
          while node
            parent = node.parent
            return nil if parent.nil? || parent.name == 'html'
            return node if condition.call(node)

            node = parent
          end

          nil
        end

        ##
        # Simplify an XPath selector by removing the index notation.
        # @param xpath [String] e.g. "/html/body/ul/li[2]/a"
        # @return [String] e.g. "/html/body/ul/li/a"
        def self.simplify_xpath(xpath)
          xpath.gsub(/\[\d+\]/, '')
        end

        ##
        # @param parsed_body [#at_css, #xpath] the parsed document
        # @param url [String] passed through to SemanticHtml::Extractor
        def initialize(parsed_body, url:)
          @parsed_body = parsed_body
          @url = url
          # Occurrence counter, keyed by simplified XPath.
          @css_selectors = Hash.new(0)
        end

        attr_reader :parsed_body

        ##
        # @yieldparam [Hash] The scraped article hash
        # @return [Enumerator] Enumerator for the scraped articles
        def each
          return enum_for(:each) unless block_given?

          return if frequent_selectors.empty?

          frequent_selectors.each do |selector|
            parsed_body.xpath(selector).each do |selected_tag|
              article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
              # No suitable ancestor below <html>: nothing to extract from.
              next unless article_tag

              article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call

              yield article_hash if article_hash
            end
          end
        end

        ##
        # Counts the anchors below root by their simplified XPath and keeps the
        # paths that occur often enough.
        #
        # The result is memoized on first call; arguments given to later calls
        # on the same instance are ignored.
        #
        # @param root [#traverse, nil] The root node to search for anchors;
        #   nil (e.g. a document without <body>) yields an empty set
        # @param min_frequency [Integer] minimum number of occurrences of a path
        # @return [Set<String>] The simplified XPaths which exist at least min_frequency times
        def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
          @frequent_selectors ||= begin
            root&.traverse do |node|
              next unless node.element? && node.name == 'a'

              @css_selectors[self.class.simplify_xpath(node.path)] += 1
            end

            @css_selectors.select { |_selector, count| count >= min_frequency }.keys.to_set
          end
        end

        private

        ##
        # @param node [#name, #parent] candidate ancestor of an anchor
        # @return [Boolean] true for <body>/<html>, or when the node's parent
        #   contains more than one anchor
        def article_condition(node)
          return true if %w[body html].include?(node.name)

          node.parent.css('a').size > 1
        end
      end
    end
  end
end
@@ -35,13 +35,13 @@ module Html2rss
35
35
  def initialize(article_tag, url:)
36
36
  @article_tag = article_tag
37
37
  @url = url
38
- @heading = find_heading
39
- @extract_url = find_url
40
38
  end
41
39
 
42
40
  # @return [Hash, nil] The scraped article or nil.
43
41
  def call
44
- return unless heading
42
+ @heading = find_heading || closest_anchor || return
43
+
44
+ @extract_url = find_url
45
45
 
46
46
  {
47
47
  title: extract_title,
@@ -71,14 +71,20 @@ module Html2rss
71
71
  times.min
72
72
  end
73
73
 
74
+ ##
75
+ # Find the heading of the article.
76
+ # @return [Nokogiri::XML::Node, nil]
74
77
  def find_heading
75
78
  heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
79
+
80
+ return if heading_tags.empty?
81
+
76
82
  smallest_heading = heading_tags.keys.min
77
83
  heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
78
84
  end
79
85
 
80
86
  def extract_title
81
- @extract_title ||= if heading.children.empty? && heading.text
87
+ @extract_title ||= if heading && (heading.children.empty? || heading.text)
82
88
  visible_text_from_tag(heading)
83
89
  else
84
90
  visible_text_from_tag(
@@ -101,9 +107,12 @@ module Html2rss
101
107
  description.empty? ? nil : description
102
108
  end
103
109
 
110
+ def closest_anchor
111
+ SemanticHtml.find_closest_selector(heading || article_tag,
112
+ selector: 'a[href]:not([href=""])')
113
+ end
114
+
104
115
  def find_url
105
- closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
106
- selector: 'a[href]:not([href=""])')
107
116
  href = closest_anchor&.[]('href')&.split('#')&.first&.strip
108
117
  Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
109
118
  end
@@ -113,8 +122,12 @@ module Html2rss
113
122
  end
114
123
 
115
124
  def generate_id
116
- [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
117
- extract_url&.path].compact.reject(&:empty?).first
125
+ [
126
+ article_tag['id'],
127
+ article_tag.at_css('[id]')&.attr('id'),
128
+ extract_url&.path,
129
+ extract_url&.query
130
+ ].compact.reject(&:empty?).first
118
131
  end
119
132
  end
120
133
  end
@@ -10,6 +10,7 @@ module Html2rss
10
10
  #
11
11
  module Scraper
12
12
  SCRAPERS = [
13
+ Html,
13
14
  Schema,
14
15
  SemanticHtml
15
16
  ].freeze
@@ -3,6 +3,6 @@
3
3
  ##
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
- VERSION = '0.14.0'
6
+ VERSION = '0.15.0'
7
7
  public_constant :VERSION
8
8
  end
data/lib/html2rss.rb CHANGED
@@ -5,8 +5,9 @@ require 'zeitwerk'
5
5
  loader = Zeitwerk::Loader.for_gem
6
6
  loader.setup
7
7
 
8
- require 'yaml'
8
+ require 'addressable'
9
9
  require 'logger'
10
+ require 'yaml'
10
11
 
11
12
  ##
12
13
  # The Html2rss namespace.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.14.0
4
+ version: 0.15.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-08 00:00:00.000000000 Z
11
+ date: 2024-10-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: addressable
@@ -251,6 +251,7 @@ files:
251
251
  - lib/html2rss/auto_source/reducer.rb
252
252
  - lib/html2rss/auto_source/rss_builder.rb
253
253
  - lib/html2rss/auto_source/scraper.rb
254
+ - lib/html2rss/auto_source/scraper/html.rb
254
255
  - lib/html2rss/auto_source/scraper/schema.rb
255
256
  - lib/html2rss/auto_source/scraper/schema/base.rb
256
257
  - lib/html2rss/auto_source/scraper/semantic_html.rb
@@ -279,7 +280,7 @@ licenses:
279
280
  - MIT
280
281
  metadata:
281
282
  allowed_push_host: https://rubygems.org
282
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
283
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
283
284
  rubygems_mfa_required: 'true'
284
285
  post_install_message:
285
286
  rdoc_options: []