html2rss 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
|
4
|
+
data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
|
7
|
+
data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
class AutoSource
|
8
|
+
module Scraper
|
9
|
+
##
|
10
|
+
# Scrapes articles from HTML pages by
|
11
|
+
# finding similar structures around anchor tags in the parsed_body.
|
12
|
+
class Html
|
13
|
+
include Enumerable
|
14
|
+
|
15
|
+
def self.articles?(parsed_body)
|
16
|
+
new(parsed_body, url: '').any?
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.parent_until_condition(node, condition)
|
20
|
+
return nil if !node || node.parent.name == 'html'
|
21
|
+
return node if condition.call(node)
|
22
|
+
|
23
|
+
parent_until_condition(node.parent, condition)
|
24
|
+
end
|
25
|
+
|
26
|
+
##
|
27
|
+
# Simplify an XPath selector by removing the index notation.
|
28
|
+
def self.simplify_xpath(xpath)
|
29
|
+
xpath.gsub(/\[\d+\]/, '')
|
30
|
+
end
|
31
|
+
|
32
|
+
def initialize(parsed_body, url:)
|
33
|
+
@parsed_body = parsed_body
|
34
|
+
@url = url
|
35
|
+
@css_selectors = Hash.new(0)
|
36
|
+
end
|
37
|
+
|
38
|
+
attr_reader :parsed_body
|
39
|
+
|
40
|
+
##
|
41
|
+
# @yieldparam [Hash] The scraped article hash
|
42
|
+
# @return [Enumerator] Enumerator for the scraped articles
|
43
|
+
def each
|
44
|
+
return enum_for(:each) unless block_given?
|
45
|
+
|
46
|
+
return if frequent_selectors.empty?
|
47
|
+
|
48
|
+
frequent_selectors.each do |selector|
|
49
|
+
parsed_body.xpath(selector).each do |selected_tag|
|
50
|
+
article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
|
51
|
+
article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
|
52
|
+
|
53
|
+
yield article_hash if article_hash
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
##
|
59
|
+
# Find all the anchors in root.
|
60
|
+
# @param root [Nokogiri::XML::Node] The root node to search for anchors
|
61
|
+
# @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
|
62
|
+
def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
|
63
|
+
@frequent_selectors ||= begin
|
64
|
+
root.traverse do |node|
|
65
|
+
next if !node.element? || node.name != 'a'
|
66
|
+
|
67
|
+
@css_selectors[self.class.simplify_xpath(node.path)] += 1
|
68
|
+
end
|
69
|
+
|
70
|
+
@css_selectors.keys
|
71
|
+
.select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
|
72
|
+
.to_set
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
def article_condition(node)
|
79
|
+
return true if %w[body html].include?(node.name)
|
80
|
+
return true if node.parent.css('a').size > 1
|
81
|
+
|
82
|
+
false
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -35,13 +35,13 @@ module Html2rss
|
|
35
35
|
def initialize(article_tag, url:)
|
36
36
|
@article_tag = article_tag
|
37
37
|
@url = url
|
38
|
-
@heading = find_heading
|
39
|
-
@extract_url = find_url
|
40
38
|
end
|
41
39
|
|
42
40
|
# @return [Hash, nil] The scraped article or nil.
|
43
41
|
def call
|
44
|
-
|
42
|
+
@heading = find_heading || closest_anchor || return
|
43
|
+
|
44
|
+
@extract_url = find_url
|
45
45
|
|
46
46
|
{
|
47
47
|
title: extract_title,
|
@@ -71,14 +71,20 @@ module Html2rss
|
|
71
71
|
times.min
|
72
72
|
end
|
73
73
|
|
74
|
+
##
|
75
|
+
# Find the heading of the article.
|
76
|
+
# @return [Nokogiri::XML::Node, nil]
|
74
77
|
def find_heading
|
75
78
|
heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
|
79
|
+
|
80
|
+
return if heading_tags.empty?
|
81
|
+
|
76
82
|
smallest_heading = heading_tags.keys.min
|
77
83
|
heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
|
78
84
|
end
|
79
85
|
|
80
86
|
def extract_title
|
81
|
-
@extract_title ||= if heading.children.empty?
|
87
|
+
@extract_title ||= if heading && (heading.children.empty? || heading.text)
|
82
88
|
visible_text_from_tag(heading)
|
83
89
|
else
|
84
90
|
visible_text_from_tag(
|
@@ -101,9 +107,12 @@ module Html2rss
|
|
101
107
|
description.empty? ? nil : description
|
102
108
|
end
|
103
109
|
|
110
|
+
def closest_anchor
|
111
|
+
SemanticHtml.find_closest_selector(heading || article_tag,
|
112
|
+
selector: 'a[href]:not([href=""])')
|
113
|
+
end
|
114
|
+
|
104
115
|
def find_url
|
105
|
-
closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
|
106
|
-
selector: 'a[href]:not([href=""])')
|
107
116
|
href = closest_anchor&.[]('href')&.split('#')&.first&.strip
|
108
117
|
Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
|
109
118
|
end
|
@@ -113,8 +122,12 @@ module Html2rss
|
|
113
122
|
end
|
114
123
|
|
115
124
|
def generate_id
|
116
|
-
[
|
117
|
-
|
125
|
+
[
|
126
|
+
article_tag['id'],
|
127
|
+
article_tag.at_css('[id]')&.attr('id'),
|
128
|
+
extract_url&.path,
|
129
|
+
extract_url&.query
|
130
|
+
].compact.reject(&:empty?).first
|
118
131
|
end
|
119
132
|
end
|
120
133
|
end
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.0
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -251,6 +251,7 @@ files:
|
|
251
251
|
- lib/html2rss/auto_source/reducer.rb
|
252
252
|
- lib/html2rss/auto_source/rss_builder.rb
|
253
253
|
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/html.rb
|
254
255
|
- lib/html2rss/auto_source/scraper/schema.rb
|
255
256
|
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
257
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
@@ -279,7 +280,7 @@ licenses:
|
|
279
280
|
- MIT
|
280
281
|
metadata:
|
281
282
|
allowed_push_host: https://rubygems.org
|
282
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
|
283
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
|
283
284
|
rubygems_mfa_required: 'true'
|
284
285
|
post_install_message:
|
285
286
|
rdoc_options: []
|