html2rss 0.14.0 → 0.15.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
|
4
|
+
data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
|
7
|
+
data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062
|
@@ -0,0 +1,87 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'set'
|
5
|
+
|
6
|
+
module Html2rss
  class AutoSource
    module Scraper
      ##
      # Scrapes articles from HTML pages by
      # finding similar structures around anchor tags in the parsed_body.
      #
      # Every anchor's XPath is reduced to an index-free form; paths shared by
      # several anchors are treated as "article link" candidates. For each such
      # anchor the closest ancestor that groups more than one anchor is handed
      # to SemanticHtml::Extractor.
      class Html
        include Enumerable

        ##
        # @param parsed_body [#at_css, #xpath] the parsed document (presumably a Nokogiri document)
        # @return [Boolean] whether at least one article can be scraped from parsed_body
        def self.articles?(parsed_body)
          new(parsed_body, url: '').any?
        end

        ##
        # Walks from node towards the document root and returns the first node
        # for which condition holds.
        #
        # The walk stops (returning nil) when node is nil, when node has no
        # parent (document, fragment root or detached node) or when the parent
        # is the <html> element.
        #
        # @param node [#parent, #name, nil] the node to start from
        # @param condition [#call] receives a node, returns truthy to stop at that node
        # @return [Object, nil] the matching node, or nil if none was found
        def self.parent_until_condition(node, condition)
          return nil unless node

          parent = node.parent
          # A parentless node used to raise NoMethodError on `parent.name`.
          return nil if parent.nil? || parent.name == 'html'
          return node if condition.call(node)

          parent_until_condition(parent, condition)
        end

        ##
        # Simplify an XPath selector by removing the index notation.
        #
        # @example
        #   simplify_xpath('/html/body/div[2]/a') # => '/html/body/div/a'
        # @param xpath [String]
        # @return [String] the XPath without any `[n]` positional predicates
        def self.simplify_xpath(xpath)
          xpath.gsub(/\[\d+\]/, '')
        end

        ##
        # @param parsed_body [#at_css, #xpath] the parsed document
        # @param url [String] passed through to the extractor of each article
        def initialize(parsed_body, url:)
          @parsed_body = parsed_body
          @url = url
          # Maps a simplified XPath to the number of anchors sharing it.
          @css_selectors = Hash.new(0)
        end

        attr_reader :parsed_body

        ##
        # @yieldparam [Hash] The scraped article hash
        # @return [Enumerator] Enumerator for the scraped articles
        def each
          return enum_for(:each) unless block_given?

          return if frequent_selectors.empty?

          frequent_selectors.each do |selector|
            parsed_body.xpath(selector).each do |selected_tag|
              article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
              # No enclosing article container was found; there is nothing to
              # extract from (the extractor presumably cannot handle nil).
              next unless article_tag

              article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call

              yield article_hash if article_hash
            end
          end
        end

        ##
        # Find all the anchors in root and count how often each simplified
        # XPath occurs.
        #
        # The result is memoized: root and min_frequency only have an effect on
        # the first call.
        #
        # @param root [Nokogiri::XML::Node, nil] The root node to search for anchors
        # @param min_frequency [Integer] minimum number of anchors sharing a path
        # @return [Set<String>] The simplified XPaths which exist at least min_frequency times
        def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
          @frequent_selectors ||= begin
            # A document without <body> yields a nil root: no anchors to count.
            root&.traverse do |node|
              next if !node.element? || node.name != 'a'

              @css_selectors[self.class.simplify_xpath(node.path)] += 1
            end

            @css_selectors.select { |_selector, count| count >= min_frequency }.keys.to_set
          end
        end

        private

        ##
        # @param node [#name, #parent]
        # @return [Boolean] true for <body>/<html>, or when the node's parent
        #   contains more than one anchor
        def article_condition(node)
          return true if %w[body html].include?(node.name)

          node.parent.css('a').size > 1
        end
      end
    end
  end
end
|
@@ -35,13 +35,13 @@ module Html2rss
|
|
35
35
|
def initialize(article_tag, url:)
|
36
36
|
@article_tag = article_tag
|
37
37
|
@url = url
|
38
|
-
@heading = find_heading
|
39
|
-
@extract_url = find_url
|
40
38
|
end
|
41
39
|
|
42
40
|
# @return [Hash, nil] The scraped article or nil.
|
43
41
|
def call
|
44
|
-
|
42
|
+
@heading = find_heading || closest_anchor || return
|
43
|
+
|
44
|
+
@extract_url = find_url
|
45
45
|
|
46
46
|
{
|
47
47
|
title: extract_title,
|
@@ -71,14 +71,20 @@ module Html2rss
|
|
71
71
|
times.min
|
72
72
|
end
|
73
73
|
|
74
|
+
##
# Find the heading of the article.
#
# Among the heading tags found inside article_tag, the tag name that sorts
# first (e.g. "h1" before "h2") is chosen; of the elements with that name the
# one with the longest visible text wins.
#
# @return [Nokogiri::XML::Node, nil] the heading, or nil if article_tag contains no heading tag
def find_heading
  heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)

  return if heading_tags.empty?

  smallest_heading = heading_tags.keys.min

  # A heading without visible text counts as length 0. Comparing a nil size
  # with an Integer inside max_by would raise ArgumentError.
  # NOTE(review): assumes visible_text_from_tag returns a String or nil - confirm.
  heading_tags[smallest_heading].max_by { |tag| visible_text_from_tag(tag).to_s.size }
end
|
79
85
|
|
80
86
|
def extract_title
|
81
|
-
@extract_title ||= if heading.children.empty?
|
87
|
+
@extract_title ||= if heading && (heading.children.empty? || heading.text)
|
82
88
|
visible_text_from_tag(heading)
|
83
89
|
else
|
84
90
|
visible_text_from_tag(
|
@@ -101,9 +107,12 @@ module Html2rss
|
|
101
107
|
description.empty? ? nil : description
|
102
108
|
end
|
103
109
|
|
110
|
+
##
# Looks up the anchor closest to the heading, falling back to the article tag
# when no heading is set. Only anchors with a non-empty href are considered.
#
# Deliberately not memoized: the starting tag depends on whether heading is
# already set at the time of the call.
#
# @return [Object, nil] whatever SemanticHtml.find_closest_selector returns
def closest_anchor
  start_tag = heading || article_tag

  SemanticHtml.find_closest_selector(start_tag, selector: 'a[href]:not([href=""])')
end
|
114
|
+
|
104
115
|
def find_url
|
105
|
-
closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
|
106
|
-
selector: 'a[href]:not([href=""])')
|
107
116
|
href = closest_anchor&.[]('href')&.split('#')&.first&.strip
|
108
117
|
Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
|
109
118
|
end
|
@@ -113,8 +122,12 @@ module Html2rss
|
|
113
122
|
end
|
114
123
|
|
115
124
|
def generate_id
|
116
|
-
[
|
117
|
-
|
125
|
+
[
|
126
|
+
article_tag['id'],
|
127
|
+
article_tag.at_css('[id]')&.attr('id'),
|
128
|
+
extract_url&.path,
|
129
|
+
extract_url&.query
|
130
|
+
].compact.reject(&:empty?).first
|
118
131
|
end
|
119
132
|
end
|
120
133
|
end
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.14.0
|
4
|
+
version: 0.15.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -251,6 +251,7 @@ files:
|
|
251
251
|
- lib/html2rss/auto_source/reducer.rb
|
252
252
|
- lib/html2rss/auto_source/rss_builder.rb
|
253
253
|
- lib/html2rss/auto_source/scraper.rb
|
254
|
+
- lib/html2rss/auto_source/scraper/html.rb
|
254
255
|
- lib/html2rss/auto_source/scraper/schema.rb
|
255
256
|
- lib/html2rss/auto_source/scraper/schema/base.rb
|
256
257
|
- lib/html2rss/auto_source/scraper/semantic_html.rb
|
@@ -279,7 +280,7 @@ licenses:
|
|
279
280
|
- MIT
|
280
281
|
metadata:
|
281
282
|
allowed_push_host: https://rubygems.org
|
282
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
|
283
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
|
283
284
|
rubygems_mfa_required: 'true'
|
284
285
|
post_install_message:
|
285
286
|
rdoc_options: []
|