curation 1.1 → 1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation.rb +55 -22
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
|
4
|
+
data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
|
7
|
+
data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
|
data/Gemfile.lock
CHANGED
data/lib/curation.rb
CHANGED
@@ -19,17 +19,13 @@ module Curation
|
|
19
19
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
20
20
|
]
|
21
21
|
|
22
|
-
def initialize(url)
|
22
|
+
def initialize(url, html = nil)
|
23
23
|
@url = url
|
24
|
+
@html = html
|
24
25
|
end
|
25
26
|
|
26
27
|
def title
|
27
|
-
|
28
|
-
json_ld.each do |ld|
|
29
|
-
return ld['headline'] if ld.has_key? 'headline'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
metainspector.best_title
|
28
|
+
@title = find_title
|
33
29
|
end
|
34
30
|
|
35
31
|
def image
|
@@ -46,7 +42,7 @@ module Curation
|
|
46
42
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
47
43
|
end
|
48
44
|
end
|
49
|
-
h =
|
45
|
+
h = nokogiri.dup
|
50
46
|
BLACKLIST.each do |tag|
|
51
47
|
h.css(tag).remove
|
52
48
|
end
|
@@ -59,53 +55,90 @@ module Curation
|
|
59
55
|
|
60
56
|
protected
|
61
57
|
|
58
|
+
def find_title
|
59
|
+
if json_ld.any?
|
60
|
+
# Some sites have tables in tables
|
61
|
+
json_ld.flatten!
|
62
|
+
json_ld.each do |ld|
|
63
|
+
return ld['headline'] if ld.has_key? 'headline'
|
64
|
+
end
|
65
|
+
end
|
66
|
+
begin
|
67
|
+
[
|
68
|
+
metainspector.best_title,
|
69
|
+
metainspector.title,
|
70
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
71
|
+
nokogiri.css('title')&.first&.inner_text
|
72
|
+
].each do |possibility|
|
73
|
+
return possibility unless possibility.blank?
|
74
|
+
end
|
75
|
+
rescue
|
76
|
+
puts 'Curation::Page find_title error'
|
77
|
+
end
|
78
|
+
return ''
|
79
|
+
end
|
80
|
+
|
62
81
|
def find_image
|
63
82
|
if json_ld.any?
|
64
83
|
json_ld.each do |ld|
|
65
84
|
if ld.has_key? 'image'
|
66
85
|
image_data = ld['image']
|
67
86
|
return image_data if image_data.is_a? String
|
68
|
-
|
87
|
+
if image_data.is_a? Array
|
88
|
+
first = image_data.first
|
89
|
+
return first if first.is_a? String
|
90
|
+
return first['url'] if first.is_a? Hash
|
91
|
+
end
|
69
92
|
return image_data['url'] if image_data.is_a? Hash
|
70
93
|
end
|
71
94
|
end
|
72
95
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
96
|
+
begin
|
97
|
+
[
|
98
|
+
metainspector.images.best,
|
99
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
100
|
+
].each do |possibility|
|
101
|
+
return possibility unless possibility.blank?
|
102
|
+
end
|
103
|
+
rescue
|
104
|
+
puts 'Curation::Page find_image error'
|
105
|
+
end
|
106
|
+
return ''
|
80
107
|
end
|
81
108
|
|
82
109
|
def json_ld
|
83
110
|
unless @json_ld
|
84
111
|
@json_ld = []
|
85
112
|
begin
|
86
|
-
options =
|
113
|
+
options = nokogiri.css('[type="application/ld+json"]')
|
87
114
|
options.each do |option|
|
88
115
|
string = option.inner_text
|
89
116
|
hash = JSON.parse(string)
|
90
117
|
@json_ld << hash
|
91
118
|
end
|
92
119
|
rescue
|
93
|
-
puts
|
120
|
+
puts 'Curation::Page json_ld error'
|
94
121
|
end
|
95
122
|
end
|
96
123
|
@json_ld
|
97
124
|
end
|
98
125
|
|
99
|
-
def
|
100
|
-
URI.open url
|
126
|
+
def html
|
127
|
+
@html ||= URI.open url
|
101
128
|
rescue
|
102
129
|
puts "Impossible to open #{url}"
|
103
130
|
end
|
104
131
|
|
132
|
+
def nokogiri
|
133
|
+
@nokogiri ||= Nokogiri::HTML html
|
134
|
+
rescue
|
135
|
+
puts 'Curation::Page nokogiri error'
|
136
|
+
end
|
137
|
+
|
105
138
|
def metainspector
|
106
|
-
@metainspector ||= MetaInspector.new url
|
139
|
+
@metainspector ||= MetaInspector.new url, document: html
|
107
140
|
rescue
|
108
|
-
puts
|
141
|
+
puts 'Curation::Page metainspector error'
|
109
142
|
end
|
110
143
|
end
|
111
144
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.1'
|
4
|
+
version: '1.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|