curation 1.1 → 1.6
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation.rb +55 -22
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
|
4
|
+
data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
|
7
|
+
data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
|
data/Gemfile.lock
CHANGED
data/lib/curation.rb
CHANGED
@@ -19,17 +19,13 @@ module Curation
|
|
19
19
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
20
20
|
]
|
21
21
|
|
22
|
-
def initialize(url)
|
22
|
+
def initialize(url, html = nil)
|
23
23
|
@url = url
|
24
|
+
@html = html
|
24
25
|
end
|
25
26
|
|
26
27
|
def title
|
27
|
-
|
28
|
-
json_ld.each do |ld|
|
29
|
-
return ld['headline'] if ld.has_key? 'headline'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
metainspector.best_title
|
28
|
+
@title = find_title
|
33
29
|
end
|
34
30
|
|
35
31
|
def image
|
@@ -46,7 +42,7 @@ module Curation
|
|
46
42
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
47
43
|
end
|
48
44
|
end
|
49
|
-
h =
|
45
|
+
h = nokogiri.dup
|
50
46
|
BLACKLIST.each do |tag|
|
51
47
|
h.css(tag).remove
|
52
48
|
end
|
@@ -59,53 +55,90 @@ module Curation
|
|
59
55
|
|
60
56
|
protected
|
61
57
|
|
58
|
+
def find_title
|
59
|
+
if json_ld.any?
|
60
|
+
# Some sites have tables in tables
|
61
|
+
json_ld.flatten!
|
62
|
+
json_ld.each do |ld|
|
63
|
+
return ld['headline'] if ld.has_key? 'headline'
|
64
|
+
end
|
65
|
+
end
|
66
|
+
begin
|
67
|
+
[
|
68
|
+
metainspector.best_title,
|
69
|
+
metainspector.title,
|
70
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
71
|
+
nokogiri.css('title')&.first&.inner_text
|
72
|
+
].each do |possibility|
|
73
|
+
return possibility unless possibility.blank?
|
74
|
+
end
|
75
|
+
rescue
|
76
|
+
puts 'Curation::Page find_title error'
|
77
|
+
end
|
78
|
+
return ''
|
79
|
+
end
|
80
|
+
|
62
81
|
def find_image
|
63
82
|
if json_ld.any?
|
64
83
|
json_ld.each do |ld|
|
65
84
|
if ld.has_key? 'image'
|
66
85
|
image_data = ld['image']
|
67
86
|
return image_data if image_data.is_a? String
|
68
|
-
|
87
|
+
if image_data.is_a? Array
|
88
|
+
first = image_data.first
|
89
|
+
return first if first.is_a? String
|
90
|
+
return first['url'] if first.is_a? Hash
|
91
|
+
end
|
69
92
|
return image_data['url'] if image_data.is_a? Hash
|
70
93
|
end
|
71
94
|
end
|
72
95
|
end
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
96
|
+
begin
|
97
|
+
[
|
98
|
+
metainspector.images.best,
|
99
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
100
|
+
].each do |possibility|
|
101
|
+
return possibility unless possibility.blank?
|
102
|
+
end
|
103
|
+
rescue
|
104
|
+
puts 'Curation::Page find_image error'
|
105
|
+
end
|
106
|
+
return ''
|
80
107
|
end
|
81
108
|
|
82
109
|
def json_ld
|
83
110
|
unless @json_ld
|
84
111
|
@json_ld = []
|
85
112
|
begin
|
86
|
-
options =
|
113
|
+
options = nokogiri.css('[type="application/ld+json"]')
|
87
114
|
options.each do |option|
|
88
115
|
string = option.inner_text
|
89
116
|
hash = JSON.parse(string)
|
90
117
|
@json_ld << hash
|
91
118
|
end
|
92
119
|
rescue
|
93
|
-
puts
|
120
|
+
puts 'Curation::Page json_ld error'
|
94
121
|
end
|
95
122
|
end
|
96
123
|
@json_ld
|
97
124
|
end
|
98
125
|
|
99
|
-
def
|
100
|
-
URI.open url
|
126
|
+
def html
|
127
|
+
@html ||= URI.open url
|
101
128
|
rescue
|
102
129
|
puts "Impossible to open #{url}"
|
103
130
|
end
|
104
131
|
|
132
|
+
def nokogiri
|
133
|
+
@nokogiri ||= Nokogiri::HTML html
|
134
|
+
rescue
|
135
|
+
puts 'Curation::Page nokogiri error'
|
136
|
+
end
|
137
|
+
|
105
138
|
def metainspector
|
106
|
-
@metainspector ||= MetaInspector.new url
|
139
|
+
@metainspector ||= MetaInspector.new url, document: html
|
107
140
|
rescue
|
108
|
-
puts
|
141
|
+
puts 'Curation::Page metainspector error'
|
109
142
|
end
|
110
143
|
end
|
111
144
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.1'
|
4
|
+
version: '1.6'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|