curation 1.0 → 1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation.rb +56 -25
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15ed9203a4164d0cfdfb7e973523008ddfe518ce21016731ae8a1dcddc74b0ab
|
4
|
+
data.tar.gz: e2269fbae2d6a6355f90709409de0fa1dd3e555dfd533b10d00d1073af0ac9ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72df82702be64fcee4e1e7725ecb3c1f831c036040c2ef445388feb493a43cd78b979ed63e99dd0168399c2bd014a0a00e2f7211f561b0ba88f13d0098daaa68
|
7
|
+
data.tar.gz: dd439269683719541f49204484e7d7fd42cbc0f0320e488d2f12a5a36bc472cd343343c4697d0a84a3f8746a308dad03b5e84e33b03adedd3984de73082e5bbb
|
data/Gemfile.lock
CHANGED
data/lib/curation.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require "curation/version"
|
2
|
+
require "metainspector"
|
3
|
+
require "open-uri"
|
2
4
|
|
3
5
|
module Curation
|
4
6
|
class Error < StandardError; end
|
@@ -17,22 +19,18 @@ module Curation
|
|
17
19
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
18
20
|
]
|
19
21
|
|
20
|
-
def initialize(url)
|
22
|
+
def initialize(url, html = nil)
|
21
23
|
@url = url
|
24
|
+
@html = html
|
22
25
|
end
|
23
26
|
|
24
27
|
def title
|
25
|
-
|
26
|
-
json_ld.each do |ld|
|
27
|
-
return ld['headline'] if ld.has_key? 'headline'
|
28
|
-
end
|
29
|
-
end
|
30
|
-
metainspector.best_title
|
28
|
+
@title = find_title
|
31
29
|
end
|
32
30
|
|
33
31
|
def image
|
34
32
|
@image = find_image
|
35
|
-
@image = @image.gsub('http://', 'https://')
|
33
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
36
34
|
@image
|
37
35
|
end
|
38
36
|
|
@@ -44,8 +42,7 @@ module Curation
|
|
44
42
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
45
43
|
end
|
46
44
|
end
|
47
|
-
|
48
|
-
h = html.dup
|
45
|
+
h = nokogiri.dup
|
49
46
|
BLACKLIST.each do |tag|
|
50
47
|
h.css(tag).remove
|
51
48
|
end
|
@@ -58,54 +55,88 @@ module Curation
|
|
58
55
|
|
59
56
|
protected
|
60
57
|
|
58
|
+
def find_title
|
59
|
+
if json_ld.any?
|
60
|
+
json_ld.each do |ld|
|
61
|
+
return ld['headline'] if ld.has_key? 'headline'
|
62
|
+
end
|
63
|
+
end
|
64
|
+
begin
|
65
|
+
[
|
66
|
+
metainspector.best_title,
|
67
|
+
metainspector.title,
|
68
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
69
|
+
nokogiri.css('title')&.first&.inner_text
|
70
|
+
].each do |possibility|
|
71
|
+
return possibility unless possibility.blank?
|
72
|
+
end
|
73
|
+
rescue
|
74
|
+
puts 'Curation::Page find_title error'
|
75
|
+
end
|
76
|
+
return ''
|
77
|
+
end
|
78
|
+
|
61
79
|
def find_image
|
62
80
|
if json_ld.any?
|
63
81
|
json_ld.each do |ld|
|
64
82
|
if ld.has_key? 'image'
|
65
83
|
image_data = ld['image']
|
66
84
|
return image_data if image_data.is_a? String
|
67
|
-
|
85
|
+
if image_data.is_a? Array
|
86
|
+
first = image_data.first
|
87
|
+
return first if first.is_a? String
|
88
|
+
return first['url'] if first.is_a? Hash
|
89
|
+
end
|
68
90
|
return image_data['url'] if image_data.is_a? Hash
|
69
91
|
end
|
70
92
|
end
|
71
93
|
end
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
94
|
+
begin
|
95
|
+
[
|
96
|
+
metainspector.images.best,
|
97
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
98
|
+
].each do |possibility|
|
99
|
+
return possibility unless possibility.blank?
|
100
|
+
end
|
101
|
+
rescue
|
102
|
+
puts 'Curation::Page find_image error'
|
103
|
+
end
|
104
|
+
return ''
|
79
105
|
end
|
80
106
|
|
81
107
|
def json_ld
|
82
108
|
unless @json_ld
|
83
109
|
@json_ld = []
|
84
110
|
begin
|
85
|
-
options =
|
111
|
+
options = nokogiri.css('[type="application/ld+json"]')
|
86
112
|
options.each do |option|
|
87
113
|
string = option.inner_text
|
88
114
|
hash = JSON.parse(string)
|
89
115
|
@json_ld << hash
|
90
116
|
end
|
91
117
|
rescue
|
92
|
-
puts
|
118
|
+
puts 'Curation::Page json_ld error'
|
93
119
|
end
|
94
120
|
end
|
95
121
|
@json_ld
|
96
122
|
end
|
97
123
|
|
98
|
-
def
|
99
|
-
|
100
|
-
URI.open url
|
124
|
+
def html
|
125
|
+
@html ||= URI.open url
|
101
126
|
rescue
|
102
127
|
puts "Impossible to open #{url}"
|
103
128
|
end
|
104
129
|
|
130
|
+
def nokogiri
|
131
|
+
@nokogiri ||= Nokogiri::HTML html
|
132
|
+
rescue
|
133
|
+
puts 'Curation::Page nokogiri error'
|
134
|
+
end
|
135
|
+
|
105
136
|
def metainspector
|
106
|
-
@metainspector ||= MetaInspector.new url
|
137
|
+
@metainspector ||= MetaInspector.new url, document: html
|
107
138
|
rescue
|
108
|
-
puts
|
139
|
+
puts 'Curation::Page metainspector error'
|
109
140
|
end
|
110
141
|
end
|
111
142
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.0'
|
4
|
+
version: '1.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|