curation 1.0 → 1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation.rb +56 -25
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 15ed9203a4164d0cfdfb7e973523008ddfe518ce21016731ae8a1dcddc74b0ab
|
4
|
+
data.tar.gz: e2269fbae2d6a6355f90709409de0fa1dd3e555dfd533b10d00d1073af0ac9ec
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 72df82702be64fcee4e1e7725ecb3c1f831c036040c2ef445388feb493a43cd78b979ed63e99dd0168399c2bd014a0a00e2f7211f561b0ba88f13d0098daaa68
|
7
|
+
data.tar.gz: dd439269683719541f49204484e7d7fd42cbc0f0320e488d2f12a5a36bc472cd343343c4697d0a84a3f8746a308dad03b5e84e33b03adedd3984de73082e5bbb
|
data/Gemfile.lock
CHANGED
data/lib/curation.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
1
|
require "curation/version"
|
2
|
+
require "metainspector"
|
3
|
+
require "open-uri"
|
2
4
|
|
3
5
|
module Curation
|
4
6
|
class Error < StandardError; end
|
@@ -17,22 +19,18 @@ module Curation
|
|
17
19
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
18
20
|
]
|
19
21
|
|
20
|
-
def initialize(url)
|
22
|
+
def initialize(url, html = nil)
|
21
23
|
@url = url
|
24
|
+
@html = html
|
22
25
|
end
|
23
26
|
|
24
27
|
def title
|
25
|
-
|
26
|
-
json_ld.each do |ld|
|
27
|
-
return ld['headline'] if ld.has_key? 'headline'
|
28
|
-
end
|
29
|
-
end
|
30
|
-
metainspector.best_title
|
28
|
+
@title = find_title
|
31
29
|
end
|
32
30
|
|
33
31
|
def image
|
34
32
|
@image = find_image
|
35
|
-
@image = @image.gsub('http://', 'https://')
|
33
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
36
34
|
@image
|
37
35
|
end
|
38
36
|
|
@@ -44,8 +42,7 @@ module Curation
|
|
44
42
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
45
43
|
end
|
46
44
|
end
|
47
|
-
|
48
|
-
h = html.dup
|
45
|
+
h = nokogiri.dup
|
49
46
|
BLACKLIST.each do |tag|
|
50
47
|
h.css(tag).remove
|
51
48
|
end
|
@@ -58,54 +55,88 @@ module Curation
|
|
58
55
|
|
59
56
|
protected
|
60
57
|
|
58
|
+
def find_title
|
59
|
+
if json_ld.any?
|
60
|
+
json_ld.each do |ld|
|
61
|
+
return ld['headline'] if ld.has_key? 'headline'
|
62
|
+
end
|
63
|
+
end
|
64
|
+
begin
|
65
|
+
[
|
66
|
+
metainspector.best_title,
|
67
|
+
metainspector.title,
|
68
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
69
|
+
nokogiri.css('title')&.first&.inner_text
|
70
|
+
].each do |possibility|
|
71
|
+
return possibility unless possibility.blank?
|
72
|
+
end
|
73
|
+
rescue
|
74
|
+
puts 'Curation::Page find_title error'
|
75
|
+
end
|
76
|
+
return ''
|
77
|
+
end
|
78
|
+
|
61
79
|
def find_image
|
62
80
|
if json_ld.any?
|
63
81
|
json_ld.each do |ld|
|
64
82
|
if ld.has_key? 'image'
|
65
83
|
image_data = ld['image']
|
66
84
|
return image_data if image_data.is_a? String
|
67
|
-
|
85
|
+
if image_data.is_a? Array
|
86
|
+
first = image_data.first
|
87
|
+
return first if first.is_a? String
|
88
|
+
return first['url'] if first.is_a? Hash
|
89
|
+
end
|
68
90
|
return image_data['url'] if image_data.is_a? Hash
|
69
91
|
end
|
70
92
|
end
|
71
93
|
end
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
94
|
+
begin
|
95
|
+
[
|
96
|
+
metainspector.images.best,
|
97
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
98
|
+
].each do |possibility|
|
99
|
+
return possibility unless possibility.blank?
|
100
|
+
end
|
101
|
+
rescue
|
102
|
+
puts 'Curation::Page find_image error'
|
103
|
+
end
|
104
|
+
return ''
|
79
105
|
end
|
80
106
|
|
81
107
|
def json_ld
|
82
108
|
unless @json_ld
|
83
109
|
@json_ld = []
|
84
110
|
begin
|
85
|
-
options =
|
111
|
+
options = nokogiri.css('[type="application/ld+json"]')
|
86
112
|
options.each do |option|
|
87
113
|
string = option.inner_text
|
88
114
|
hash = JSON.parse(string)
|
89
115
|
@json_ld << hash
|
90
116
|
end
|
91
117
|
rescue
|
92
|
-
puts
|
118
|
+
puts 'Curation::Page json_ld error'
|
93
119
|
end
|
94
120
|
end
|
95
121
|
@json_ld
|
96
122
|
end
|
97
123
|
|
98
|
-
def
|
99
|
-
|
100
|
-
URI.open url
|
124
|
+
def html
|
125
|
+
@html ||= URI.open url
|
101
126
|
rescue
|
102
127
|
puts "Impossible to open #{url}"
|
103
128
|
end
|
104
129
|
|
130
|
+
def nokogiri
|
131
|
+
@nokogiri ||= Nokogiri::HTML html
|
132
|
+
rescue
|
133
|
+
puts 'Curation::Page nokogiri error'
|
134
|
+
end
|
135
|
+
|
105
136
|
def metainspector
|
106
|
-
@metainspector ||= MetaInspector.new url
|
137
|
+
@metainspector ||= MetaInspector.new url, document: html
|
107
138
|
rescue
|
108
|
-
puts
|
139
|
+
puts 'Curation::Page metainspector error'
|
109
140
|
end
|
110
141
|
end
|
111
142
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.0'
|
4
|
+
version: '1.5'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|