curation 1.2 → 1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +14 -1
- data/Rakefile +9 -1
- data/lib/curation.rb +130 -27
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
|
4
|
+
data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
|
7
|
+
data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.2)
|
4
|
+
curation (1.7)
|
5
5
|
metainspector
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,6 +10,9 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.7.0)
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
13
|
+
ansi (1.5.0)
|
14
|
+
builder (3.2.4)
|
15
|
+
byebug (11.1.3)
|
13
16
|
domain_name (0.5.20190701)
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
15
18
|
faraday (1.0.1)
|
@@ -37,12 +40,19 @@ GEM
|
|
37
40
|
nesty (~> 1.0.2)
|
38
41
|
nokogiri (~> 1.10.9)
|
39
42
|
mini_portile2 (2.4.0)
|
43
|
+
minitest (5.14.1)
|
44
|
+
minitest-reporters (1.4.2)
|
45
|
+
ansi
|
46
|
+
builder
|
47
|
+
minitest (>= 5.0)
|
48
|
+
ruby-progressbar
|
40
49
|
multipart-post (2.1.1)
|
41
50
|
nesty (1.0.2)
|
42
51
|
nokogiri (1.10.10)
|
43
52
|
mini_portile2 (~> 2.4.0)
|
44
53
|
public_suffix (4.0.5)
|
45
54
|
rake (12.3.3)
|
55
|
+
ruby-progressbar (1.10.1)
|
46
56
|
unf (0.1.4)
|
47
57
|
unf_ext
|
48
58
|
unf_ext (0.0.7.7)
|
@@ -51,7 +61,10 @@ PLATFORMS
|
|
51
61
|
ruby
|
52
62
|
|
53
63
|
DEPENDENCIES
|
64
|
+
byebug
|
54
65
|
curation!
|
66
|
+
minitest
|
67
|
+
minitest-reporters
|
55
68
|
rake (~> 12.0)
|
56
69
|
|
57
70
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation.rb
CHANGED
@@ -6,7 +6,7 @@ module Curation
|
|
6
6
|
class Error < StandardError; end
|
7
7
|
|
8
8
|
class Page
|
9
|
-
attr_reader :url
|
9
|
+
attr_reader :url
|
10
10
|
|
11
11
|
BLACKLIST = [
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
@@ -25,25 +25,80 @@ module Curation
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def title
|
28
|
+
@title ||= find_title
|
29
|
+
end
|
30
|
+
|
31
|
+
def image
|
32
|
+
unless @image
|
33
|
+
@image = find_image
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
35
|
+
end
|
36
|
+
@image
|
37
|
+
end
|
38
|
+
|
39
|
+
def text
|
40
|
+
@text ||= find_text
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
@date ||= find_date
|
45
|
+
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
def find_title
|
28
50
|
if json_ld.any?
|
29
51
|
json_ld.each do |ld|
|
30
52
|
return ld['headline'] if ld.has_key? 'headline'
|
31
53
|
end
|
32
54
|
end
|
33
|
-
|
34
|
-
|
55
|
+
begin
|
56
|
+
[
|
57
|
+
metainspector.best_title,
|
58
|
+
metainspector.title,
|
59
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
60
|
+
nokogiri.css('title')&.first&.inner_text
|
61
|
+
].each do |possibility|
|
62
|
+
return possibility unless possibility.to_s.empty?
|
63
|
+
end
|
64
|
+
rescue
|
65
|
+
puts 'Curation::Page find_title error'
|
66
|
+
end
|
67
|
+
return ''
|
35
68
|
end
|
36
69
|
|
37
|
-
def
|
38
|
-
|
39
|
-
|
40
|
-
|
70
|
+
def find_image
|
71
|
+
if json_ld.any?
|
72
|
+
json_ld.each do |ld|
|
73
|
+
if ld.has_key? 'image'
|
74
|
+
image_data = ld['image']
|
75
|
+
return image_data if image_data.is_a? String
|
76
|
+
if image_data.is_a? Array
|
77
|
+
first = image_data.first
|
78
|
+
return first if first.is_a? String
|
79
|
+
return first['url'] if first.is_a? Hash
|
80
|
+
end
|
81
|
+
return image_data['url'] if image_data.is_a? Hash
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
begin
|
86
|
+
[
|
87
|
+
metainspector.images.best,
|
88
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
89
|
+
].each do |possibility|
|
90
|
+
return possibility unless possibility.to_s.empty?
|
91
|
+
end
|
92
|
+
rescue
|
93
|
+
puts 'Curation::Page find_image error'
|
94
|
+
end
|
95
|
+
return ''
|
41
96
|
end
|
42
97
|
|
43
|
-
def
|
98
|
+
def find_text
|
44
99
|
if json_ld.any?
|
45
100
|
json_ld.each do |ld|
|
46
|
-
next unless ld['@type']
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
47
102
|
return ld['text'] if ld.has_key? 'text'
|
48
103
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
49
104
|
end
|
@@ -59,24 +114,44 @@ module Curation
|
|
59
114
|
text
|
60
115
|
end
|
61
116
|
|
62
|
-
|
63
|
-
|
64
|
-
def find_image
|
117
|
+
def find_date
|
65
118
|
if json_ld.any?
|
66
119
|
json_ld.each do |ld|
|
67
|
-
|
68
|
-
|
69
|
-
return image_data if image_data.is_a? String
|
70
|
-
return image_data.first if image_data.is_a? Array
|
71
|
-
return image_data['url'] if image_data.is_a? Hash
|
72
|
-
end
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
73
122
|
end
|
74
123
|
end
|
75
|
-
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
128
|
+
chunks = html.split('DisplayDate')
|
129
|
+
if chunks.count > 1
|
130
|
+
value = chunks[1]
|
131
|
+
value = value.split(',').first
|
132
|
+
value = value.gsub('"', '')
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
134
|
+
return Date.parse value rescue nil
|
135
|
+
end
|
136
|
+
begin
|
137
|
+
value = nokogiri.css('.postDate').first
|
138
|
+
value = value.inner_text
|
139
|
+
value = value.gsub(' — ', '')
|
140
|
+
return Date.parse value
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
begin
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
145
|
+
value = value.inner_text
|
146
|
+
return Date.parse value
|
147
|
+
rescue
|
148
|
+
end
|
76
149
|
end
|
77
150
|
|
151
|
+
private
|
152
|
+
|
78
153
|
def json_ld
|
79
|
-
unless @json_ld
|
154
|
+
unless defined?(@json_ld)
|
80
155
|
@json_ld = []
|
81
156
|
begin
|
82
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
@@ -85,29 +160,57 @@ module Curation
|
|
85
160
|
hash = JSON.parse(string)
|
86
161
|
@json_ld << hash
|
87
162
|
end
|
163
|
+
# Some sites have tables in tables
|
164
|
+
@json_ld.flatten!
|
88
165
|
rescue
|
89
|
-
puts
|
166
|
+
puts 'Curation::Page json_ld error'
|
90
167
|
end
|
91
168
|
end
|
92
169
|
@json_ld
|
93
170
|
end
|
94
171
|
|
172
|
+
def file
|
173
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
174
|
+
rescue
|
175
|
+
puts "Curation::Page file error with url #{url}"
|
176
|
+
end
|
177
|
+
|
95
178
|
def html
|
96
|
-
@html
|
179
|
+
unless @html
|
180
|
+
file.rewind
|
181
|
+
@html = file.read
|
182
|
+
file.rewind
|
183
|
+
end
|
184
|
+
@html
|
97
185
|
rescue
|
98
|
-
puts "
|
186
|
+
puts "Curation::Page html error"
|
99
187
|
end
|
100
188
|
|
101
189
|
def nokogiri
|
102
|
-
@nokogiri
|
190
|
+
unless @nokogiri
|
191
|
+
file.rewind
|
192
|
+
@nokogiri = Nokogiri::HTML file
|
193
|
+
file.rewind
|
194
|
+
end
|
195
|
+
@nokogiri
|
103
196
|
rescue
|
104
|
-
puts
|
197
|
+
puts 'Curation::Page nokogiri error'
|
105
198
|
end
|
106
199
|
|
107
200
|
def metainspector
|
108
|
-
@metainspector
|
201
|
+
unless @metainspector
|
202
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
203
|
+
: MetaInspector.new(url, document: html)
|
204
|
+
end
|
205
|
+
@metainspector
|
206
|
+
rescue
|
207
|
+
puts 'Curation::Page metainspector error'
|
208
|
+
end
|
209
|
+
|
210
|
+
def metatags
|
211
|
+
@metatags ||= metainspector.meta_tag['name']
|
109
212
|
rescue
|
110
|
-
puts
|
213
|
+
puts 'Curation::Page metatags error'
|
111
214
|
end
|
112
215
|
end
|
113
216
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.2'
|
4
|
+
version: '1.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|