curation 1.2 → 1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +14 -1
- data/Rakefile +9 -1
- data/lib/curation.rb +130 -27
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
|
4
|
+
data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
|
7
|
+
data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (
|
4
|
+
curation (1.7)
|
5
5
|
metainspector
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,6 +10,9 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.7.0)
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
13
|
+
ansi (1.5.0)
|
14
|
+
builder (3.2.4)
|
15
|
+
byebug (11.1.3)
|
13
16
|
domain_name (0.5.20190701)
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
15
18
|
faraday (1.0.1)
|
@@ -37,12 +40,19 @@ GEM
|
|
37
40
|
nesty (~> 1.0.2)
|
38
41
|
nokogiri (~> 1.10.9)
|
39
42
|
mini_portile2 (2.4.0)
|
43
|
+
minitest (5.14.1)
|
44
|
+
minitest-reporters (1.4.2)
|
45
|
+
ansi
|
46
|
+
builder
|
47
|
+
minitest (>= 5.0)
|
48
|
+
ruby-progressbar
|
40
49
|
multipart-post (2.1.1)
|
41
50
|
nesty (1.0.2)
|
42
51
|
nokogiri (1.10.10)
|
43
52
|
mini_portile2 (~> 2.4.0)
|
44
53
|
public_suffix (4.0.5)
|
45
54
|
rake (12.3.3)
|
55
|
+
ruby-progressbar (1.10.1)
|
46
56
|
unf (0.1.4)
|
47
57
|
unf_ext
|
48
58
|
unf_ext (0.0.7.7)
|
@@ -51,7 +61,10 @@ PLATFORMS
|
|
51
61
|
ruby
|
52
62
|
|
53
63
|
DEPENDENCIES
|
64
|
+
byebug
|
54
65
|
curation!
|
66
|
+
minitest
|
67
|
+
minitest-reporters
|
55
68
|
rake (~> 12.0)
|
56
69
|
|
57
70
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation.rb
CHANGED
@@ -6,7 +6,7 @@ module Curation
|
|
6
6
|
class Error < StandardError; end
|
7
7
|
|
8
8
|
class Page
|
9
|
-
attr_reader :url
|
9
|
+
attr_reader :url
|
10
10
|
|
11
11
|
BLACKLIST = [
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
@@ -25,25 +25,80 @@ module Curation
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def title
|
28
|
+
@title ||= find_title
|
29
|
+
end
|
30
|
+
|
31
|
+
def image
|
32
|
+
unless @image
|
33
|
+
@image = find_image
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
35
|
+
end
|
36
|
+
@image
|
37
|
+
end
|
38
|
+
|
39
|
+
def text
|
40
|
+
@text ||= find_text
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
@date ||= find_date
|
45
|
+
end
|
46
|
+
|
47
|
+
protected
|
48
|
+
|
49
|
+
def find_title
|
28
50
|
if json_ld.any?
|
29
51
|
json_ld.each do |ld|
|
30
52
|
return ld['headline'] if ld.has_key? 'headline'
|
31
53
|
end
|
32
54
|
end
|
33
|
-
|
34
|
-
|
55
|
+
begin
|
56
|
+
[
|
57
|
+
metainspector.best_title,
|
58
|
+
metainspector.title,
|
59
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
60
|
+
nokogiri.css('title')&.first&.inner_text
|
61
|
+
].each do |possibility|
|
62
|
+
return possibility unless possibility.to_s.empty?
|
63
|
+
end
|
64
|
+
rescue
|
65
|
+
puts 'Curation::Page find_title error'
|
66
|
+
end
|
67
|
+
return ''
|
35
68
|
end
|
36
69
|
|
37
|
-
def
|
38
|
-
|
39
|
-
|
40
|
-
|
70
|
+
def find_image
|
71
|
+
if json_ld.any?
|
72
|
+
json_ld.each do |ld|
|
73
|
+
if ld.has_key? 'image'
|
74
|
+
image_data = ld['image']
|
75
|
+
return image_data if image_data.is_a? String
|
76
|
+
if image_data.is_a? Array
|
77
|
+
first = image_data.first
|
78
|
+
return first if first.is_a? String
|
79
|
+
return first['url'] if first.is_a? Hash
|
80
|
+
end
|
81
|
+
return image_data['url'] if image_data.is_a? Hash
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
begin
|
86
|
+
[
|
87
|
+
metainspector.images.best,
|
88
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
89
|
+
].each do |possibility|
|
90
|
+
return possibility unless possibility.to_s.empty?
|
91
|
+
end
|
92
|
+
rescue
|
93
|
+
puts 'Curation::Page find_image error'
|
94
|
+
end
|
95
|
+
return ''
|
41
96
|
end
|
42
97
|
|
43
|
-
def
|
98
|
+
def find_text
|
44
99
|
if json_ld.any?
|
45
100
|
json_ld.each do |ld|
|
46
|
-
next unless ld['@type']
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
47
102
|
return ld['text'] if ld.has_key? 'text'
|
48
103
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
49
104
|
end
|
@@ -59,24 +114,44 @@ module Curation
|
|
59
114
|
text
|
60
115
|
end
|
61
116
|
|
62
|
-
|
63
|
-
|
64
|
-
def find_image
|
117
|
+
def find_date
|
65
118
|
if json_ld.any?
|
66
119
|
json_ld.each do |ld|
|
67
|
-
|
68
|
-
|
69
|
-
return image_data if image_data.is_a? String
|
70
|
-
return image_data.first if image_data.is_a? Array
|
71
|
-
return image_data['url'] if image_data.is_a? Hash
|
72
|
-
end
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
73
122
|
end
|
74
123
|
end
|
75
|
-
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
128
|
+
chunks = html.split('DisplayDate')
|
129
|
+
if chunks.count > 1
|
130
|
+
value = chunks[1]
|
131
|
+
value = value.split(',').first
|
132
|
+
value = value.gsub('"', '')
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
134
|
+
return Date.parse value rescue nil
|
135
|
+
end
|
136
|
+
begin
|
137
|
+
value = nokogiri.css('.postDate').first
|
138
|
+
value = value.inner_text
|
139
|
+
value = value.gsub(' — ', '')
|
140
|
+
return Date.parse value
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
begin
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
145
|
+
value = value.inner_text
|
146
|
+
return Date.parse value
|
147
|
+
rescue
|
148
|
+
end
|
76
149
|
end
|
77
150
|
|
151
|
+
private
|
152
|
+
|
78
153
|
def json_ld
|
79
|
-
unless @json_ld
|
154
|
+
unless defined?(@json_ld)
|
80
155
|
@json_ld = []
|
81
156
|
begin
|
82
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
@@ -85,29 +160,57 @@ module Curation
|
|
85
160
|
hash = JSON.parse(string)
|
86
161
|
@json_ld << hash
|
87
162
|
end
|
163
|
+
# Some sites have tables in tables
|
164
|
+
@json_ld.flatten!
|
88
165
|
rescue
|
89
|
-
puts
|
166
|
+
puts 'Curation::Page json_ld error'
|
90
167
|
end
|
91
168
|
end
|
92
169
|
@json_ld
|
93
170
|
end
|
94
171
|
|
172
|
+
def file
|
173
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
174
|
+
rescue
|
175
|
+
puts "Curation::Page file error with url #{url}"
|
176
|
+
end
|
177
|
+
|
95
178
|
def html
|
96
|
-
@html
|
179
|
+
unless @html
|
180
|
+
file.rewind
|
181
|
+
@html = file.read
|
182
|
+
file.rewind
|
183
|
+
end
|
184
|
+
@html
|
97
185
|
rescue
|
98
|
-
puts "
|
186
|
+
puts "Curation::Page html error"
|
99
187
|
end
|
100
188
|
|
101
189
|
def nokogiri
|
102
|
-
@nokogiri
|
190
|
+
unless @nokogiri
|
191
|
+
file.rewind
|
192
|
+
@nokogiri = Nokogiri::HTML file
|
193
|
+
file.rewind
|
194
|
+
end
|
195
|
+
@nokogiri
|
103
196
|
rescue
|
104
|
-
puts
|
197
|
+
puts 'Curation::Page nokogiri error'
|
105
198
|
end
|
106
199
|
|
107
200
|
def metainspector
|
108
|
-
@metainspector
|
201
|
+
unless @metainspector
|
202
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
203
|
+
: MetaInspector.new(url, document: html)
|
204
|
+
end
|
205
|
+
@metainspector
|
206
|
+
rescue
|
207
|
+
puts 'Curation::Page metainspector error'
|
208
|
+
end
|
209
|
+
|
210
|
+
def metatags
|
211
|
+
@metatags ||= metainspector.meta_tag['name']
|
109
212
|
rescue
|
110
|
-
puts
|
213
|
+
puts 'Curation::Page metatags error'
|
111
214
|
end
|
112
215
|
end
|
113
216
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.
|
4
|
+
version: '1.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|