curation 1.6 → 1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +14 -1
- data/Rakefile +9 -1
- data/lib/curation.rb +101 -29
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
|
4
|
+
data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
|
7
|
+
data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.6)
|
4
|
+
curation (1.7)
|
5
5
|
metainspector
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,6 +10,9 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.7.0)
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
13
|
+
ansi (1.5.0)
|
14
|
+
builder (3.2.4)
|
15
|
+
byebug (11.1.3)
|
13
16
|
domain_name (0.5.20190701)
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
15
18
|
faraday (1.0.1)
|
@@ -37,12 +40,19 @@ GEM
|
|
37
40
|
nesty (~> 1.0.2)
|
38
41
|
nokogiri (~> 1.10.9)
|
39
42
|
mini_portile2 (2.4.0)
|
43
|
+
minitest (5.14.1)
|
44
|
+
minitest-reporters (1.4.2)
|
45
|
+
ansi
|
46
|
+
builder
|
47
|
+
minitest (>= 5.0)
|
48
|
+
ruby-progressbar
|
40
49
|
multipart-post (2.1.1)
|
41
50
|
nesty (1.0.2)
|
42
51
|
nokogiri (1.10.10)
|
43
52
|
mini_portile2 (~> 2.4.0)
|
44
53
|
public_suffix (4.0.5)
|
45
54
|
rake (12.3.3)
|
55
|
+
ruby-progressbar (1.10.1)
|
46
56
|
unf (0.1.4)
|
47
57
|
unf_ext
|
48
58
|
unf_ext (0.0.7.7)
|
@@ -51,7 +61,10 @@ PLATFORMS
|
|
51
61
|
ruby
|
52
62
|
|
53
63
|
DEPENDENCIES
|
64
|
+
byebug
|
54
65
|
curation!
|
66
|
+
minitest
|
67
|
+
minitest-reporters
|
55
68
|
rake (~> 12.0)
|
56
69
|
|
57
70
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation.rb
CHANGED
@@ -6,7 +6,7 @@ module Curation
|
|
6
6
|
class Error < StandardError; end
|
7
7
|
|
8
8
|
class Page
|
9
|
-
attr_reader :url
|
9
|
+
attr_reader :url
|
10
10
|
|
11
11
|
BLACKLIST = [
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
@@ -25,40 +25,29 @@ module Curation
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def title
|
28
|
-
@title
|
28
|
+
@title ||= find_title
|
29
29
|
end
|
30
30
|
|
31
31
|
def image
|
32
|
-
@image
|
33
|
-
|
32
|
+
unless @image
|
33
|
+
@image = find_image
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
35
|
+
end
|
34
36
|
@image
|
35
37
|
end
|
36
38
|
|
37
39
|
def text
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
end
|
44
|
-
end
|
45
|
-
h = nokogiri.dup
|
46
|
-
BLACKLIST.each do |tag|
|
47
|
-
h.css(tag).remove
|
48
|
-
end
|
49
|
-
nodes = h.css('p')
|
50
|
-
nodes.xpath('//style').remove
|
51
|
-
text = nodes.to_html
|
52
|
-
text.gsub!('<br><br>', '<br>')
|
53
|
-
text
|
40
|
+
@text ||= find_text
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
@date ||= find_date
|
54
45
|
end
|
55
46
|
|
56
47
|
protected
|
57
48
|
|
58
49
|
def find_title
|
59
50
|
if json_ld.any?
|
60
|
-
# Some sites have tables in tables
|
61
|
-
json_ld.flatten!
|
62
51
|
json_ld.each do |ld|
|
63
52
|
return ld['headline'] if ld.has_key? 'headline'
|
64
53
|
end
|
@@ -70,7 +59,7 @@ module Curation
|
|
70
59
|
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
71
60
|
nokogiri.css('title')&.first&.inner_text
|
72
61
|
].each do |possibility|
|
73
|
-
return possibility unless possibility.
|
62
|
+
return possibility unless possibility.to_s.empty?
|
74
63
|
end
|
75
64
|
rescue
|
76
65
|
puts 'Curation::Page find_title error'
|
@@ -98,7 +87,7 @@ module Curation
|
|
98
87
|
metainspector.images.best,
|
99
88
|
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
100
89
|
].each do |possibility|
|
101
|
-
|
90
|
+
return possibility unless possibility.to_s.empty?
|
102
91
|
end
|
103
92
|
rescue
|
104
93
|
puts 'Curation::Page find_image error'
|
@@ -106,8 +95,63 @@ module Curation
|
|
106
95
|
return ''
|
107
96
|
end
|
108
97
|
|
98
|
+
def find_text
|
99
|
+
if json_ld.any?
|
100
|
+
json_ld.each do |ld|
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
102
|
+
return ld['text'] if ld.has_key? 'text'
|
103
|
+
return ld['articleBody'] if ld.has_key? 'articleBody'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
h = nokogiri.dup
|
107
|
+
BLACKLIST.each do |tag|
|
108
|
+
h.css(tag).remove
|
109
|
+
end
|
110
|
+
nodes = h.css('p')
|
111
|
+
nodes.xpath('//style').remove
|
112
|
+
text = nodes.to_html
|
113
|
+
text.gsub!('<br><br>', '<br>')
|
114
|
+
text
|
115
|
+
end
|
116
|
+
|
117
|
+
def find_date
|
118
|
+
if json_ld.any?
|
119
|
+
json_ld.each do |ld|
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
122
|
+
end
|
123
|
+
end
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
128
|
+
chunks = html.split('DisplayDate')
|
129
|
+
if chunks.count > 1
|
130
|
+
value = chunks[1]
|
131
|
+
value = value.split(',').first
|
132
|
+
value = value.gsub('"', '')
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
134
|
+
return Date.parse value rescue nil
|
135
|
+
end
|
136
|
+
begin
|
137
|
+
value = nokogiri.css('.postDate').first
|
138
|
+
value = value.inner_text
|
139
|
+
value = value.gsub(' — ', '')
|
140
|
+
return Date.parse value
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
begin
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
145
|
+
value = value.inner_text
|
146
|
+
return Date.parse value
|
147
|
+
rescue
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
109
153
|
def json_ld
|
110
|
-
unless @json_ld
|
154
|
+
unless defined?(@json_ld)
|
111
155
|
@json_ld = []
|
112
156
|
begin
|
113
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
@@ -116,6 +160,8 @@ module Curation
|
|
116
160
|
hash = JSON.parse(string)
|
117
161
|
@json_ld << hash
|
118
162
|
end
|
163
|
+
# Some sites have tables in tables
|
164
|
+
@json_ld.flatten!
|
119
165
|
rescue
|
120
166
|
puts 'Curation::Page json_ld error'
|
121
167
|
end
|
@@ -123,22 +169,48 @@ module Curation
|
|
123
169
|
@json_ld
|
124
170
|
end
|
125
171
|
|
172
|
+
def file
|
173
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
174
|
+
rescue
|
175
|
+
puts "Curation::Page file error with url #{url}"
|
176
|
+
end
|
177
|
+
|
126
178
|
def html
|
127
|
-
@html
|
179
|
+
unless @html
|
180
|
+
file.rewind
|
181
|
+
@html = file.read
|
182
|
+
file.rewind
|
183
|
+
end
|
184
|
+
@html
|
128
185
|
rescue
|
129
|
-
puts "
|
186
|
+
puts "Curation::Page html error"
|
130
187
|
end
|
131
188
|
|
132
189
|
def nokogiri
|
133
|
-
@nokogiri
|
190
|
+
unless @nokogiri
|
191
|
+
file.rewind
|
192
|
+
@nokogiri = Nokogiri::HTML file
|
193
|
+
file.rewind
|
194
|
+
end
|
195
|
+
@nokogiri
|
134
196
|
rescue
|
135
197
|
puts 'Curation::Page nokogiri error'
|
136
198
|
end
|
137
199
|
|
138
200
|
def metainspector
|
139
|
-
@metainspector
|
201
|
+
unless @metainspector
|
202
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
203
|
+
: MetaInspector.new(url, document: html)
|
204
|
+
end
|
205
|
+
@metainspector
|
140
206
|
rescue
|
141
207
|
puts 'Curation::Page metainspector error'
|
142
208
|
end
|
209
|
+
|
210
|
+
def metatags
|
211
|
+
@metatags ||= metainspector.meta_tag['name']
|
212
|
+
rescue
|
213
|
+
puts 'Curation::Page metatags error'
|
214
|
+
end
|
143
215
|
end
|
144
216
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.6'
|
4
|
+
version: '1.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|