curation 1.6 → 1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +14 -1
- data/Rakefile +9 -1
- data/lib/curation.rb +101 -29
- data/lib/curation/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
|
4
|
+
data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
|
7
|
+
data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.6)
|
4
|
+
curation (1.7)
|
5
5
|
metainspector
|
6
6
|
nokogiri
|
7
7
|
|
@@ -10,6 +10,9 @@ GEM
|
|
10
10
|
specs:
|
11
11
|
addressable (2.7.0)
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
13
|
+
ansi (1.5.0)
|
14
|
+
builder (3.2.4)
|
15
|
+
byebug (11.1.3)
|
13
16
|
domain_name (0.5.20190701)
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
15
18
|
faraday (1.0.1)
|
@@ -37,12 +40,19 @@ GEM
|
|
37
40
|
nesty (~> 1.0.2)
|
38
41
|
nokogiri (~> 1.10.9)
|
39
42
|
mini_portile2 (2.4.0)
|
43
|
+
minitest (5.14.1)
|
44
|
+
minitest-reporters (1.4.2)
|
45
|
+
ansi
|
46
|
+
builder
|
47
|
+
minitest (>= 5.0)
|
48
|
+
ruby-progressbar
|
40
49
|
multipart-post (2.1.1)
|
41
50
|
nesty (1.0.2)
|
42
51
|
nokogiri (1.10.10)
|
43
52
|
mini_portile2 (~> 2.4.0)
|
44
53
|
public_suffix (4.0.5)
|
45
54
|
rake (12.3.3)
|
55
|
+
ruby-progressbar (1.10.1)
|
46
56
|
unf (0.1.4)
|
47
57
|
unf_ext
|
48
58
|
unf_ext (0.0.7.7)
|
@@ -51,7 +61,10 @@ PLATFORMS
|
|
51
61
|
ruby
|
52
62
|
|
53
63
|
DEPENDENCIES
|
64
|
+
byebug
|
54
65
|
curation!
|
66
|
+
minitest
|
67
|
+
minitest-reporters
|
55
68
|
rake (~> 12.0)
|
56
69
|
|
57
70
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation.rb
CHANGED
@@ -6,7 +6,7 @@ module Curation
|
|
6
6
|
class Error < StandardError; end
|
7
7
|
|
8
8
|
class Page
|
9
|
-
attr_reader :url
|
9
|
+
attr_reader :url
|
10
10
|
|
11
11
|
BLACKLIST = [
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
@@ -25,40 +25,29 @@ module Curation
|
|
25
25
|
end
|
26
26
|
|
27
27
|
def title
|
28
|
-
@title
|
28
|
+
@title ||= find_title
|
29
29
|
end
|
30
30
|
|
31
31
|
def image
|
32
|
-
@image
|
33
|
-
|
32
|
+
unless @image
|
33
|
+
@image = find_image
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
35
|
+
end
|
34
36
|
@image
|
35
37
|
end
|
36
38
|
|
37
39
|
def text
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
end
|
44
|
-
end
|
45
|
-
h = nokogiri.dup
|
46
|
-
BLACKLIST.each do |tag|
|
47
|
-
h.css(tag).remove
|
48
|
-
end
|
49
|
-
nodes = h.css('p')
|
50
|
-
nodes.xpath('//style').remove
|
51
|
-
text = nodes.to_html
|
52
|
-
text.gsub!('<br><br>', '<br>')
|
53
|
-
text
|
40
|
+
@text ||= find_text
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
@date ||= find_date
|
54
45
|
end
|
55
46
|
|
56
47
|
protected
|
57
48
|
|
58
49
|
def find_title
|
59
50
|
if json_ld.any?
|
60
|
-
# Some sites have tables in tables
|
61
|
-
json_ld.flatten!
|
62
51
|
json_ld.each do |ld|
|
63
52
|
return ld['headline'] if ld.has_key? 'headline'
|
64
53
|
end
|
@@ -70,7 +59,7 @@ module Curation
|
|
70
59
|
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
71
60
|
nokogiri.css('title')&.first&.inner_text
|
72
61
|
].each do |possibility|
|
73
|
-
return possibility unless possibility.
|
62
|
+
return possibility unless possibility.to_s.empty?
|
74
63
|
end
|
75
64
|
rescue
|
76
65
|
puts 'Curation::Page find_title error'
|
@@ -98,7 +87,7 @@ module Curation
|
|
98
87
|
metainspector.images.best,
|
99
88
|
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
100
89
|
].each do |possibility|
|
101
|
-
|
90
|
+
return possibility unless possibility.to_s.empty?
|
102
91
|
end
|
103
92
|
rescue
|
104
93
|
puts 'Curation::Page find_image error'
|
@@ -106,8 +95,63 @@ module Curation
|
|
106
95
|
return ''
|
107
96
|
end
|
108
97
|
|
98
|
+
def find_text
|
99
|
+
if json_ld.any?
|
100
|
+
json_ld.each do |ld|
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
102
|
+
return ld['text'] if ld.has_key? 'text'
|
103
|
+
return ld['articleBody'] if ld.has_key? 'articleBody'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
h = nokogiri.dup
|
107
|
+
BLACKLIST.each do |tag|
|
108
|
+
h.css(tag).remove
|
109
|
+
end
|
110
|
+
nodes = h.css('p')
|
111
|
+
nodes.xpath('//style').remove
|
112
|
+
text = nodes.to_html
|
113
|
+
text.gsub!('<br><br>', '<br>')
|
114
|
+
text
|
115
|
+
end
|
116
|
+
|
117
|
+
def find_date
|
118
|
+
if json_ld.any?
|
119
|
+
json_ld.each do |ld|
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
122
|
+
end
|
123
|
+
end
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
128
|
+
chunks = html.split('DisplayDate')
|
129
|
+
if chunks.count > 1
|
130
|
+
value = chunks[1]
|
131
|
+
value = value.split(',').first
|
132
|
+
value = value.gsub('"', '')
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
134
|
+
return Date.parse value rescue nil
|
135
|
+
end
|
136
|
+
begin
|
137
|
+
value = nokogiri.css('.postDate').first
|
138
|
+
value = value.inner_text
|
139
|
+
value = value.gsub(' — ', '')
|
140
|
+
return Date.parse value
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
begin
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
145
|
+
value = value.inner_text
|
146
|
+
return Date.parse value
|
147
|
+
rescue
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
109
153
|
def json_ld
|
110
|
-
unless @json_ld
|
154
|
+
unless defined?(@json_ld)
|
111
155
|
@json_ld = []
|
112
156
|
begin
|
113
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
@@ -116,6 +160,8 @@ module Curation
|
|
116
160
|
hash = JSON.parse(string)
|
117
161
|
@json_ld << hash
|
118
162
|
end
|
163
|
+
# Some sites have tables in tables
|
164
|
+
@json_ld.flatten!
|
119
165
|
rescue
|
120
166
|
puts 'Curation::Page json_ld error'
|
121
167
|
end
|
@@ -123,22 +169,48 @@ module Curation
|
|
123
169
|
@json_ld
|
124
170
|
end
|
125
171
|
|
172
|
+
def file
|
173
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
174
|
+
rescue
|
175
|
+
puts "Curation::Page file error with url #{url}"
|
176
|
+
end
|
177
|
+
|
126
178
|
def html
|
127
|
-
@html
|
179
|
+
unless @html
|
180
|
+
file.rewind
|
181
|
+
@html = file.read
|
182
|
+
file.rewind
|
183
|
+
end
|
184
|
+
@html
|
128
185
|
rescue
|
129
|
-
puts "
|
186
|
+
puts "Curation::Page html error"
|
130
187
|
end
|
131
188
|
|
132
189
|
def nokogiri
|
133
|
-
@nokogiri
|
190
|
+
unless @nokogiri
|
191
|
+
file.rewind
|
192
|
+
@nokogiri = Nokogiri::HTML file
|
193
|
+
file.rewind
|
194
|
+
end
|
195
|
+
@nokogiri
|
134
196
|
rescue
|
135
197
|
puts 'Curation::Page nokogiri error'
|
136
198
|
end
|
137
199
|
|
138
200
|
def metainspector
|
139
|
-
@metainspector
|
201
|
+
unless @metainspector
|
202
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
203
|
+
: MetaInspector.new(url, document: html)
|
204
|
+
end
|
205
|
+
@metainspector
|
140
206
|
rescue
|
141
207
|
puts 'Curation::Page metainspector error'
|
142
208
|
end
|
209
|
+
|
210
|
+
def metatags
|
211
|
+
@metatags ||= metainspector.meta_tag['name']
|
212
|
+
rescue
|
213
|
+
puts 'Curation::Page metatags error'
|
214
|
+
end
|
143
215
|
end
|
144
216
|
end
|
data/lib/curation/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.6'
|
4
|
+
version: '1.7'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|