curation 1.6 → 1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +63 -24
- data/Rakefile +9 -1
- data/curation.gemspec +1 -0
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +152 -33
- metadata +20 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e68898021d1c54927e120e46d2b1282535aec94a4daa0c552d9edeb4d6dc1d88
|
|
4
|
+
data.tar.gz: d6ba0ccabe71d10efb60fb0a5806f94f3fdd42f7e49fe39f00d64c427455d450
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88310bda3ba8221af689f5848c24af2dfd48668caaf875b55f05626e09dbf072747b877e779b93a91c4e602b39cd23e61dc602662bb2b34e8b0e095c6cd9c488
|
|
7
|
+
data.tar.gz: c750a811383b541beefd3214accbdacda85ab7552ded3863734101cf1ffa42ec963b455c83f817e6176a2e31dc8d57a07ce14fac0f2567a54fd353c42f26d7fb
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,57 +1,96 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
curation (1.
|
|
4
|
+
curation (1.9)
|
|
5
|
+
htmlentities
|
|
5
6
|
metainspector
|
|
6
7
|
nokogiri
|
|
7
8
|
|
|
8
9
|
GEM
|
|
9
10
|
remote: https://rubygems.org/
|
|
10
11
|
specs:
|
|
11
|
-
addressable (2.
|
|
12
|
+
addressable (2.8.0)
|
|
12
13
|
public_suffix (>= 2.0.2, < 5.0)
|
|
14
|
+
ansi (1.5.0)
|
|
15
|
+
builder (3.2.4)
|
|
16
|
+
byebug (11.1.3)
|
|
13
17
|
domain_name (0.5.20190701)
|
|
14
18
|
unf (>= 0.0.5, < 1.0.0)
|
|
15
|
-
faraday (1.0
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
faraday (
|
|
19
|
+
faraday (1.10.0)
|
|
20
|
+
faraday-em_http (~> 1.0)
|
|
21
|
+
faraday-em_synchrony (~> 1.0)
|
|
22
|
+
faraday-excon (~> 1.1)
|
|
23
|
+
faraday-httpclient (~> 1.0)
|
|
24
|
+
faraday-multipart (~> 1.0)
|
|
25
|
+
faraday-net_http (~> 1.0)
|
|
26
|
+
faraday-net_http_persistent (~> 1.0)
|
|
27
|
+
faraday-patron (~> 1.0)
|
|
28
|
+
faraday-rack (~> 1.0)
|
|
29
|
+
faraday-retry (~> 1.0)
|
|
30
|
+
ruby2_keywords (>= 0.0.4)
|
|
31
|
+
faraday-cookie_jar (0.0.7)
|
|
32
|
+
faraday (>= 0.8.0)
|
|
19
33
|
http-cookie (~> 1.0.0)
|
|
34
|
+
faraday-em_http (1.0.0)
|
|
35
|
+
faraday-em_synchrony (1.0.0)
|
|
20
36
|
faraday-encoding (0.0.5)
|
|
21
37
|
faraday
|
|
38
|
+
faraday-excon (1.1.0)
|
|
22
39
|
faraday-http-cache (2.2.0)
|
|
23
40
|
faraday (>= 0.8)
|
|
24
|
-
|
|
41
|
+
faraday-httpclient (1.0.1)
|
|
42
|
+
faraday-multipart (1.0.3)
|
|
43
|
+
multipart-post (>= 1.2, < 3)
|
|
44
|
+
faraday-net_http (1.0.1)
|
|
45
|
+
faraday-net_http_persistent (1.2.0)
|
|
46
|
+
faraday-patron (1.0.0)
|
|
47
|
+
faraday-rack (1.0.0)
|
|
48
|
+
faraday-retry (1.0.3)
|
|
49
|
+
faraday_middleware (1.2.0)
|
|
25
50
|
faraday (~> 1.0)
|
|
26
|
-
fastimage (2.
|
|
27
|
-
|
|
51
|
+
fastimage (2.2.6)
|
|
52
|
+
htmlentities (4.3.4)
|
|
53
|
+
http-cookie (1.0.4)
|
|
28
54
|
domain_name (~> 0.5)
|
|
29
|
-
metainspector (5.
|
|
30
|
-
addressable (~> 2.7
|
|
31
|
-
faraday (
|
|
32
|
-
faraday-cookie_jar (~> 0.0
|
|
33
|
-
faraday-encoding (~> 0.0
|
|
34
|
-
faraday-http-cache (~> 2.2
|
|
35
|
-
faraday_middleware (~> 1.0
|
|
36
|
-
fastimage (~> 2.
|
|
37
|
-
nesty (~> 1.0
|
|
38
|
-
nokogiri (~> 1.
|
|
39
|
-
mini_portile2 (2.
|
|
55
|
+
metainspector (5.12.1)
|
|
56
|
+
addressable (~> 2.7)
|
|
57
|
+
faraday (>= 1.4, < 3.0)
|
|
58
|
+
faraday-cookie_jar (~> 0.0)
|
|
59
|
+
faraday-encoding (~> 0.0)
|
|
60
|
+
faraday-http-cache (~> 2.2)
|
|
61
|
+
faraday_middleware (~> 1.0)
|
|
62
|
+
fastimage (~> 2.2)
|
|
63
|
+
nesty (~> 1.0)
|
|
64
|
+
nokogiri (~> 1.11)
|
|
65
|
+
mini_portile2 (2.8.0)
|
|
66
|
+
minitest (5.15.0)
|
|
67
|
+
minitest-reporters (1.5.0)
|
|
68
|
+
ansi
|
|
69
|
+
builder
|
|
70
|
+
minitest (>= 5.0)
|
|
71
|
+
ruby-progressbar
|
|
40
72
|
multipart-post (2.1.1)
|
|
41
73
|
nesty (1.0.2)
|
|
42
|
-
nokogiri (1.
|
|
43
|
-
mini_portile2 (~> 2.
|
|
44
|
-
|
|
74
|
+
nokogiri (1.13.6)
|
|
75
|
+
mini_portile2 (~> 2.8.0)
|
|
76
|
+
racc (~> 1.4)
|
|
77
|
+
public_suffix (4.0.7)
|
|
78
|
+
racc (1.6.0)
|
|
45
79
|
rake (12.3.3)
|
|
80
|
+
ruby-progressbar (1.11.0)
|
|
81
|
+
ruby2_keywords (0.0.5)
|
|
46
82
|
unf (0.1.4)
|
|
47
83
|
unf_ext
|
|
48
|
-
unf_ext (0.0.
|
|
84
|
+
unf_ext (0.0.8.1)
|
|
49
85
|
|
|
50
86
|
PLATFORMS
|
|
51
87
|
ruby
|
|
52
88
|
|
|
53
89
|
DEPENDENCIES
|
|
90
|
+
byebug
|
|
54
91
|
curation!
|
|
92
|
+
minitest
|
|
93
|
+
minitest-reporters
|
|
55
94
|
rake (~> 12.0)
|
|
56
95
|
|
|
57
96
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/curation.gemspec
CHANGED
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
require "curation/version"
|
|
2
2
|
require "metainspector"
|
|
3
3
|
require "open-uri"
|
|
4
|
+
require "htmlentities"
|
|
4
5
|
|
|
5
6
|
module Curation
|
|
6
7
|
class Error < StandardError; end
|
|
7
8
|
|
|
8
9
|
class Page
|
|
9
|
-
attr_reader :url
|
|
10
|
+
attr_reader :url
|
|
10
11
|
|
|
11
12
|
BLACKLIST = [
|
|
12
13
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
|
@@ -20,46 +21,38 @@ module Curation
|
|
|
20
21
|
]
|
|
21
22
|
|
|
22
23
|
def initialize(url, html = nil)
|
|
23
|
-
@url = url
|
|
24
|
+
@url = url.to_s.gsub('http://', 'https://')
|
|
24
25
|
@html = html
|
|
25
26
|
end
|
|
26
27
|
|
|
27
28
|
def title
|
|
28
|
-
@title
|
|
29
|
+
@title ||= find_title
|
|
29
30
|
end
|
|
30
31
|
|
|
31
32
|
def image
|
|
32
|
-
@image
|
|
33
|
-
|
|
33
|
+
unless @image
|
|
34
|
+
@image = find_image
|
|
35
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
|
36
|
+
end
|
|
34
37
|
@image
|
|
35
38
|
end
|
|
36
39
|
|
|
37
40
|
def text
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
end
|
|
45
|
-
h = nokogiri.dup
|
|
46
|
-
BLACKLIST.each do |tag|
|
|
47
|
-
h.css(tag).remove
|
|
48
|
-
end
|
|
49
|
-
nodes = h.css('p')
|
|
50
|
-
nodes.xpath('//style').remove
|
|
51
|
-
text = nodes.to_html
|
|
52
|
-
text.gsub!('<br><br>', '<br>')
|
|
53
|
-
text
|
|
41
|
+
# require 'byebug'; byebug
|
|
42
|
+
@text ||= find_text
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def date
|
|
46
|
+
@date ||= find_date
|
|
54
47
|
end
|
|
55
48
|
|
|
56
49
|
protected
|
|
57
50
|
|
|
58
51
|
def find_title
|
|
59
52
|
if json_ld.any?
|
|
60
|
-
# Some sites have tables in tables
|
|
61
|
-
json_ld.flatten!
|
|
62
53
|
json_ld.each do |ld|
|
|
54
|
+
# require 'byebug'; byebug
|
|
55
|
+
ld = ld.first if ld.is_a?(Array)
|
|
63
56
|
return ld['headline'] if ld.has_key? 'headline'
|
|
64
57
|
end
|
|
65
58
|
end
|
|
@@ -70,7 +63,7 @@ module Curation
|
|
|
70
63
|
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
|
71
64
|
nokogiri.css('title')&.first&.inner_text
|
|
72
65
|
].each do |possibility|
|
|
73
|
-
return possibility unless possibility.
|
|
66
|
+
return possibility unless possibility.to_s.empty?
|
|
74
67
|
end
|
|
75
68
|
rescue
|
|
76
69
|
puts 'Curation::Page find_title error'
|
|
@@ -81,6 +74,7 @@ module Curation
|
|
|
81
74
|
def find_image
|
|
82
75
|
if json_ld.any?
|
|
83
76
|
json_ld.each do |ld|
|
|
77
|
+
ld = ld.first if ld.is_a?(Array)
|
|
84
78
|
if ld.has_key? 'image'
|
|
85
79
|
image_data = ld['image']
|
|
86
80
|
return image_data if image_data.is_a? String
|
|
@@ -98,7 +92,7 @@ module Curation
|
|
|
98
92
|
metainspector.images.best,
|
|
99
93
|
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
|
100
94
|
].each do |possibility|
|
|
101
|
-
|
|
95
|
+
return possibility unless possibility.to_s.empty?
|
|
102
96
|
end
|
|
103
97
|
rescue
|
|
104
98
|
puts 'Curation::Page find_image error'
|
|
@@ -106,16 +100,84 @@ module Curation
|
|
|
106
100
|
return ''
|
|
107
101
|
end
|
|
108
102
|
|
|
103
|
+
def find_text
|
|
104
|
+
text = find_text_with_json_ld || find_text_with_nokogiri
|
|
105
|
+
text.to_s.gsub!('<br><br>', '<br>')
|
|
106
|
+
# require 'byebug'; byebug
|
|
107
|
+
text = clean_encoding text
|
|
108
|
+
text
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def find_text_with_json_ld
|
|
112
|
+
if json_ld.any?
|
|
113
|
+
json_ld.each do |ld|
|
|
114
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
|
115
|
+
return ld['text'] if ld.has_key? 'text'
|
|
116
|
+
return ld['articleBody'] if ld.has_key? 'articleBody'
|
|
117
|
+
end
|
|
118
|
+
end
|
|
119
|
+
nil
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def find_text_with_nokogiri
|
|
123
|
+
h = nokogiri.dup
|
|
124
|
+
BLACKLIST.each do |tag|
|
|
125
|
+
h.css(tag).remove
|
|
126
|
+
end
|
|
127
|
+
nodes = h.css('p')
|
|
128
|
+
nodes.xpath('//style').remove
|
|
129
|
+
text = nodes.to_html
|
|
130
|
+
text
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def find_date
|
|
134
|
+
if json_ld.any?
|
|
135
|
+
json_ld.each do |ld|
|
|
136
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
|
137
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
|
138
|
+
end
|
|
139
|
+
end
|
|
140
|
+
return Date.parse metatags['date'] rescue nil
|
|
141
|
+
return Date.parse metatags['pubdate'] rescue nil
|
|
142
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
|
143
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
|
144
|
+
return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
|
|
145
|
+
chunks = html.split('DisplayDate')
|
|
146
|
+
if chunks.count > 1
|
|
147
|
+
value = chunks[1]
|
|
148
|
+
value = value.split(',').first
|
|
149
|
+
value = value.gsub('"', '')
|
|
150
|
+
value = value[1..-1] if value[0] == ':'
|
|
151
|
+
return Date.parse value rescue nil
|
|
152
|
+
end
|
|
153
|
+
begin
|
|
154
|
+
value = nokogiri.css('.postDate').first
|
|
155
|
+
value = value.inner_text
|
|
156
|
+
value = value.gsub(' — ', '')
|
|
157
|
+
return Date.parse value
|
|
158
|
+
rescue
|
|
159
|
+
end
|
|
160
|
+
begin
|
|
161
|
+
value = nokogiri.css('.gta_post_date').first
|
|
162
|
+
value = value.inner_text
|
|
163
|
+
return Date.parse value
|
|
164
|
+
rescue
|
|
165
|
+
end
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
private
|
|
169
|
+
|
|
109
170
|
def json_ld
|
|
110
|
-
unless @json_ld
|
|
171
|
+
unless defined?(@json_ld)
|
|
111
172
|
@json_ld = []
|
|
112
173
|
begin
|
|
113
174
|
options = nokogiri.css('[type="application/ld+json"]')
|
|
114
175
|
options.each do |option|
|
|
115
|
-
|
|
116
|
-
hash = JSON.parse(string)
|
|
117
|
-
@json_ld << hash
|
|
176
|
+
@json_ld << json_ld_from_object(option)
|
|
118
177
|
end
|
|
178
|
+
# Some sites have tables in tables
|
|
179
|
+
@json_ld.flatten!
|
|
180
|
+
# require 'byebug'; byebug
|
|
119
181
|
rescue
|
|
120
182
|
puts 'Curation::Page json_ld error'
|
|
121
183
|
end
|
|
@@ -123,22 +185,79 @@ module Curation
|
|
|
123
185
|
@json_ld
|
|
124
186
|
end
|
|
125
187
|
|
|
188
|
+
def json_ld_from_object(object)
|
|
189
|
+
JSON.parse object.inner_text
|
|
190
|
+
rescue
|
|
191
|
+
{}
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def file
|
|
195
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
|
196
|
+
rescue
|
|
197
|
+
puts "Curation::Page file error with url #{url}"
|
|
198
|
+
end
|
|
199
|
+
|
|
126
200
|
def html
|
|
127
|
-
@html
|
|
201
|
+
unless @html
|
|
202
|
+
file.rewind
|
|
203
|
+
@html = file.read
|
|
204
|
+
file.rewind
|
|
205
|
+
end
|
|
206
|
+
@html
|
|
128
207
|
rescue
|
|
129
|
-
puts "
|
|
208
|
+
puts "Curation::Page html error"
|
|
130
209
|
end
|
|
131
210
|
|
|
132
211
|
def nokogiri
|
|
133
|
-
@nokogiri
|
|
212
|
+
unless @nokogiri
|
|
213
|
+
if file.nil?
|
|
214
|
+
@nokogiri = metainspector.parsed
|
|
215
|
+
else
|
|
216
|
+
file.rewind
|
|
217
|
+
@nokogiri = Nokogiri::HTML file
|
|
218
|
+
file.rewind
|
|
219
|
+
end
|
|
220
|
+
end
|
|
221
|
+
@nokogiri
|
|
134
222
|
rescue
|
|
135
223
|
puts 'Curation::Page nokogiri error'
|
|
136
224
|
end
|
|
137
225
|
|
|
138
226
|
def metainspector
|
|
139
|
-
@metainspector
|
|
227
|
+
unless @metainspector
|
|
228
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
|
229
|
+
: MetaInspector.new(url, document: html)
|
|
230
|
+
end
|
|
231
|
+
@metainspector
|
|
140
232
|
rescue
|
|
141
233
|
puts 'Curation::Page metainspector error'
|
|
142
234
|
end
|
|
235
|
+
|
|
236
|
+
def metatags
|
|
237
|
+
@metatags ||= metainspector.meta_tag['name']
|
|
238
|
+
rescue
|
|
239
|
+
puts 'Curation::Page metatags error'
|
|
240
|
+
end
|
|
241
|
+
|
|
242
|
+
# rÃ©forme -> réforme
|
|
243
|
+
def clean_encoding(text)
|
|
244
|
+
clean_text = HTMLEntities.new.decode text
|
|
245
|
+
double_encoding = false
|
|
246
|
+
[
|
|
247
|
+
'é', # é
|
|
248
|
+
'è', # è
|
|
249
|
+
'î', # î
|
|
250
|
+
'ê', # ê
|
|
251
|
+
].each do |string|
|
|
252
|
+
# require 'byebug'; byebug
|
|
253
|
+
double_encoding = true if clean_text.include? string
|
|
254
|
+
end
|
|
255
|
+
if double_encoding
|
|
256
|
+
clean_text.encode('iso-8859-1', undef: :replace)
|
|
257
|
+
.force_encoding('utf-8')
|
|
258
|
+
else
|
|
259
|
+
text
|
|
260
|
+
end
|
|
261
|
+
end
|
|
143
262
|
end
|
|
144
263
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: curation
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: '1.
|
|
4
|
+
version: '1.9'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Arnaud Levy
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2022-05-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: metainspector
|
|
@@ -38,6 +38,20 @@ dependencies:
|
|
|
38
38
|
- - ">="
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: htmlentities
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0'
|
|
48
|
+
type: :runtime
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
41
55
|
description: When you build content curation tools, you need to extract the content
|
|
42
56
|
of pages (title, text, image...). This requires different strategies and some fine
|
|
43
57
|
tuning to work efficiently.
|
|
@@ -65,7 +79,7 @@ licenses:
|
|
|
65
79
|
metadata:
|
|
66
80
|
homepage_uri: https://github.com/arnaudlevy/curation
|
|
67
81
|
source_code_uri: https://github.com/arnaudlevy/curation
|
|
68
|
-
post_install_message:
|
|
82
|
+
post_install_message:
|
|
69
83
|
rdoc_options: []
|
|
70
84
|
require_paths:
|
|
71
85
|
- lib
|
|
@@ -80,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
80
94
|
- !ruby/object:Gem::Version
|
|
81
95
|
version: '0'
|
|
82
96
|
requirements: []
|
|
83
|
-
rubygems_version: 3.
|
|
84
|
-
signing_key:
|
|
97
|
+
rubygems_version: 3.1.6
|
|
98
|
+
signing_key:
|
|
85
99
|
specification_version: 4
|
|
86
100
|
summary: Curation of content
|
|
87
101
|
test_files: []
|