curation 1.7 → 1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +56 -32
- data/curation.gemspec +2 -1
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +55 -8
- metadata +21 -7
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: a23e967e5d017ce61f9719647f45c20e1336aff067bc2323f64641dfac695f75
|
|
4
|
+
data.tar.gz: 49427d9325a27034c1969d71875dcd4eacff4f5c3ac9625ccc6cdb554c4c2df4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b01f4209b09d6ec09917096159b98be71b31fd0952524a12c7310ba29f88ee8c888bc2927015ef2b1fb529cfe46964c08c743c818807518dee6bfa3cc32f6767
|
|
7
|
+
data.tar.gz: 564d14e3afaa17f00ac7b034917c7b28612b54b6f71126a36de4d6d43def7b8c19814475ee5b13651bbea7e7eed752b873f918af10baf169bd153d66b8d05c7d
|
data/Gemfile.lock
CHANGED
|
@@ -1,64 +1,88 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
curation (1.7)
|
|
5
|
-
|
|
4
|
+
curation (1.10)
|
|
5
|
+
htmlentities
|
|
6
|
+
metainspector (~> 5.12)
|
|
6
7
|
nokogiri
|
|
7
8
|
|
|
8
9
|
GEM
|
|
9
10
|
remote: https://rubygems.org/
|
|
10
11
|
specs:
|
|
11
|
-
addressable (2.
|
|
12
|
+
addressable (2.8.0)
|
|
12
13
|
public_suffix (>= 2.0.2, < 5.0)
|
|
13
14
|
ansi (1.5.0)
|
|
14
15
|
builder (3.2.4)
|
|
15
16
|
byebug (11.1.3)
|
|
16
17
|
domain_name (0.5.20190701)
|
|
17
18
|
unf (>= 0.0.5, < 1.0.0)
|
|
18
|
-
faraday (1.0
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
faraday (
|
|
19
|
+
faraday (1.10.0)
|
|
20
|
+
faraday-em_http (~> 1.0)
|
|
21
|
+
faraday-em_synchrony (~> 1.0)
|
|
22
|
+
faraday-excon (~> 1.1)
|
|
23
|
+
faraday-httpclient (~> 1.0)
|
|
24
|
+
faraday-multipart (~> 1.0)
|
|
25
|
+
faraday-net_http (~> 1.0)
|
|
26
|
+
faraday-net_http_persistent (~> 1.0)
|
|
27
|
+
faraday-patron (~> 1.0)
|
|
28
|
+
faraday-rack (~> 1.0)
|
|
29
|
+
faraday-retry (~> 1.0)
|
|
30
|
+
ruby2_keywords (>= 0.0.4)
|
|
31
|
+
faraday-cookie_jar (0.0.7)
|
|
32
|
+
faraday (>= 0.8.0)
|
|
22
33
|
http-cookie (~> 1.0.0)
|
|
34
|
+
faraday-em_http (1.0.0)
|
|
35
|
+
faraday-em_synchrony (1.0.0)
|
|
23
36
|
faraday-encoding (0.0.5)
|
|
24
37
|
faraday
|
|
25
|
-
faraday-
|
|
38
|
+
faraday-excon (1.1.0)
|
|
39
|
+
faraday-http-cache (2.4.0)
|
|
26
40
|
faraday (>= 0.8)
|
|
27
|
-
|
|
41
|
+
faraday-httpclient (1.0.1)
|
|
42
|
+
faraday-multipart (1.0.4)
|
|
43
|
+
multipart-post (~> 2)
|
|
44
|
+
faraday-net_http (1.0.1)
|
|
45
|
+
faraday-net_http_persistent (1.2.0)
|
|
46
|
+
faraday-patron (1.0.0)
|
|
47
|
+
faraday-rack (1.0.0)
|
|
48
|
+
faraday-retry (1.0.3)
|
|
49
|
+
faraday_middleware (1.2.0)
|
|
28
50
|
faraday (~> 1.0)
|
|
29
|
-
fastimage (2.
|
|
30
|
-
|
|
51
|
+
fastimage (2.2.6)
|
|
52
|
+
htmlentities (4.3.4)
|
|
53
|
+
http-cookie (1.0.5)
|
|
31
54
|
domain_name (~> 0.5)
|
|
32
|
-
metainspector (5.
|
|
33
|
-
addressable (~> 2.7
|
|
34
|
-
faraday (
|
|
35
|
-
faraday-cookie_jar (~> 0.0
|
|
36
|
-
faraday-encoding (~> 0.0
|
|
37
|
-
faraday-http-cache (~> 2.2
|
|
38
|
-
faraday_middleware (~> 1.0
|
|
39
|
-
fastimage (~> 2.
|
|
40
|
-
nesty (~> 1.0
|
|
41
|
-
nokogiri (~> 1.
|
|
42
|
-
|
|
43
|
-
minitest (5.
|
|
44
|
-
minitest-reporters (1.4.2)
|
|
55
|
+
metainspector (5.12.1)
|
|
56
|
+
addressable (~> 2.7)
|
|
57
|
+
faraday (>= 1.4, < 3.0)
|
|
58
|
+
faraday-cookie_jar (~> 0.0)
|
|
59
|
+
faraday-encoding (~> 0.0)
|
|
60
|
+
faraday-http-cache (~> 2.2)
|
|
61
|
+
faraday_middleware (~> 1.0)
|
|
62
|
+
fastimage (~> 2.2)
|
|
63
|
+
nesty (~> 1.0)
|
|
64
|
+
nokogiri (~> 1.11)
|
|
65
|
+
minitest (5.15.0)
|
|
66
|
+
minitest-reporters (1.5.0)
|
|
45
67
|
ansi
|
|
46
68
|
builder
|
|
47
69
|
minitest (>= 5.0)
|
|
48
70
|
ruby-progressbar
|
|
49
|
-
multipart-post (2.
|
|
71
|
+
multipart-post (2.2.0)
|
|
50
72
|
nesty (1.0.2)
|
|
51
|
-
nokogiri (1.
|
|
52
|
-
|
|
53
|
-
public_suffix (4.0.
|
|
73
|
+
nokogiri (1.13.6-x86_64-darwin)
|
|
74
|
+
racc (~> 1.4)
|
|
75
|
+
public_suffix (4.0.7)
|
|
76
|
+
racc (1.6.0)
|
|
54
77
|
rake (12.3.3)
|
|
55
|
-
ruby-progressbar (1.
|
|
78
|
+
ruby-progressbar (1.11.0)
|
|
79
|
+
ruby2_keywords (0.0.5)
|
|
56
80
|
unf (0.1.4)
|
|
57
81
|
unf_ext
|
|
58
|
-
unf_ext (0.0.
|
|
82
|
+
unf_ext (0.0.8.2)
|
|
59
83
|
|
|
60
84
|
PLATFORMS
|
|
61
|
-
|
|
85
|
+
x86_64-darwin-21
|
|
62
86
|
|
|
63
87
|
DEPENDENCIES
|
|
64
88
|
byebug
|
|
@@ -68,4 +92,4 @@ DEPENDENCIES
|
|
|
68
92
|
rake (~> 12.0)
|
|
69
93
|
|
|
70
94
|
BUNDLED WITH
|
|
71
|
-
2.
|
|
95
|
+
2.3.12
|
data/curation.gemspec
CHANGED
|
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
|
|
|
21
21
|
spec.bindir = "exe"
|
|
22
22
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
23
23
|
spec.require_paths = ["lib"]
|
|
24
|
-
spec.add_dependency "metainspector"
|
|
24
|
+
spec.add_dependency "metainspector", '~> 5.12'
|
|
25
25
|
spec.add_dependency "nokogiri"
|
|
26
|
+
spec.add_dependency "htmlentities"
|
|
26
27
|
end
|
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
require "curation/version"
|
|
2
2
|
require "metainspector"
|
|
3
3
|
require "open-uri"
|
|
4
|
+
require "htmlentities"
|
|
4
5
|
|
|
5
6
|
module Curation
|
|
6
7
|
class Error < StandardError; end
|
|
@@ -20,7 +21,7 @@ module Curation
|
|
|
20
21
|
]
|
|
21
22
|
|
|
22
23
|
def initialize(url, html = nil)
|
|
23
|
-
@url = url
|
|
24
|
+
@url = url.to_s.gsub('http://', 'https://')
|
|
24
25
|
@html = html
|
|
25
26
|
end
|
|
26
27
|
|
|
@@ -37,6 +38,7 @@ module Curation
|
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
def text
|
|
41
|
+
# require 'byebug'; byebug
|
|
40
42
|
@text ||= find_text
|
|
41
43
|
end
|
|
42
44
|
|
|
@@ -49,6 +51,8 @@ module Curation
|
|
|
49
51
|
def find_title
|
|
50
52
|
if json_ld.any?
|
|
51
53
|
json_ld.each do |ld|
|
|
54
|
+
# require 'byebug'; byebug
|
|
55
|
+
ld = ld.first if ld.is_a?(Array)
|
|
52
56
|
return ld['headline'] if ld.has_key? 'headline'
|
|
53
57
|
end
|
|
54
58
|
end
|
|
@@ -70,6 +74,7 @@ module Curation
|
|
|
70
74
|
def find_image
|
|
71
75
|
if json_ld.any?
|
|
72
76
|
json_ld.each do |ld|
|
|
77
|
+
ld = ld.first if ld.is_a?(Array)
|
|
73
78
|
if ld.has_key? 'image'
|
|
74
79
|
image_data = ld['image']
|
|
75
80
|
return image_data if image_data.is_a? String
|
|
@@ -96,6 +101,14 @@ module Curation
|
|
|
96
101
|
end
|
|
97
102
|
|
|
98
103
|
def find_text
|
|
104
|
+
text = find_text_with_json_ld || find_text_with_nokogiri
|
|
105
|
+
text.to_s.gsub!('<br><br>', '<br>')
|
|
106
|
+
# require 'byebug'; byebug
|
|
107
|
+
text = clean_encoding text
|
|
108
|
+
text
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
def find_text_with_json_ld
|
|
99
112
|
if json_ld.any?
|
|
100
113
|
json_ld.each do |ld|
|
|
101
114
|
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
|
@@ -103,6 +116,10 @@ module Curation
|
|
|
103
116
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
|
104
117
|
end
|
|
105
118
|
end
|
|
119
|
+
nil
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
def find_text_with_nokogiri
|
|
106
123
|
h = nokogiri.dup
|
|
107
124
|
BLACKLIST.each do |tag|
|
|
108
125
|
h.css(tag).remove
|
|
@@ -110,7 +127,6 @@ module Curation
|
|
|
110
127
|
nodes = h.css('p')
|
|
111
128
|
nodes.xpath('//style').remove
|
|
112
129
|
text = nodes.to_html
|
|
113
|
-
text.gsub!('<br><br>', '<br>')
|
|
114
130
|
text
|
|
115
131
|
end
|
|
116
132
|
|
|
@@ -125,6 +141,7 @@ module Curation
|
|
|
125
141
|
return Date.parse metatags['pubdate'] rescue nil
|
|
126
142
|
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
|
127
143
|
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
|
144
|
+
return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
|
|
128
145
|
chunks = html.split('DisplayDate')
|
|
129
146
|
if chunks.count > 1
|
|
130
147
|
value = chunks[1]
|
|
@@ -156,12 +173,11 @@ module Curation
|
|
|
156
173
|
begin
|
|
157
174
|
options = nokogiri.css('[type="application/ld+json"]')
|
|
158
175
|
options.each do |option|
|
|
159
|
-
|
|
160
|
-
hash = JSON.parse(string)
|
|
161
|
-
@json_ld << hash
|
|
176
|
+
@json_ld << json_ld_from_object(option)
|
|
162
177
|
end
|
|
163
178
|
# Some sites have tables in tables
|
|
164
179
|
@json_ld.flatten!
|
|
180
|
+
# require 'byebug'; byebug
|
|
165
181
|
rescue
|
|
166
182
|
puts 'Curation::Page json_ld error'
|
|
167
183
|
end
|
|
@@ -169,6 +185,12 @@ module Curation
|
|
|
169
185
|
@json_ld
|
|
170
186
|
end
|
|
171
187
|
|
|
188
|
+
def json_ld_from_object(object)
|
|
189
|
+
JSON.parse object.inner_text
|
|
190
|
+
rescue
|
|
191
|
+
{}
|
|
192
|
+
end
|
|
193
|
+
|
|
172
194
|
def file
|
|
173
195
|
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
|
174
196
|
rescue
|
|
@@ -188,9 +210,13 @@ module Curation
|
|
|
188
210
|
|
|
189
211
|
def nokogiri
|
|
190
212
|
unless @nokogiri
|
|
191
|
-
file.
|
|
192
|
-
|
|
193
|
-
|
|
213
|
+
if file.nil?
|
|
214
|
+
@nokogiri = metainspector.parsed
|
|
215
|
+
else
|
|
216
|
+
file.rewind
|
|
217
|
+
@nokogiri = Nokogiri::HTML file
|
|
218
|
+
file.rewind
|
|
219
|
+
end
|
|
194
220
|
end
|
|
195
221
|
@nokogiri
|
|
196
222
|
rescue
|
|
@@ -212,5 +238,26 @@ module Curation
|
|
|
212
238
|
rescue
|
|
213
239
|
puts 'Curation::Page metatags error'
|
|
214
240
|
end
|
|
241
|
+
|
|
242
|
+
# rÃ©forme -> réforme
|
|
243
|
+
def clean_encoding(text)
|
|
244
|
+
clean_text = HTMLEntities.new.decode text
|
|
245
|
+
double_encoding = false
|
|
246
|
+
[
|
|
247
|
+
'Ã©', # é
|
|
248
|
+
'Ã¨', # è
|
|
249
|
+
'Ã®', # î
|
|
250
|
+
'Ãª', # ê
|
|
251
|
+
].each do |string|
|
|
252
|
+
# require 'byebug'; byebug
|
|
253
|
+
double_encoding = true if clean_text.include? string
|
|
254
|
+
end
|
|
255
|
+
if double_encoding
|
|
256
|
+
clean_text.encode('iso-8859-1', undef: :replace)
|
|
257
|
+
.force_encoding('utf-8')
|
|
258
|
+
else
|
|
259
|
+
text
|
|
260
|
+
end
|
|
261
|
+
end
|
|
215
262
|
end
|
|
216
263
|
end
|
metadata
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: curation
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: '1.7'
|
|
4
|
+
version: '1.10'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Arnaud Levy
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2022-06-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: metainspector
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '5.12'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '5.12'
|
|
27
|
+
- !ruby/object:Gem::Dependency
|
|
28
|
+
name: nokogiri
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
16
30
|
requirements:
|
|
17
31
|
- - ">="
|
|
@@ -25,7 +39,7 @@ dependencies:
|
|
|
25
39
|
- !ruby/object:Gem::Version
|
|
26
40
|
version: '0'
|
|
27
41
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
42
|
+
name: htmlentities
|
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
|
30
44
|
requirements:
|
|
31
45
|
- - ">="
|
|
@@ -65,7 +79,7 @@ licenses:
|
|
|
65
79
|
metadata:
|
|
66
80
|
homepage_uri: https://github.com/arnaudlevy/curation
|
|
67
81
|
source_code_uri: https://github.com/arnaudlevy/curation
|
|
68
|
-
post_install_message:
|
|
82
|
+
post_install_message:
|
|
69
83
|
rdoc_options: []
|
|
70
84
|
require_paths:
|
|
71
85
|
- lib
|
|
@@ -80,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
80
94
|
- !ruby/object:Gem::Version
|
|
81
95
|
version: '0'
|
|
82
96
|
requirements: []
|
|
83
|
-
rubygems_version: 3.
|
|
84
|
-
signing_key:
|
|
97
|
+
rubygems_version: 3.1.6
|
|
98
|
+
signing_key:
|
|
85
99
|
specification_version: 4
|
|
86
100
|
summary: Curation of content
|
|
87
101
|
test_files: []
|