curation 1.8 → 1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +11 -9
- data/curation.gemspec +1 -0
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +47 -5
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e68898021d1c54927e120e46d2b1282535aec94a4daa0c552d9edeb4d6dc1d88
|
4
|
+
data.tar.gz: d6ba0ccabe71d10efb60fb0a5806f94f3fdd42f7e49fe39f00d64c427455d450
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88310bda3ba8221af689f5848c24af2dfd48668caaf875b55f05626e09dbf072747b877e779b93a91c4e602b39cd23e61dc602662bb2b34e8b0e095c6cd9c488
|
7
|
+
data.tar.gz: c750a811383b541beefd3214accbdacda85ab7552ded3863734101cf1ffa42ec963b455c83f817e6176a2e31dc8d57a07ce14fac0f2567a54fd353c42f26d7fb
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.8)
|
4
|
+
curation (1.9)
|
5
|
+
htmlentities
|
5
6
|
metainspector
|
6
7
|
nokogiri
|
7
8
|
|
@@ -15,7 +16,7 @@ GEM
|
|
15
16
|
byebug (11.1.3)
|
16
17
|
domain_name (0.5.20190701)
|
17
18
|
unf (>= 0.0.5, < 1.0.0)
|
18
|
-
faraday (1.
|
19
|
+
faraday (1.10.0)
|
19
20
|
faraday-em_http (~> 1.0)
|
20
21
|
faraday-em_synchrony (~> 1.0)
|
21
22
|
faraday-excon (~> 1.1)
|
@@ -48,11 +49,12 @@ GEM
|
|
48
49
|
faraday_middleware (1.2.0)
|
49
50
|
faraday (~> 1.0)
|
50
51
|
fastimage (2.2.6)
|
52
|
+
htmlentities (4.3.4)
|
51
53
|
http-cookie (1.0.4)
|
52
54
|
domain_name (~> 0.5)
|
53
|
-
metainspector (5.
|
55
|
+
metainspector (5.12.1)
|
54
56
|
addressable (~> 2.7)
|
55
|
-
faraday (
|
57
|
+
faraday (>= 1.4, < 3.0)
|
56
58
|
faraday-cookie_jar (~> 0.0)
|
57
59
|
faraday-encoding (~> 0.0)
|
58
60
|
faraday-http-cache (~> 2.2)
|
@@ -60,7 +62,7 @@ GEM
|
|
60
62
|
fastimage (~> 2.2)
|
61
63
|
nesty (~> 1.0)
|
62
64
|
nokogiri (~> 1.11)
|
63
|
-
mini_portile2 (2.
|
65
|
+
mini_portile2 (2.8.0)
|
64
66
|
minitest (5.15.0)
|
65
67
|
minitest-reporters (1.5.0)
|
66
68
|
ansi
|
@@ -69,17 +71,17 @@ GEM
|
|
69
71
|
ruby-progressbar
|
70
72
|
multipart-post (2.1.1)
|
71
73
|
nesty (1.0.2)
|
72
|
-
nokogiri (1.13.
|
73
|
-
mini_portile2 (~> 2.
|
74
|
+
nokogiri (1.13.6)
|
75
|
+
mini_portile2 (~> 2.8.0)
|
74
76
|
racc (~> 1.4)
|
75
|
-
public_suffix (4.0.
|
77
|
+
public_suffix (4.0.7)
|
76
78
|
racc (1.6.0)
|
77
79
|
rake (12.3.3)
|
78
80
|
ruby-progressbar (1.11.0)
|
79
81
|
ruby2_keywords (0.0.5)
|
80
82
|
unf (0.1.4)
|
81
83
|
unf_ext
|
82
|
-
unf_ext (0.0.8)
|
84
|
+
unf_ext (0.0.8.1)
|
83
85
|
|
84
86
|
PLATFORMS
|
85
87
|
ruby
|
data/curation.gemspec
CHANGED
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require "curation/version"
|
2
2
|
require "metainspector"
|
3
3
|
require "open-uri"
|
4
|
+
require "htmlentities"
|
4
5
|
|
5
6
|
module Curation
|
6
7
|
class Error < StandardError; end
|
@@ -37,6 +38,7 @@ module Curation
|
|
37
38
|
end
|
38
39
|
|
39
40
|
# Memoized accessor for the extracted page text.
#
# The value produced by find_text is cached in @text. The cache check is a
# truthiness check, so a nil/false result is not cached and find_text runs
# again on the next call.
def text
  return @text if @text

  @text = find_text
end
|
42
44
|
|
@@ -49,6 +51,8 @@ module Curation
|
|
49
51
|
def find_title
|
50
52
|
if json_ld.any?
|
51
53
|
json_ld.each do |ld|
|
54
|
+
# require 'byebug'; byebug
|
55
|
+
ld = ld.first if ld.is_a?(Array)
|
52
56
|
return ld['headline'] if ld.has_key? 'headline'
|
53
57
|
end
|
54
58
|
end
|
@@ -70,6 +74,7 @@ module Curation
|
|
70
74
|
def find_image
|
71
75
|
if json_ld.any?
|
72
76
|
json_ld.each do |ld|
|
77
|
+
ld = ld.first if ld.is_a?(Array)
|
73
78
|
if ld.has_key? 'image'
|
74
79
|
image_data = ld['image']
|
75
80
|
return image_data if image_data.is_a? String
|
@@ -96,6 +101,14 @@ module Curation
|
|
96
101
|
end
|
97
102
|
|
98
103
|
# Extracts the article text, preferring the JSON-LD payload and falling back
# to the paragraphs found in the parsed HTML.
#
# Doubled line breaks ('<br><br>') are collapsed to a single '<br>' and the
# result is passed through clean_encoding.
#
# The cleanup works on a copy: the string handed back by the extractor (for
# JSON-LD this is the value stored in the cached JSON-LD hash) is never
# mutated, and frozen strings are handled without raising.
#
# Returns the cleaned text, or nil when neither strategy produced a value.
def find_text
  raw = find_text_with_json_ld || find_text_with_nokogiri
  return if raw.nil?

  clean_encoding raw.to_s.gsub('<br><br>', '<br>')
end
|
110
|
+
|
111
|
+
def find_text_with_json_ld
|
99
112
|
if json_ld.any?
|
100
113
|
json_ld.each do |ld|
|
101
114
|
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
@@ -103,6 +116,10 @@ module Curation
|
|
103
116
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
104
117
|
end
|
105
118
|
end
|
119
|
+
nil
|
120
|
+
end
|
121
|
+
|
122
|
+
def find_text_with_nokogiri
|
106
123
|
h = nokogiri.dup
|
107
124
|
BLACKLIST.each do |tag|
|
108
125
|
h.css(tag).remove
|
@@ -110,7 +127,6 @@ module Curation
|
|
110
127
|
nodes = h.css('p')
|
111
128
|
nodes.xpath('//style').remove
|
112
129
|
text = nodes.to_html
|
113
|
-
text.gsub!('<br><br>', '<br>')
|
114
130
|
text
|
115
131
|
end
|
116
132
|
|
@@ -125,6 +141,7 @@ module Curation
|
|
125
141
|
return Date.parse metatags['pubdate'] rescue nil
|
126
142
|
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
143
|
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
144
|
+
return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
|
128
145
|
chunks = html.split('DisplayDate')
|
129
146
|
if chunks.count > 1
|
130
147
|
value = chunks[1]
|
@@ -156,13 +173,11 @@ module Curation
|
|
156
173
|
begin
|
157
174
|
options = nokogiri.css('[type="application/ld+json"]')
|
158
175
|
options.each do |option|
|
159
|
-
|
160
|
-
string = option.inner_text
|
161
|
-
hash = JSON.parse(string)
|
162
|
-
@json_ld << hash
|
176
|
+
@json_ld << json_ld_from_object(option)
|
163
177
|
end
|
164
178
|
# Some sites have tables in tables
|
165
179
|
@json_ld.flatten!
|
180
|
+
# require 'byebug'; byebug
|
166
181
|
rescue
|
167
182
|
puts 'Curation::Page json_ld error'
|
168
183
|
end
|
@@ -170,6 +185,12 @@ module Curation
|
|
170
185
|
@json_ld
|
171
186
|
end
|
172
187
|
|
188
|
+
# Parses the JSON-LD payload carried by a node.
#
# object - anything responding to #inner_text with the raw JSON string.
#
# Returns the parsed Hash or Array. Returns an empty Hash when parsing fails
# for any reason, and also when the payload is valid JSON but not a
# container (null, a number, a bare string): callers treat every entry as a
# Hash (has_key?, []), so scalars must not leak into the collected list.
def json_ld_from_object(object)
  parsed = JSON.parse(object.inner_text)
  case parsed
  when Hash, Array then parsed
  else {}
  end
rescue StandardError
  # Best effort: one malformed block must not abort the whole extraction.
  {}
end
|
193
|
+
|
173
194
|
def file
|
174
195
|
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
175
196
|
rescue
|
@@ -217,5 +238,26 @@ module Curation
|
|
217
238
|
rescue
|
218
239
|
puts 'Curation::Page metatags error'
|
219
240
|
end
|
241
|
+
|
242
|
+
# rÃ©forme -> réforme
|
243
|
+
# Repairs double-encoded ("mojibake") text, e.g. "rÃ©forme" -> "réforme".
#
# HTML entities are decoded first so that entity-escaped mojibake is also
# detected. The decoded text is then searched for the sequences produced
# when UTF-8 bytes of common French accented letters are misread as
# ISO-8859-1. The markers are written as \u escapes so they cannot be
# silently "corrected" into the real accented letters, which would make
# every well-encoded text look broken.
#
# text - the String to inspect (nil is tolerated as long as the entity
#        decoder accepts it).
#
# Returns the repaired String when mojibake was detected and the repair
# yields valid UTF-8; otherwise returns text unchanged (entities are left
# undecoded in that case).
def clean_encoding(text)
  decoded = HTMLEntities.new.decode text
  markers = [
    "\u00C3\u00A9", # é misread as ISO-8859-1
    "\u00C3\u00A8", # è
    "\u00C3\u00AE", # î
    "\u00C3\u00AA"  # ê
  ]
  return text unless markers.any? { |marker| decoded.include? marker }

  # Map each character back to the single byte it came from, then
  # reinterpret the byte sequence as UTF-8.
  repaired = decoded.encode('iso-8859-1', undef: :replace)
                    .force_encoding('utf-8')

  # Text mixing garbled and correctly encoded accents does not round-trip;
  # keep the input rather than returning an invalid string.
  repaired.valid_encoding? ? repaired : text
end
|
220
262
|
end
|
221
263
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.8'
|
4
|
+
version: '1.9'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: htmlentities
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
description: When you build content curation tools, you need to extract the content
|
42
56
|
of pages (title, text, image...). This requires different strategies and some fine
|
43
57
|
tuning to work efficiently.
|