curation 1.8 → 1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b8086e74a861e147bce3a73dcec3cebef7d5d98ede440c911f57757feaaac354
4
- data.tar.gz: 3d6ff271bcfbf8599a76653d60b2c26ed3832c1c174885d114912f95a7707ee7
3
+ metadata.gz: e68898021d1c54927e120e46d2b1282535aec94a4daa0c552d9edeb4d6dc1d88
4
+ data.tar.gz: d6ba0ccabe71d10efb60fb0a5806f94f3fdd42f7e49fe39f00d64c427455d450
5
5
  SHA512:
6
- metadata.gz: '084c4c3fbdf3491530cd4ca6ca0fdc736df69a22a49cce40b0d7c8d169f6a45e7c5f01f280957016e32bedca93d593bb2d9e60141e148ee4f110935877543168'
7
- data.tar.gz: 3fe95dde59a8cb2268a1f93ff1e8b7ccbca9212a7b0cdc0093c16925549545df422945db25ab29f250fcc05242b5d7b01eb822f3c97b4e2b96d19653d70dff39
6
+ metadata.gz: 88310bda3ba8221af689f5848c24af2dfd48668caaf875b55f05626e09dbf072747b877e779b93a91c4e602b39cd23e61dc602662bb2b34e8b0e095c6cd9c488
7
+ data.tar.gz: c750a811383b541beefd3214accbdacda85ab7552ded3863734101cf1ffa42ec963b455c83f817e6176a2e31dc8d57a07ce14fac0f2567a54fd353c42f26d7fb
data/Gemfile.lock CHANGED
@@ -1,7 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.8)
4
+ curation (1.9)
5
+ htmlentities
5
6
  metainspector
6
7
  nokogiri
7
8
 
@@ -15,7 +16,7 @@ GEM
15
16
  byebug (11.1.3)
16
17
  domain_name (0.5.20190701)
17
18
  unf (>= 0.0.5, < 1.0.0)
18
- faraday (1.9.3)
19
+ faraday (1.10.0)
19
20
  faraday-em_http (~> 1.0)
20
21
  faraday-em_synchrony (~> 1.0)
21
22
  faraday-excon (~> 1.1)
@@ -48,11 +49,12 @@ GEM
48
49
  faraday_middleware (1.2.0)
49
50
  faraday (~> 1.0)
50
51
  fastimage (2.2.6)
52
+ htmlentities (4.3.4)
51
53
  http-cookie (1.0.4)
52
54
  domain_name (~> 0.5)
53
- metainspector (5.11.2)
55
+ metainspector (5.12.1)
54
56
  addressable (~> 2.7)
55
- faraday (~> 1.4)
57
+ faraday (>= 1.4, < 3.0)
56
58
  faraday-cookie_jar (~> 0.0)
57
59
  faraday-encoding (~> 0.0)
58
60
  faraday-http-cache (~> 2.2)
@@ -60,7 +62,7 @@ GEM
60
62
  fastimage (~> 2.2)
61
63
  nesty (~> 1.0)
62
64
  nokogiri (~> 1.11)
63
- mini_portile2 (2.7.1)
65
+ mini_portile2 (2.8.0)
64
66
  minitest (5.15.0)
65
67
  minitest-reporters (1.5.0)
66
68
  ansi
@@ -69,17 +71,17 @@ GEM
69
71
  ruby-progressbar
70
72
  multipart-post (2.1.1)
71
73
  nesty (1.0.2)
72
- nokogiri (1.13.1)
73
- mini_portile2 (~> 2.7.0)
74
+ nokogiri (1.13.6)
75
+ mini_portile2 (~> 2.8.0)
74
76
  racc (~> 1.4)
75
- public_suffix (4.0.6)
77
+ public_suffix (4.0.7)
76
78
  racc (1.6.0)
77
79
  rake (12.3.3)
78
80
  ruby-progressbar (1.11.0)
79
81
  ruby2_keywords (0.0.5)
80
82
  unf (0.1.4)
81
83
  unf_ext
82
- unf_ext (0.0.8)
84
+ unf_ext (0.0.8.1)
83
85
 
84
86
  PLATFORMS
85
87
  ruby
data/curation.gemspec CHANGED
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
  spec.require_paths = ["lib"]
24
24
  spec.add_dependency "metainspector"
25
25
  spec.add_dependency "nokogiri"
26
+ spec.add_dependency "htmlentities"
26
27
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.8"
2
+ VERSION = "1.9"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "curation/version"
2
2
  require "metainspector"
3
3
  require "open-uri"
4
+ require "htmlentities"
4
5
 
5
6
  module Curation
6
7
  class Error < StandardError; end
@@ -37,6 +38,7 @@ module Curation
37
38
  end
38
39
 
39
40
  def text
41
+ # require 'byebug'; byebug
40
42
  @text ||= find_text
41
43
  end
42
44
 
@@ -49,6 +51,8 @@ module Curation
49
51
  def find_title
50
52
  if json_ld.any?
51
53
  json_ld.each do |ld|
54
+ # require 'byebug'; byebug
55
+ ld = ld.first if ld.is_a?(Array)
52
56
  return ld['headline'] if ld.has_key? 'headline'
53
57
  end
54
58
  end
@@ -70,6 +74,7 @@ module Curation
70
74
  def find_image
71
75
  if json_ld.any?
72
76
  json_ld.each do |ld|
77
+ ld = ld.first if ld.is_a?(Array)
73
78
  if ld.has_key? 'image'
74
79
  image_data = ld['image']
75
80
  return image_data if image_data.is_a? String
@@ -96,6 +101,14 @@ module Curation
96
101
  end
97
102
 
98
103
  def find_text
104
+ text = find_text_with_json_ld || find_text_with_nokogiri
105
+ text.to_s.gsub!('<br><br>', '<br>')
106
+ # require 'byebug'; byebug
107
+ text = clean_encoding text
108
+ text
109
+ end
110
+
111
+ def find_text_with_json_ld
99
112
  if json_ld.any?
100
113
  json_ld.each do |ld|
101
114
  next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
@@ -103,6 +116,10 @@ module Curation
103
116
  return ld['articleBody'] if ld.has_key? 'articleBody'
104
117
  end
105
118
  end
119
+ nil
120
+ end
121
+
122
+ def find_text_with_nokogiri
106
123
  h = nokogiri.dup
107
124
  BLACKLIST.each do |tag|
108
125
  h.css(tag).remove
@@ -110,7 +127,6 @@ module Curation
110
127
  nodes = h.css('p')
111
128
  nodes.xpath('//style').remove
112
129
  text = nodes.to_html
113
- text.gsub!('<br><br>', '<br>')
114
130
  text
115
131
  end
116
132
 
@@ -125,6 +141,7 @@ module Curation
125
141
  return Date.parse metatags['pubdate'] rescue nil
126
142
  return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
143
  return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
+ return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
128
145
  chunks = html.split('DisplayDate')
129
146
  if chunks.count > 1
130
147
  value = chunks[1]
@@ -156,13 +173,11 @@ module Curation
156
173
  begin
157
174
  options = nokogiri.css('[type="application/ld+json"]')
158
175
  options.each do |option|
159
- # require 'byebug'; byebug
160
- string = option.inner_text
161
- hash = JSON.parse(string)
162
- @json_ld << hash
176
+ @json_ld << json_ld_from_object(option)
163
177
  end
164
178
  # Some sites have tables in tables
165
179
  @json_ld.flatten!
180
+ # require 'byebug'; byebug
166
181
  rescue
167
182
  puts 'Curation::Page json_ld error'
168
183
  end
@@ -170,6 +185,12 @@ module Curation
170
185
  @json_ld
171
186
  end
172
187
 
188
+ def json_ld_from_object(object)
189
+ JSON.parse object.inner_text
190
+ rescue
191
+ {}
192
+ end
193
+
173
194
  def file
174
195
  @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
175
196
  rescue
@@ -217,5 +238,26 @@ module Curation
217
238
  rescue
218
239
  puts 'Curation::Page metatags error'
219
240
  end
241
+
242
+ # r&Atilde;&copy;forme -> réforme
243
+ def clean_encoding(text)
244
+ clean_text = HTMLEntities.new.decode text
245
+ double_encoding = false
246
+ [
247
+ 'é', # é
248
+ 'è', # è
249
+ 'î', # î
250
+ 'ê', # ê
251
+ ].each do |string|
252
+ # require 'byebug'; byebug
253
+ double_encoding = true if clean_text.include? string
254
+ end
255
+ if double_encoding
256
+ clean_text.encode('iso-8859-1', undef: :replace)
257
+ .force_encoding('utf-8')
258
+ else
259
+ text
260
+ end
261
+ end
220
262
  end
221
263
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.8'
4
+ version: '1.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-02-14 00:00:00.000000000 Z
11
+ date: 2022-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: htmlentities
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: When you build content curation tools, you need to extract the content
42
56
  of pages (title, text, image...). This requires different strategies and some fine
43
57
  tuning to work efficiently.