curation 1.7 → 1.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
- data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
3
+ metadata.gz: a23e967e5d017ce61f9719647f45c20e1336aff067bc2323f64641dfac695f75
4
+ data.tar.gz: 49427d9325a27034c1969d71875dcd4eacff4f5c3ac9625ccc6cdb554c4c2df4
5
5
  SHA512:
6
- metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
- data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
6
+ metadata.gz: b01f4209b09d6ec09917096159b98be71b31fd0952524a12c7310ba29f88ee8c888bc2927015ef2b1fb529cfe46964c08c743c818807518dee6bfa3cc32f6767
7
+ data.tar.gz: 564d14e3afaa17f00ac7b034917c7b28612b54b6f71126a36de4d6d43def7b8c19814475ee5b13651bbea7e7eed752b873f918af10baf169bd153d66b8d05c7d
data/Gemfile.lock CHANGED
@@ -1,64 +1,88 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.7)
5
- metainspector
4
+ curation (1.10)
5
+ htmlentities
6
+ metainspector (~> 5.12)
6
7
  nokogiri
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- addressable (2.7.0)
12
+ addressable (2.8.0)
12
13
  public_suffix (>= 2.0.2, < 5.0)
13
14
  ansi (1.5.0)
14
15
  builder (3.2.4)
15
16
  byebug (11.1.3)
16
17
  domain_name (0.5.20190701)
17
18
  unf (>= 0.0.5, < 1.0.0)
18
- faraday (1.0.1)
19
- multipart-post (>= 1.2, < 3)
20
- faraday-cookie_jar (0.0.6)
21
- faraday (>= 0.7.4)
19
+ faraday (1.10.0)
20
+ faraday-em_http (~> 1.0)
21
+ faraday-em_synchrony (~> 1.0)
22
+ faraday-excon (~> 1.1)
23
+ faraday-httpclient (~> 1.0)
24
+ faraday-multipart (~> 1.0)
25
+ faraday-net_http (~> 1.0)
26
+ faraday-net_http_persistent (~> 1.0)
27
+ faraday-patron (~> 1.0)
28
+ faraday-rack (~> 1.0)
29
+ faraday-retry (~> 1.0)
30
+ ruby2_keywords (>= 0.0.4)
31
+ faraday-cookie_jar (0.0.7)
32
+ faraday (>= 0.8.0)
22
33
  http-cookie (~> 1.0.0)
34
+ faraday-em_http (1.0.0)
35
+ faraday-em_synchrony (1.0.0)
23
36
  faraday-encoding (0.0.5)
24
37
  faraday
25
- faraday-http-cache (2.2.0)
38
+ faraday-excon (1.1.0)
39
+ faraday-http-cache (2.4.0)
26
40
  faraday (>= 0.8)
27
- faraday_middleware (1.0.0)
41
+ faraday-httpclient (1.0.1)
42
+ faraday-multipart (1.0.4)
43
+ multipart-post (~> 2)
44
+ faraday-net_http (1.0.1)
45
+ faraday-net_http_persistent (1.2.0)
46
+ faraday-patron (1.0.0)
47
+ faraday-rack (1.0.0)
48
+ faraday-retry (1.0.3)
49
+ faraday_middleware (1.2.0)
28
50
  faraday (~> 1.0)
29
- fastimage (2.1.7)
30
- http-cookie (1.0.3)
51
+ fastimage (2.2.6)
52
+ htmlentities (4.3.4)
53
+ http-cookie (1.0.5)
31
54
  domain_name (~> 0.5)
32
- metainspector (5.10.1)
33
- addressable (~> 2.7.0)
34
- faraday (~> 1.0.0)
35
- faraday-cookie_jar (~> 0.0.6)
36
- faraday-encoding (~> 0.0.5)
37
- faraday-http-cache (~> 2.2.0)
38
- faraday_middleware (~> 1.0.0)
39
- fastimage (~> 2.1.7)
40
- nesty (~> 1.0.2)
41
- nokogiri (~> 1.10.9)
42
- mini_portile2 (2.4.0)
43
- minitest (5.14.1)
44
- minitest-reporters (1.4.2)
55
+ metainspector (5.12.1)
56
+ addressable (~> 2.7)
57
+ faraday (>= 1.4, < 3.0)
58
+ faraday-cookie_jar (~> 0.0)
59
+ faraday-encoding (~> 0.0)
60
+ faraday-http-cache (~> 2.2)
61
+ faraday_middleware (~> 1.0)
62
+ fastimage (~> 2.2)
63
+ nesty (~> 1.0)
64
+ nokogiri (~> 1.11)
65
+ minitest (5.15.0)
66
+ minitest-reporters (1.5.0)
45
67
  ansi
46
68
  builder
47
69
  minitest (>= 5.0)
48
70
  ruby-progressbar
49
- multipart-post (2.1.1)
71
+ multipart-post (2.2.0)
50
72
  nesty (1.0.2)
51
- nokogiri (1.10.10)
52
- mini_portile2 (~> 2.4.0)
53
- public_suffix (4.0.5)
73
+ nokogiri (1.13.6-x86_64-darwin)
74
+ racc (~> 1.4)
75
+ public_suffix (4.0.7)
76
+ racc (1.6.0)
54
77
  rake (12.3.3)
55
- ruby-progressbar (1.10.1)
78
+ ruby-progressbar (1.11.0)
79
+ ruby2_keywords (0.0.5)
56
80
  unf (0.1.4)
57
81
  unf_ext
58
- unf_ext (0.0.7.7)
82
+ unf_ext (0.0.8.2)
59
83
 
60
84
  PLATFORMS
61
- ruby
85
+ x86_64-darwin-21
62
86
 
63
87
  DEPENDENCIES
64
88
  byebug
@@ -68,4 +92,4 @@ DEPENDENCIES
68
92
  rake (~> 12.0)
69
93
 
70
94
  BUNDLED WITH
71
- 2.1.4
95
+ 2.3.12
data/curation.gemspec CHANGED
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.bindir = "exe"
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
- spec.add_dependency "metainspector"
24
+ spec.add_dependency "metainspector", '~> 5.12'
25
25
  spec.add_dependency "nokogiri"
26
+ spec.add_dependency "htmlentities"
26
27
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.7"
2
+ VERSION = "1.10"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "curation/version"
2
2
  require "metainspector"
3
3
  require "open-uri"
4
+ require "htmlentities"
4
5
 
5
6
  module Curation
6
7
  class Error < StandardError; end
@@ -20,7 +21,7 @@ module Curation
20
21
  ]
21
22
 
22
23
  def initialize(url, html = nil)
23
- @url = url
24
+ @url = url.to_s.gsub('http://', 'https://')
24
25
  @html = html
25
26
  end
26
27
 
@@ -37,6 +38,7 @@ module Curation
37
38
  end
38
39
 
39
40
  def text
41
+ # require 'byebug'; byebug
40
42
  @text ||= find_text
41
43
  end
42
44
 
@@ -49,6 +51,8 @@ module Curation
49
51
  def find_title
50
52
  if json_ld.any?
51
53
  json_ld.each do |ld|
54
+ # require 'byebug'; byebug
55
+ ld = ld.first if ld.is_a?(Array)
52
56
  return ld['headline'] if ld.has_key? 'headline'
53
57
  end
54
58
  end
@@ -70,6 +74,7 @@ module Curation
70
74
  def find_image
71
75
  if json_ld.any?
72
76
  json_ld.each do |ld|
77
+ ld = ld.first if ld.is_a?(Array)
73
78
  if ld.has_key? 'image'
74
79
  image_data = ld['image']
75
80
  return image_data if image_data.is_a? String
@@ -96,6 +101,14 @@ module Curation
96
101
  end
97
102
 
98
103
  def find_text
104
+ text = find_text_with_json_ld || find_text_with_nokogiri
105
+ text.to_s.gsub!('<br><br>', '<br>')
106
+ # require 'byebug'; byebug
107
+ text = clean_encoding text
108
+ text
109
+ end
110
+
111
+ def find_text_with_json_ld
99
112
  if json_ld.any?
100
113
  json_ld.each do |ld|
101
114
  next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
@@ -103,6 +116,10 @@ module Curation
103
116
  return ld['articleBody'] if ld.has_key? 'articleBody'
104
117
  end
105
118
  end
119
+ nil
120
+ end
121
+
122
+ def find_text_with_nokogiri
106
123
  h = nokogiri.dup
107
124
  BLACKLIST.each do |tag|
108
125
  h.css(tag).remove
@@ -110,7 +127,6 @@ module Curation
110
127
  nodes = h.css('p')
111
128
  nodes.xpath('//style').remove
112
129
  text = nodes.to_html
113
- text.gsub!('<br><br>', '<br>')
114
130
  text
115
131
  end
116
132
 
@@ -125,6 +141,7 @@ module Curation
125
141
  return Date.parse metatags['pubdate'] rescue nil
126
142
  return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
143
  return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
+ return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
128
145
  chunks = html.split('DisplayDate')
129
146
  if chunks.count > 1
130
147
  value = chunks[1]
@@ -156,12 +173,11 @@ module Curation
156
173
  begin
157
174
  options = nokogiri.css('[type="application/ld+json"]')
158
175
  options.each do |option|
159
- string = option.inner_text
160
- hash = JSON.parse(string)
161
- @json_ld << hash
176
+ @json_ld << json_ld_from_object(option)
162
177
  end
163
178
  # Some sites have tables in tables
164
179
  @json_ld.flatten!
180
+ # require 'byebug'; byebug
165
181
  rescue
166
182
  puts 'Curation::Page json_ld error'
167
183
  end
@@ -169,6 +185,12 @@ module Curation
169
185
  @json_ld
170
186
  end
171
187
 
188
+ def json_ld_from_object(object)
189
+ JSON.parse object.inner_text
190
+ rescue
191
+ {}
192
+ end
193
+
172
194
  def file
173
195
  @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
196
  rescue
@@ -188,9 +210,13 @@ module Curation
188
210
 
189
211
  def nokogiri
190
212
  unless @nokogiri
191
- file.rewind
192
- @nokogiri = Nokogiri::HTML file
193
- file.rewind
213
+ if file.nil?
214
+ @nokogiri = metainspector.parsed
215
+ else
216
+ file.rewind
217
+ @nokogiri = Nokogiri::HTML file
218
+ file.rewind
219
+ end
194
220
  end
195
221
  @nokogiri
196
222
  rescue
@@ -212,5 +238,26 @@ module Curation
212
238
  rescue
213
239
  puts 'Curation::Page metatags error'
214
240
  end
241
+
242
+ # r&Atilde;&copy;forme -> réforme
243
+ def clean_encoding(text)
244
+ clean_text = HTMLEntities.new.decode text
245
+ double_encoding = false
246
+ [
247
+ 'Ã©', # é
248
+ 'Ã¨', # è
249
+ 'Ã®', # î
250
+ 'Ãª', # ê
251
+ ].each do |string|
252
+ # require 'byebug'; byebug
253
+ double_encoding = true if clean_text.include? string
254
+ end
255
+ if double_encoding
256
+ clean_text.encode('iso-8859-1', undef: :replace)
257
+ .force_encoding('utf-8')
258
+ else
259
+ text
260
+ end
261
+ end
215
262
  end
216
263
  end
metadata CHANGED
@@ -1,17 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.7'
4
+ version: '1.10'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-25 00:00:00.000000000 Z
11
+ date: 2022-06-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.12'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.12'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - ">="
@@ -25,7 +39,7 @@ dependencies:
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
42
+ name: htmlentities
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - ">="
@@ -65,7 +79,7 @@ licenses:
65
79
  metadata:
66
80
  homepage_uri: https://github.com/arnaudlevy/curation
67
81
  source_code_uri: https://github.com/arnaudlevy/curation
68
- post_install_message:
82
+ post_install_message:
69
83
  rdoc_options: []
70
84
  require_paths:
71
85
  - lib
@@ -80,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
94
  - !ruby/object:Gem::Version
81
95
  version: '0'
82
96
  requirements: []
83
- rubygems_version: 3.0.3
84
- signing_key:
97
+ rubygems_version: 3.1.6
98
+ signing_key:
85
99
  specification_version: 4
86
100
  summary: Curation of content
87
101
  test_files: []