curation 1.7 → 1.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
- data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
3
+ metadata.gz: a23e967e5d017ce61f9719647f45c20e1336aff067bc2323f64641dfac695f75
4
+ data.tar.gz: 49427d9325a27034c1969d71875dcd4eacff4f5c3ac9625ccc6cdb554c4c2df4
5
5
  SHA512:
6
- metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
- data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
6
+ metadata.gz: b01f4209b09d6ec09917096159b98be71b31fd0952524a12c7310ba29f88ee8c888bc2927015ef2b1fb529cfe46964c08c743c818807518dee6bfa3cc32f6767
7
+ data.tar.gz: 564d14e3afaa17f00ac7b034917c7b28612b54b6f71126a36de4d6d43def7b8c19814475ee5b13651bbea7e7eed752b873f918af10baf169bd153d66b8d05c7d
data/Gemfile.lock CHANGED
@@ -1,64 +1,88 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.7)
5
- metainspector
4
+ curation (1.10)
5
+ htmlentities
6
+ metainspector (~> 5.12)
6
7
  nokogiri
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- addressable (2.7.0)
12
+ addressable (2.8.0)
12
13
  public_suffix (>= 2.0.2, < 5.0)
13
14
  ansi (1.5.0)
14
15
  builder (3.2.4)
15
16
  byebug (11.1.3)
16
17
  domain_name (0.5.20190701)
17
18
  unf (>= 0.0.5, < 1.0.0)
18
- faraday (1.0.1)
19
- multipart-post (>= 1.2, < 3)
20
- faraday-cookie_jar (0.0.6)
21
- faraday (>= 0.7.4)
19
+ faraday (1.10.0)
20
+ faraday-em_http (~> 1.0)
21
+ faraday-em_synchrony (~> 1.0)
22
+ faraday-excon (~> 1.1)
23
+ faraday-httpclient (~> 1.0)
24
+ faraday-multipart (~> 1.0)
25
+ faraday-net_http (~> 1.0)
26
+ faraday-net_http_persistent (~> 1.0)
27
+ faraday-patron (~> 1.0)
28
+ faraday-rack (~> 1.0)
29
+ faraday-retry (~> 1.0)
30
+ ruby2_keywords (>= 0.0.4)
31
+ faraday-cookie_jar (0.0.7)
32
+ faraday (>= 0.8.0)
22
33
  http-cookie (~> 1.0.0)
34
+ faraday-em_http (1.0.0)
35
+ faraday-em_synchrony (1.0.0)
23
36
  faraday-encoding (0.0.5)
24
37
  faraday
25
- faraday-http-cache (2.2.0)
38
+ faraday-excon (1.1.0)
39
+ faraday-http-cache (2.4.0)
26
40
  faraday (>= 0.8)
27
- faraday_middleware (1.0.0)
41
+ faraday-httpclient (1.0.1)
42
+ faraday-multipart (1.0.4)
43
+ multipart-post (~> 2)
44
+ faraday-net_http (1.0.1)
45
+ faraday-net_http_persistent (1.2.0)
46
+ faraday-patron (1.0.0)
47
+ faraday-rack (1.0.0)
48
+ faraday-retry (1.0.3)
49
+ faraday_middleware (1.2.0)
28
50
  faraday (~> 1.0)
29
- fastimage (2.1.7)
30
- http-cookie (1.0.3)
51
+ fastimage (2.2.6)
52
+ htmlentities (4.3.4)
53
+ http-cookie (1.0.5)
31
54
  domain_name (~> 0.5)
32
- metainspector (5.10.1)
33
- addressable (~> 2.7.0)
34
- faraday (~> 1.0.0)
35
- faraday-cookie_jar (~> 0.0.6)
36
- faraday-encoding (~> 0.0.5)
37
- faraday-http-cache (~> 2.2.0)
38
- faraday_middleware (~> 1.0.0)
39
- fastimage (~> 2.1.7)
40
- nesty (~> 1.0.2)
41
- nokogiri (~> 1.10.9)
42
- mini_portile2 (2.4.0)
43
- minitest (5.14.1)
44
- minitest-reporters (1.4.2)
55
+ metainspector (5.12.1)
56
+ addressable (~> 2.7)
57
+ faraday (>= 1.4, < 3.0)
58
+ faraday-cookie_jar (~> 0.0)
59
+ faraday-encoding (~> 0.0)
60
+ faraday-http-cache (~> 2.2)
61
+ faraday_middleware (~> 1.0)
62
+ fastimage (~> 2.2)
63
+ nesty (~> 1.0)
64
+ nokogiri (~> 1.11)
65
+ minitest (5.15.0)
66
+ minitest-reporters (1.5.0)
45
67
  ansi
46
68
  builder
47
69
  minitest (>= 5.0)
48
70
  ruby-progressbar
49
- multipart-post (2.1.1)
71
+ multipart-post (2.2.0)
50
72
  nesty (1.0.2)
51
- nokogiri (1.10.10)
52
- mini_portile2 (~> 2.4.0)
53
- public_suffix (4.0.5)
73
+ nokogiri (1.13.6-x86_64-darwin)
74
+ racc (~> 1.4)
75
+ public_suffix (4.0.7)
76
+ racc (1.6.0)
54
77
  rake (12.3.3)
55
- ruby-progressbar (1.10.1)
78
+ ruby-progressbar (1.11.0)
79
+ ruby2_keywords (0.0.5)
56
80
  unf (0.1.4)
57
81
  unf_ext
58
- unf_ext (0.0.7.7)
82
+ unf_ext (0.0.8.2)
59
83
 
60
84
  PLATFORMS
61
- ruby
85
+ x86_64-darwin-21
62
86
 
63
87
  DEPENDENCIES
64
88
  byebug
@@ -68,4 +92,4 @@ DEPENDENCIES
68
92
  rake (~> 12.0)
69
93
 
70
94
  BUNDLED WITH
71
- 2.1.4
95
+ 2.3.12
data/curation.gemspec CHANGED
@@ -21,6 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.bindir = "exe"
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
- spec.add_dependency "metainspector"
24
+ spec.add_dependency "metainspector", '~> 5.12'
25
25
  spec.add_dependency "nokogiri"
26
+ spec.add_dependency "htmlentities"
26
27
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.7"
2
+ VERSION = "1.10"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "curation/version"
2
2
  require "metainspector"
3
3
  require "open-uri"
4
+ require "htmlentities"
4
5
 
5
6
  module Curation
6
7
  class Error < StandardError; end
@@ -20,7 +21,7 @@ module Curation
20
21
  ]
21
22
 
22
23
  def initialize(url, html = nil)
23
- @url = url
24
+ @url = url.to_s.gsub('http://', 'https://')
24
25
  @html = html
25
26
  end
26
27
 
@@ -37,6 +38,7 @@ module Curation
37
38
  end
38
39
 
39
40
  def text
41
+ # require 'byebug'; byebug
40
42
  @text ||= find_text
41
43
  end
42
44
 
@@ -49,6 +51,8 @@ module Curation
49
51
  def find_title
50
52
  if json_ld.any?
51
53
  json_ld.each do |ld|
54
+ # require 'byebug'; byebug
55
+ ld = ld.first if ld.is_a?(Array)
52
56
  return ld['headline'] if ld.has_key? 'headline'
53
57
  end
54
58
  end
@@ -70,6 +74,7 @@ module Curation
70
74
  def find_image
71
75
  if json_ld.any?
72
76
  json_ld.each do |ld|
77
+ ld = ld.first if ld.is_a?(Array)
73
78
  if ld.has_key? 'image'
74
79
  image_data = ld['image']
75
80
  return image_data if image_data.is_a? String
@@ -96,6 +101,14 @@ module Curation
96
101
  end
97
102
 
98
103
  def find_text
104
+ text = find_text_with_json_ld || find_text_with_nokogiri
105
+ text.to_s.gsub!('<br><br>', '<br>')
106
+ # require 'byebug'; byebug
107
+ text = clean_encoding text
108
+ text
109
+ end
110
+
111
+ def find_text_with_json_ld
99
112
  if json_ld.any?
100
113
  json_ld.each do |ld|
101
114
  next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
@@ -103,6 +116,10 @@ module Curation
103
116
  return ld['articleBody'] if ld.has_key? 'articleBody'
104
117
  end
105
118
  end
119
+ nil
120
+ end
121
+
122
+ def find_text_with_nokogiri
106
123
  h = nokogiri.dup
107
124
  BLACKLIST.each do |tag|
108
125
  h.css(tag).remove
@@ -110,7 +127,6 @@ module Curation
110
127
  nodes = h.css('p')
111
128
  nodes.xpath('//style').remove
112
129
  text = nodes.to_html
113
- text.gsub!('<br><br>', '<br>')
114
130
  text
115
131
  end
116
132
 
@@ -125,6 +141,7 @@ module Curation
125
141
  return Date.parse metatags['pubdate'] rescue nil
126
142
  return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
143
  return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
+ return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
128
145
  chunks = html.split('DisplayDate')
129
146
  if chunks.count > 1
130
147
  value = chunks[1]
@@ -156,12 +173,11 @@ module Curation
156
173
  begin
157
174
  options = nokogiri.css('[type="application/ld+json"]')
158
175
  options.each do |option|
159
- string = option.inner_text
160
- hash = JSON.parse(string)
161
- @json_ld << hash
176
+ @json_ld << json_ld_from_object(option)
162
177
  end
163
178
  # Some sites have tables in tables
164
179
  @json_ld.flatten!
180
+ # require 'byebug'; byebug
165
181
  rescue
166
182
  puts 'Curation::Page json_ld error'
167
183
  end
@@ -169,6 +185,12 @@ module Curation
169
185
  @json_ld
170
186
  end
171
187
 
188
+ def json_ld_from_object(object)
189
+ JSON.parse object.inner_text
190
+ rescue
191
+ {}
192
+ end
193
+
172
194
  def file
173
195
  @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
196
  rescue
@@ -188,9 +210,13 @@ module Curation
188
210
 
189
211
  def nokogiri
190
212
  unless @nokogiri
191
- file.rewind
192
- @nokogiri = Nokogiri::HTML file
193
- file.rewind
213
+ if file.nil?
214
+ @nokogiri = metainspector.parsed
215
+ else
216
+ file.rewind
217
+ @nokogiri = Nokogiri::HTML file
218
+ file.rewind
219
+ end
194
220
  end
195
221
  @nokogiri
196
222
  rescue
@@ -212,5 +238,26 @@ module Curation
212
238
  rescue
213
239
  puts 'Curation::Page metatags error'
214
240
  end
241
+
242
+ # r&Atilde;&copy;forme -> réforme
243
+ def clean_encoding(text)
244
+ clean_text = HTMLEntities.new.decode text
245
+ double_encoding = false
246
+ [
247
+ 'é', # é
248
+ 'è', # è
249
+ 'î', # î
250
+ 'ê', # ê
251
+ ].each do |string|
252
+ # require 'byebug'; byebug
253
+ double_encoding = true if clean_text.include? string
254
+ end
255
+ if double_encoding
256
+ clean_text.encode('iso-8859-1', undef: :replace)
257
+ .force_encoding('utf-8')
258
+ else
259
+ text
260
+ end
261
+ end
215
262
  end
216
263
  end
metadata CHANGED
@@ -1,17 +1,31 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.7'
4
+ version: '1.10'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-25 00:00:00.000000000 Z
11
+ date: 2022-06-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.12'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.12'
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - ">="
@@ -25,7 +39,7 @@ dependencies:
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: nokogiri
42
+ name: htmlentities
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - ">="
@@ -65,7 +79,7 @@ licenses:
65
79
  metadata:
66
80
  homepage_uri: https://github.com/arnaudlevy/curation
67
81
  source_code_uri: https://github.com/arnaudlevy/curation
68
- post_install_message:
82
+ post_install_message:
69
83
  rdoc_options: []
70
84
  require_paths:
71
85
  - lib
@@ -80,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
94
  - !ruby/object:Gem::Version
81
95
  version: '0'
82
96
  requirements: []
83
- rubygems_version: 3.0.3
84
- signing_key:
97
+ rubygems_version: 3.1.6
98
+ signing_key:
85
99
  specification_version: 4
86
100
  summary: Curation of content
87
101
  test_files: []