curation 1.6 → 1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
4
- data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
3
+ metadata.gz: e68898021d1c54927e120e46d2b1282535aec94a4daa0c552d9edeb4d6dc1d88
4
+ data.tar.gz: d6ba0ccabe71d10efb60fb0a5806f94f3fdd42f7e49fe39f00d64c427455d450
5
5
  SHA512:
6
- metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
7
- data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
6
+ metadata.gz: 88310bda3ba8221af689f5848c24af2dfd48668caaf875b55f05626e09dbf072747b877e779b93a91c4e602b39cd23e61dc602662bb2b34e8b0e095c6cd9c488
7
+ data.tar.gz: c750a811383b541beefd3214accbdacda85ab7552ded3863734101cf1ffa42ec963b455c83f817e6176a2e31dc8d57a07ce14fac0f2567a54fd353c42f26d7fb
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
data/Gemfile.lock CHANGED
@@ -1,57 +1,96 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.6)
4
+ curation (1.9)
5
+ htmlentities
5
6
  metainspector
6
7
  nokogiri
7
8
 
8
9
  GEM
9
10
  remote: https://rubygems.org/
10
11
  specs:
11
- addressable (2.7.0)
12
+ addressable (2.8.0)
12
13
  public_suffix (>= 2.0.2, < 5.0)
14
+ ansi (1.5.0)
15
+ builder (3.2.4)
16
+ byebug (11.1.3)
13
17
  domain_name (0.5.20190701)
14
18
  unf (>= 0.0.5, < 1.0.0)
15
- faraday (1.0.1)
16
- multipart-post (>= 1.2, < 3)
17
- faraday-cookie_jar (0.0.6)
18
- faraday (>= 0.7.4)
19
+ faraday (1.10.0)
20
+ faraday-em_http (~> 1.0)
21
+ faraday-em_synchrony (~> 1.0)
22
+ faraday-excon (~> 1.1)
23
+ faraday-httpclient (~> 1.0)
24
+ faraday-multipart (~> 1.0)
25
+ faraday-net_http (~> 1.0)
26
+ faraday-net_http_persistent (~> 1.0)
27
+ faraday-patron (~> 1.0)
28
+ faraday-rack (~> 1.0)
29
+ faraday-retry (~> 1.0)
30
+ ruby2_keywords (>= 0.0.4)
31
+ faraday-cookie_jar (0.0.7)
32
+ faraday (>= 0.8.0)
19
33
  http-cookie (~> 1.0.0)
34
+ faraday-em_http (1.0.0)
35
+ faraday-em_synchrony (1.0.0)
20
36
  faraday-encoding (0.0.5)
21
37
  faraday
38
+ faraday-excon (1.1.0)
22
39
  faraday-http-cache (2.2.0)
23
40
  faraday (>= 0.8)
24
- faraday_middleware (1.0.0)
41
+ faraday-httpclient (1.0.1)
42
+ faraday-multipart (1.0.3)
43
+ multipart-post (>= 1.2, < 3)
44
+ faraday-net_http (1.0.1)
45
+ faraday-net_http_persistent (1.2.0)
46
+ faraday-patron (1.0.0)
47
+ faraday-rack (1.0.0)
48
+ faraday-retry (1.0.3)
49
+ faraday_middleware (1.2.0)
25
50
  faraday (~> 1.0)
26
- fastimage (2.1.7)
27
- http-cookie (1.0.3)
51
+ fastimage (2.2.6)
52
+ htmlentities (4.3.4)
53
+ http-cookie (1.0.4)
28
54
  domain_name (~> 0.5)
29
- metainspector (5.10.1)
30
- addressable (~> 2.7.0)
31
- faraday (~> 1.0.0)
32
- faraday-cookie_jar (~> 0.0.6)
33
- faraday-encoding (~> 0.0.5)
34
- faraday-http-cache (~> 2.2.0)
35
- faraday_middleware (~> 1.0.0)
36
- fastimage (~> 2.1.7)
37
- nesty (~> 1.0.2)
38
- nokogiri (~> 1.10.9)
39
- mini_portile2 (2.4.0)
55
+ metainspector (5.12.1)
56
+ addressable (~> 2.7)
57
+ faraday (>= 1.4, < 3.0)
58
+ faraday-cookie_jar (~> 0.0)
59
+ faraday-encoding (~> 0.0)
60
+ faraday-http-cache (~> 2.2)
61
+ faraday_middleware (~> 1.0)
62
+ fastimage (~> 2.2)
63
+ nesty (~> 1.0)
64
+ nokogiri (~> 1.11)
65
+ mini_portile2 (2.8.0)
66
+ minitest (5.15.0)
67
+ minitest-reporters (1.5.0)
68
+ ansi
69
+ builder
70
+ minitest (>= 5.0)
71
+ ruby-progressbar
40
72
  multipart-post (2.1.1)
41
73
  nesty (1.0.2)
42
- nokogiri (1.10.10)
43
- mini_portile2 (~> 2.4.0)
44
- public_suffix (4.0.5)
74
+ nokogiri (1.13.6)
75
+ mini_portile2 (~> 2.8.0)
76
+ racc (~> 1.4)
77
+ public_suffix (4.0.7)
78
+ racc (1.6.0)
45
79
  rake (12.3.3)
80
+ ruby-progressbar (1.11.0)
81
+ ruby2_keywords (0.0.5)
46
82
  unf (0.1.4)
47
83
  unf_ext
48
- unf_ext (0.0.7.7)
84
+ unf_ext (0.0.8.1)
49
85
 
50
86
  PLATFORMS
51
87
  ruby
52
88
 
53
89
  DEPENDENCIES
90
+ byebug
54
91
  curation!
92
+ minitest
93
+ minitest-reporters
55
94
  rake (~> 12.0)
56
95
 
57
96
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
data/curation.gemspec CHANGED
@@ -23,4 +23,5 @@ Gem::Specification.new do |spec|
23
23
  spec.require_paths = ["lib"]
24
24
  spec.add_dependency "metainspector"
25
25
  spec.add_dependency "nokogiri"
26
+ spec.add_dependency "htmlentities"
26
27
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.6"
2
+ VERSION = "1.9"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,12 +1,13 @@
1
1
  require "curation/version"
2
2
  require "metainspector"
3
3
  require "open-uri"
4
+ require "htmlentities"
4
5
 
5
6
  module Curation
6
7
  class Error < StandardError; end
7
8
 
8
9
  class Page
9
- attr_reader :url, :title, :text, :image
10
+ attr_reader :url
10
11
 
11
12
  BLACKLIST = [
12
13
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -20,46 +21,38 @@ module Curation
20
21
  ]
21
22
 
22
23
  def initialize(url, html = nil)
23
- @url = url
24
+ @url = url.to_s.gsub('http://', 'https://')
24
25
  @html = html
25
26
  end
26
27
 
27
28
  def title
28
- @title = find_title
29
+ @title ||= find_title
29
30
  end
30
31
 
31
32
  def image
32
- @image = find_image
33
- @image = @image.to_s.gsub('http://', 'https://')
33
+ unless @image
34
+ @image = find_image
35
+ @image = @image.to_s.gsub('http://', 'https://')
36
+ end
34
37
  @image
35
38
  end
36
39
 
37
40
  def text
38
- if json_ld.any?
39
- json_ld.each do |ld|
40
- next unless ld['@type'] == 'NewsArticle'
41
- return ld['text'] if ld.has_key? 'text'
42
- return ld['articleBody'] if ld.has_key? 'articleBody'
43
- end
44
- end
45
- h = nokogiri.dup
46
- BLACKLIST.each do |tag|
47
- h.css(tag).remove
48
- end
49
- nodes = h.css('p')
50
- nodes.xpath('//style').remove
51
- text = nodes.to_html
52
- text.gsub!('<br><br>', '<br>')
53
- text
41
+ # require 'byebug'; byebug
42
+ @text ||= find_text
43
+ end
44
+
45
+ def date
46
+ @date ||= find_date
54
47
  end
55
48
 
56
49
  protected
57
50
 
58
51
  def find_title
59
52
  if json_ld.any?
60
- # Some sites have tables in tables
61
- json_ld.flatten!
62
53
  json_ld.each do |ld|
54
+ # require 'byebug'; byebug
55
+ ld = ld.first if ld.is_a?(Array)
63
56
  return ld['headline'] if ld.has_key? 'headline'
64
57
  end
65
58
  end
@@ -70,7 +63,7 @@ module Curation
70
63
  nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
71
64
  nokogiri.css('title')&.first&.inner_text
72
65
  ].each do |possibility|
73
- return possibility unless possibility.blank?
66
+ return possibility unless possibility.to_s.empty?
74
67
  end
75
68
  rescue
76
69
  puts 'Curation::Page find_title error'
@@ -81,6 +74,7 @@ module Curation
81
74
  def find_image
82
75
  if json_ld.any?
83
76
  json_ld.each do |ld|
77
+ ld = ld.first if ld.is_a?(Array)
84
78
  if ld.has_key? 'image'
85
79
  image_data = ld['image']
86
80
  return image_data if image_data.is_a? String
@@ -98,7 +92,7 @@ module Curation
98
92
  metainspector.images.best,
99
93
  nokogiri.css('[property="og:image"]').first&.attributes['content'].value
100
94
  ].each do |possibility|
101
- return possibility unless possibility.blank?
95
+ return possibility unless possibility.to_s.empty?
102
96
  end
103
97
  rescue
104
98
  puts 'Curation::Page find_image error'
@@ -106,16 +100,84 @@ module Curation
106
100
  return ''
107
101
  end
108
102
 
103
+ def find_text
104
+ text = find_text_with_json_ld || find_text_with_nokogiri
105
+ text.to_s.gsub!('<br><br>', '<br>')
106
+ # require 'byebug'; byebug
107
+ text = clean_encoding text
108
+ text
109
+ end
110
+
111
+ def find_text_with_json_ld
112
+ if json_ld.any?
113
+ json_ld.each do |ld|
114
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
115
+ return ld['text'] if ld.has_key? 'text'
116
+ return ld['articleBody'] if ld.has_key? 'articleBody'
117
+ end
118
+ end
119
+ nil
120
+ end
121
+
122
+ def find_text_with_nokogiri
123
+ h = nokogiri.dup
124
+ BLACKLIST.each do |tag|
125
+ h.css(tag).remove
126
+ end
127
+ nodes = h.css('p')
128
+ nodes.xpath('//style').remove
129
+ text = nodes.to_html
130
+ text
131
+ end
132
+
133
+ def find_date
134
+ if json_ld.any?
135
+ json_ld.each do |ld|
136
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
137
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
138
+ end
139
+ end
140
+ return Date.parse metatags['date'] rescue nil
141
+ return Date.parse metatags['pubdate'] rescue nil
142
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
143
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
+ return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
145
+ chunks = html.split('DisplayDate')
146
+ if chunks.count > 1
147
+ value = chunks[1]
148
+ value = value.split(',').first
149
+ value = value.gsub('"', '')
150
+ value = value[1..-1] if value[0] == ':'
151
+ return Date.parse value rescue nil
152
+ end
153
+ begin
154
+ value = nokogiri.css('.postDate').first
155
+ value = value.inner_text
156
+ value = value.gsub(' — ', '')
157
+ return Date.parse value
158
+ rescue
159
+ end
160
+ begin
161
+ value = nokogiri.css('.gta_post_date').first
162
+ value = value.inner_text
163
+ return Date.parse value
164
+ rescue
165
+ end
166
+ end
167
+
168
+ private
169
+
109
170
  def json_ld
110
- unless @json_ld
171
+ unless defined?(@json_ld)
111
172
  @json_ld = []
112
173
  begin
113
174
  options = nokogiri.css('[type="application/ld+json"]')
114
175
  options.each do |option|
115
- string = option.inner_text
116
- hash = JSON.parse(string)
117
- @json_ld << hash
176
+ @json_ld << json_ld_from_object(option)
118
177
  end
178
+ # Some sites have tables in tables
179
+ @json_ld.flatten!
180
+ # require 'byebug'; byebug
119
181
  rescue
120
182
  puts 'Curation::Page json_ld error'
121
183
  end
@@ -123,22 +185,79 @@ module Curation
123
185
  @json_ld
124
186
  end
125
187
 
188
+ def json_ld_from_object(object)
189
+ JSON.parse object.inner_text
190
+ rescue
191
+ {}
192
+ end
193
+
194
+ def file
195
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
196
+ rescue
197
+ puts "Curation::Page file error with url #{url}"
198
+ end
199
+
126
200
  def html
127
- @html ||= URI.open url
201
+ unless @html
202
+ file.rewind
203
+ @html = file.read
204
+ file.rewind
205
+ end
206
+ @html
128
207
  rescue
129
- puts "Impossible to open #{url}"
208
+ puts "Curation::Page html error"
130
209
  end
131
210
 
132
211
  def nokogiri
133
- @nokogiri ||= Nokogiri::HTML html
212
+ unless @nokogiri
213
+ if file.nil?
214
+ @nokogiri = metainspector.parsed
215
+ else
216
+ file.rewind
217
+ @nokogiri = Nokogiri::HTML file
218
+ file.rewind
219
+ end
220
+ end
221
+ @nokogiri
134
222
  rescue
135
223
  puts 'Curation::Page nokogiri error'
136
224
  end
137
225
 
138
226
  def metainspector
139
- @metainspector ||= MetaInspector.new url, document: html
227
+ unless @metainspector
228
+ @metainspector = html.nil? ? MetaInspector.new(url)
229
+ : MetaInspector.new(url, document: html)
230
+ end
231
+ @metainspector
140
232
  rescue
141
233
  puts 'Curation::Page metainspector error'
142
234
  end
235
+
236
+ def metatags
237
+ @metatags ||= metainspector.meta_tag['name']
238
+ rescue
239
+ puts 'Curation::Page metatags error'
240
+ end
241
+
242
+ # r&Atilde;&copy;forme -> réforme
243
+ def clean_encoding(text)
244
+ clean_text = HTMLEntities.new.decode text
245
+ double_encoding = false
246
+ [
247
+ 'é', # é
248
+ 'è', # è
249
+ 'î', # î
250
+ 'ê', # ê
251
+ ].each do |string|
252
+ # require 'byebug'; byebug
253
+ double_encoding = true if clean_text.include? string
254
+ end
255
+ if double_encoding
256
+ clean_text.encode('iso-8859-1', undef: :replace)
257
+ .force_encoding('utf-8')
258
+ else
259
+ text
260
+ end
261
+ end
143
262
  end
144
263
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.6'
4
+ version: '1.9'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2022-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: htmlentities
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  description: When you build content curation tools, you need to extract the content
42
56
  of pages (title, text, image...). This requires different strategies and some fine
43
57
  tuning to work efficiently.
@@ -65,7 +79,7 @@ licenses:
65
79
  metadata:
66
80
  homepage_uri: https://github.com/arnaudlevy/curation
67
81
  source_code_uri: https://github.com/arnaudlevy/curation
68
- post_install_message:
82
+ post_install_message:
69
83
  rdoc_options: []
70
84
  require_paths:
71
85
  - lib
@@ -80,8 +94,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
94
  - !ruby/object:Gem::Version
81
95
  version: '0'
82
96
  requirements: []
83
- rubygems_version: 3.0.3
84
- signing_key:
97
+ rubygems_version: 3.1.6
98
+ signing_key:
85
99
  specification_version: 4
86
100
  summary: Curation of content
87
101
  test_files: []