curation 1.6 → 1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
4
- data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
3
+ metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
+ data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
5
5
  SHA512:
6
- metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
7
- data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
6
+ metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
+ data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.6)
4
+ curation (1.7)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -10,6 +10,9 @@ GEM
10
10
  specs:
11
11
  addressable (2.7.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
18
  faraday (1.0.1)
@@ -37,12 +40,19 @@ GEM
37
40
  nesty (~> 1.0.2)
38
41
  nokogiri (~> 1.10.9)
39
42
  mini_portile2 (2.4.0)
43
+ minitest (5.14.1)
44
+ minitest-reporters (1.4.2)
45
+ ansi
46
+ builder
47
+ minitest (>= 5.0)
48
+ ruby-progressbar
40
49
  multipart-post (2.1.1)
41
50
  nesty (1.0.2)
42
51
  nokogiri (1.10.10)
43
52
  mini_portile2 (~> 2.4.0)
44
53
  public_suffix (4.0.5)
45
54
  rake (12.3.3)
55
+ ruby-progressbar (1.10.1)
46
56
  unf (0.1.4)
47
57
  unf_ext
48
58
  unf_ext (0.0.7.7)
@@ -51,7 +61,10 @@ PLATFORMS
51
61
  ruby
52
62
 
53
63
  DEPENDENCIES
64
+ byebug
54
65
  curation!
66
+ minitest
67
+ minitest-reporters
55
68
  rake (~> 12.0)
56
69
 
57
70
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -25,40 +25,29 @@ module Curation
25
25
  end
26
26
 
27
27
  def title
28
- @title = find_title
28
+ @title ||= find_title
29
29
  end
30
30
 
31
31
  def image
32
- @image = find_image
33
- @image = @image.to_s.gsub('http://', 'https://')
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
34
36
  @image
35
37
  end
36
38
 
37
39
  def text
38
- if json_ld.any?
39
- json_ld.each do |ld|
40
- next unless ld['@type'] == 'NewsArticle'
41
- return ld['text'] if ld.has_key? 'text'
42
- return ld['articleBody'] if ld.has_key? 'articleBody'
43
- end
44
- end
45
- h = nokogiri.dup
46
- BLACKLIST.each do |tag|
47
- h.css(tag).remove
48
- end
49
- nodes = h.css('p')
50
- nodes.xpath('//style').remove
51
- text = nodes.to_html
52
- text.gsub!('<br><br>', '<br>')
53
- text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
54
45
  end
55
46
 
56
47
  protected
57
48
 
58
49
  def find_title
59
50
  if json_ld.any?
60
- # Some sites have tables in tables
61
- json_ld.flatten!
62
51
  json_ld.each do |ld|
63
52
  return ld['headline'] if ld.has_key? 'headline'
64
53
  end
@@ -70,7 +59,7 @@ module Curation
70
59
  nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
71
60
  nokogiri.css('title')&.first&.inner_text
72
61
  ].each do |possibility|
73
- return possibility unless possibility.blank?
62
+ return possibility unless possibility.to_s.empty?
74
63
  end
75
64
  rescue
76
65
  puts 'Curation::Page find_title error'
@@ -98,7 +87,7 @@ module Curation
98
87
  metainspector.images.best,
99
88
  nokogiri.css('[property="og:image"]').first&.attributes['content'].value
100
89
  ].each do |possibility|
101
- return possibility unless possibility.blank?
90
+ return possibility unless possibility.to_s.empty?
102
91
  end
103
92
  rescue
104
93
  puts 'Curation::Page find_image error'
@@ -106,8 +95,63 @@ module Curation
106
95
  return ''
107
96
  end
108
97
 
98
+ def find_text
99
+ if json_ld.any?
100
+ json_ld.each do |ld|
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
102
+ return ld['text'] if ld.has_key? 'text'
103
+ return ld['articleBody'] if ld.has_key? 'articleBody'
104
+ end
105
+ end
106
+ h = nokogiri.dup
107
+ BLACKLIST.each do |tag|
108
+ h.css(tag).remove
109
+ end
110
+ nodes = h.css('p')
111
+ nodes.xpath('//style').remove
112
+ text = nodes.to_html
113
+ text.gsub!('<br><br>', '<br>')
114
+ text
115
+ end
116
+
117
+ def find_date
118
+ if json_ld.any?
119
+ json_ld.each do |ld|
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
122
+ end
123
+ end
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
149
+ end
150
+
151
+ private
152
+
109
153
  def json_ld
110
- unless @json_ld
154
+ unless defined?(@json_ld)
111
155
  @json_ld = []
112
156
  begin
113
157
  options = nokogiri.css('[type="application/ld+json"]')
@@ -116,6 +160,8 @@ module Curation
116
160
  hash = JSON.parse(string)
117
161
  @json_ld << hash
118
162
  end
163
+ # Some sites have tables in tables
164
+ @json_ld.flatten!
119
165
  rescue
120
166
  puts 'Curation::Page json_ld error'
121
167
  end
@@ -123,22 +169,48 @@ module Curation
123
169
  @json_ld
124
170
  end
125
171
 
172
+ def file
173
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
+ rescue
175
+ puts "Curation::Page file error with url #{url}"
176
+ end
177
+
126
178
  def html
127
- @html ||= URI.open url
179
+ unless @html
180
+ file.rewind
181
+ @html = file.read
182
+ file.rewind
183
+ end
184
+ @html
128
185
  rescue
129
- puts "Impossible to open #{url}"
186
+ puts "Curation::Page html error"
130
187
  end
131
188
 
132
189
  def nokogiri
133
- @nokogiri ||= Nokogiri::HTML html
190
+ unless @nokogiri
191
+ file.rewind
192
+ @nokogiri = Nokogiri::HTML file
193
+ file.rewind
194
+ end
195
+ @nokogiri
134
196
  rescue
135
197
  puts 'Curation::Page nokogiri error'
136
198
  end
137
199
 
138
200
  def metainspector
139
- @metainspector ||= MetaInspector.new url, document: html
201
+ unless @metainspector
202
+ @metainspector = html.nil? ? MetaInspector.new(url)
203
+ : MetaInspector.new(url, document: html)
204
+ end
205
+ @metainspector
140
206
  rescue
141
207
  puts 'Curation::Page metainspector error'
142
208
  end
209
+
210
+ def metatags
211
+ @metatags ||= metainspector.meta_tag['name']
212
+ rescue
213
+ puts 'Curation::Page metatags error'
214
+ end
143
215
  end
144
216
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.6"
2
+ VERSION = "1.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.6'
4
+ version: '1.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2020-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector