curation 1.6 → 1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
4
- data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
3
+ metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
+ data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
5
5
  SHA512:
6
- metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
7
- data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
6
+ metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
+ data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.6)
4
+ curation (1.7)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -10,6 +10,9 @@ GEM
10
10
  specs:
11
11
  addressable (2.7.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
18
  faraday (1.0.1)
@@ -37,12 +40,19 @@ GEM
37
40
  nesty (~> 1.0.2)
38
41
  nokogiri (~> 1.10.9)
39
42
  mini_portile2 (2.4.0)
43
+ minitest (5.14.1)
44
+ minitest-reporters (1.4.2)
45
+ ansi
46
+ builder
47
+ minitest (>= 5.0)
48
+ ruby-progressbar
40
49
  multipart-post (2.1.1)
41
50
  nesty (1.0.2)
42
51
  nokogiri (1.10.10)
43
52
  mini_portile2 (~> 2.4.0)
44
53
  public_suffix (4.0.5)
45
54
  rake (12.3.3)
55
+ ruby-progressbar (1.10.1)
46
56
  unf (0.1.4)
47
57
  unf_ext
48
58
  unf_ext (0.0.7.7)
@@ -51,7 +61,10 @@ PLATFORMS
51
61
  ruby
52
62
 
53
63
  DEPENDENCIES
64
+ byebug
54
65
  curation!
66
+ minitest
67
+ minitest-reporters
55
68
  rake (~> 12.0)
56
69
 
57
70
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
data/lib/curation.rb CHANGED
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -25,40 +25,29 @@ module Curation
25
25
  end
26
26
 
27
27
  def title
28
- @title = find_title
28
+ @title ||= find_title
29
29
  end
30
30
 
31
31
  def image
32
- @image = find_image
33
- @image = @image.to_s.gsub('http://', 'https://')
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
34
36
  @image
35
37
  end
36
38
 
37
39
  def text
38
- if json_ld.any?
39
- json_ld.each do |ld|
40
- next unless ld['@type'] == 'NewsArticle'
41
- return ld['text'] if ld.has_key? 'text'
42
- return ld['articleBody'] if ld.has_key? 'articleBody'
43
- end
44
- end
45
- h = nokogiri.dup
46
- BLACKLIST.each do |tag|
47
- h.css(tag).remove
48
- end
49
- nodes = h.css('p')
50
- nodes.xpath('//style').remove
51
- text = nodes.to_html
52
- text.gsub!('<br><br>', '<br>')
53
- text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
54
45
  end
55
46
 
56
47
  protected
57
48
 
58
49
  def find_title
59
50
  if json_ld.any?
60
- # Some sites have tables in tables
61
- json_ld.flatten!
62
51
  json_ld.each do |ld|
63
52
  return ld['headline'] if ld.has_key? 'headline'
64
53
  end
@@ -70,7 +59,7 @@ module Curation
70
59
  nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
71
60
  nokogiri.css('title')&.first&.inner_text
72
61
  ].each do |possibility|
73
- return possibility unless possibility.blank?
62
+ return possibility unless possibility.to_s.empty?
74
63
  end
75
64
  rescue
76
65
  puts 'Curation::Page find_title error'
@@ -98,7 +87,7 @@ module Curation
98
87
  metainspector.images.best,
99
88
  nokogiri.css('[property="og:image"]').first&.attributes['content'].value
100
89
  ].each do |possibility|
101
- return possibility unless possibility.blank?
90
+ return possibility unless possibility.to_s.empty?
102
91
  end
103
92
  rescue
104
93
  puts 'Curation::Page find_image error'
@@ -106,8 +95,63 @@ module Curation
106
95
  return ''
107
96
  end
108
97
 
98
+ def find_text
99
+ if json_ld.any?
100
+ json_ld.each do |ld|
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
102
+ return ld['text'] if ld.has_key? 'text'
103
+ return ld['articleBody'] if ld.has_key? 'articleBody'
104
+ end
105
+ end
106
+ h = nokogiri.dup
107
+ BLACKLIST.each do |tag|
108
+ h.css(tag).remove
109
+ end
110
+ nodes = h.css('p')
111
+ nodes.xpath('//style').remove
112
+ text = nodes.to_html
113
+ text.gsub!('<br><br>', '<br>')
114
+ text
115
+ end
116
+
117
+ def find_date
118
+ if json_ld.any?
119
+ json_ld.each do |ld|
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
122
+ end
123
+ end
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
149
+ end
150
+
151
+ private
152
+
109
153
  def json_ld
110
- unless @json_ld
154
+ unless defined?(@json_ld)
111
155
  @json_ld = []
112
156
  begin
113
157
  options = nokogiri.css('[type="application/ld+json"]')
@@ -116,6 +160,8 @@ module Curation
116
160
  hash = JSON.parse(string)
117
161
  @json_ld << hash
118
162
  end
163
+ # Some sites have tables in tables
164
+ @json_ld.flatten!
119
165
  rescue
120
166
  puts 'Curation::Page json_ld error'
121
167
  end
@@ -123,22 +169,48 @@ module Curation
123
169
  @json_ld
124
170
  end
125
171
 
172
+ def file
173
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
+ rescue
175
+ puts "Curation::Page file error with url #{url}"
176
+ end
177
+
126
178
  def html
127
- @html ||= URI.open url
179
+ unless @html
180
+ file.rewind
181
+ @html = file.read
182
+ file.rewind
183
+ end
184
+ @html
128
185
  rescue
129
- puts "Impossible to open #{url}"
186
+ puts "Curation::Page html error"
130
187
  end
131
188
 
132
189
  def nokogiri
133
- @nokogiri ||= Nokogiri::HTML html
190
+ unless @nokogiri
191
+ file.rewind
192
+ @nokogiri = Nokogiri::HTML file
193
+ file.rewind
194
+ end
195
+ @nokogiri
134
196
  rescue
135
197
  puts 'Curation::Page nokogiri error'
136
198
  end
137
199
 
138
200
  def metainspector
139
- @metainspector ||= MetaInspector.new url, document: html
201
+ unless @metainspector
202
+ @metainspector = html.nil? ? MetaInspector.new(url)
203
+ : MetaInspector.new(url, document: html)
204
+ end
205
+ @metainspector
140
206
  rescue
141
207
  puts 'Curation::Page metainspector error'
142
208
  end
209
+
210
+ def metatags
211
+ @metatags ||= metainspector.meta_tag['name']
212
+ rescue
213
+ puts 'Curation::Page metatags error'
214
+ end
143
215
  end
144
216
  end
data/lib/curation/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.6"
2
+ VERSION = "1.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.6'
4
+ version: '1.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2020-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector