curation 1.2 → 1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21a638c75deffb70db8694ecb99d94b1b711fba922ecbd0826d3a69597031df6
4
- data.tar.gz: 621519940fe6bc44302201f64211eba932d52ed366ebe0c6f11c02688b5ae0ba
3
+ metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
+ data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
5
5
  SHA512:
6
- metadata.gz: bf5f63d05793d71ac052b94dcaf5f4e34943355799e042361ae704ca9423b501ab3e157ae8b6509da642e6fa680768205030a24ddc1eb4e24c31863284289575
7
- data.tar.gz: d09140b59ba119e85e9c955a40586bd1ce6410fe392a59b5e4a92e2a1ae80b4608fd458c0b84b24be8ba31267bcee9550432264e6ca4068be47b90db46228ed1
6
+ metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
+ data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.7)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -10,6 +10,9 @@ GEM
10
10
  specs:
11
11
  addressable (2.7.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
18
  faraday (1.0.1)
@@ -37,12 +40,19 @@ GEM
37
40
  nesty (~> 1.0.2)
38
41
  nokogiri (~> 1.10.9)
39
42
  mini_portile2 (2.4.0)
43
+ minitest (5.14.1)
44
+ minitest-reporters (1.4.2)
45
+ ansi
46
+ builder
47
+ minitest (>= 5.0)
48
+ ruby-progressbar
40
49
  multipart-post (2.1.1)
41
50
  nesty (1.0.2)
42
51
  nokogiri (1.10.10)
43
52
  mini_portile2 (~> 2.4.0)
44
53
  public_suffix (4.0.5)
45
54
  rake (12.3.3)
55
+ ruby-progressbar (1.10.1)
46
56
  unf (0.1.4)
47
57
  unf_ext
48
58
  unf_ext (0.0.7.7)
@@ -51,7 +61,10 @@ PLATFORMS
51
61
  ruby
52
62
 
53
63
  DEPENDENCIES
64
+ byebug
54
65
  curation!
66
+ minitest
67
+ minitest-reporters
55
68
  rake (~> 12.0)
56
69
 
57
70
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -25,25 +25,80 @@ module Curation
25
25
  end
26
26
 
27
27
  def title
28
+ @title ||= find_title
29
+ end
30
+
31
+ def image
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
36
+ @image
37
+ end
38
+
39
+ def text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
45
+ end
46
+
47
+ protected
48
+
49
+ def find_title
28
50
  if json_ld.any?
29
51
  json_ld.each do |ld|
30
52
  return ld['headline'] if ld.has_key? 'headline'
31
53
  end
32
54
  end
33
- metainspector.best_title unless metainspector.best_title.blank?
34
- metainspector.title
55
+ begin
56
+ [
57
+ metainspector.best_title,
58
+ metainspector.title,
59
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
60
+ nokogiri.css('title')&.first&.inner_text
61
+ ].each do |possibility|
62
+ return possibility unless possibility.to_s.empty?
63
+ end
64
+ rescue
65
+ puts 'Curation::Page find_title error'
66
+ end
67
+ return ''
35
68
  end
36
69
 
37
- def image
38
- @image = find_image
39
- @image = @image.to_s.gsub('http://', 'https://')
40
- @image
70
+ def find_image
71
+ if json_ld.any?
72
+ json_ld.each do |ld|
73
+ if ld.has_key? 'image'
74
+ image_data = ld['image']
75
+ return image_data if image_data.is_a? String
76
+ if image_data.is_a? Array
77
+ first = image_data.first
78
+ return first if first.is_a? String
79
+ return first['url'] if first.is_a? Hash
80
+ end
81
+ return image_data['url'] if image_data.is_a? Hash
82
+ end
83
+ end
84
+ end
85
+ begin
86
+ [
87
+ metainspector.images.best,
88
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
89
+ ].each do |possibility|
90
+ return possibility unless possibility.to_s.empty?
91
+ end
92
+ rescue
93
+ puts 'Curation::Page find_image error'
94
+ end
95
+ return ''
41
96
  end
42
97
 
43
- def text
98
+ def find_text
44
99
  if json_ld.any?
45
100
  json_ld.each do |ld|
46
- next unless ld['@type'] == 'NewsArticle'
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
47
102
  return ld['text'] if ld.has_key? 'text'
48
103
  return ld['articleBody'] if ld.has_key? 'articleBody'
49
104
  end
@@ -59,24 +114,44 @@ module Curation
59
114
  text
60
115
  end
61
116
 
62
- protected
63
-
64
- def find_image
117
+ def find_date
65
118
  if json_ld.any?
66
119
  json_ld.each do |ld|
67
- if ld.has_key? 'image'
68
- image_data = ld['image']
69
- return image_data if image_data.is_a? String
70
- return image_data.first if image_data.is_a? Array
71
- return image_data['url'] if image_data.is_a? Hash
72
- end
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
73
122
  end
74
123
  end
75
- metainspector.images.best
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
76
149
  end
77
150
 
151
+ private
152
+
78
153
  def json_ld
79
- unless @json_ld
154
+ unless defined?(@json_ld)
80
155
  @json_ld = []
81
156
  begin
82
157
  options = nokogiri.css('[type="application/ld+json"]')
@@ -85,29 +160,57 @@ module Curation
85
160
  hash = JSON.parse(string)
86
161
  @json_ld << hash
87
162
  end
163
+ # Some sites have tables in tables
164
+ @json_ld.flatten!
88
165
  rescue
89
- puts "JSON LD error"
166
+ puts 'Curation::Page json_ld error'
90
167
  end
91
168
  end
92
169
  @json_ld
93
170
  end
94
171
 
172
+ def file
173
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
+ rescue
175
+ puts "Curation::Page file error with url #{url}"
176
+ end
177
+
95
178
  def html
96
- @html ||= URI.open url
179
+ unless @html
180
+ file.rewind
181
+ @html = file.read
182
+ file.rewind
183
+ end
184
+ @html
97
185
  rescue
98
- puts "Impossible to open #{url}"
186
+ puts "Curation::Page html error"
99
187
  end
100
188
 
101
189
  def nokogiri
102
- @nokogiri ||= Nokogiri::HTML html
190
+ unless @nokogiri
191
+ file.rewind
192
+ @nokogiri = Nokogiri::HTML file
193
+ file.rewind
194
+ end
195
+ @nokogiri
103
196
  rescue
104
- puts "Nokogiri error"
197
+ puts 'Curation::Page nokogiri error'
105
198
  end
106
199
 
107
200
  def metainspector
108
- @metainspector ||= MetaInspector.new url, document: html
201
+ unless @metainspector
202
+ @metainspector = html.nil? ? MetaInspector.new(url)
203
+ : MetaInspector.new(url, document: html)
204
+ end
205
+ @metainspector
206
+ rescue
207
+ puts 'Curation::Page metainspector error'
208
+ end
209
+
210
+ def metatags
211
+ @metatags ||= metainspector.meta_tag['name']
109
212
  rescue
110
- puts "MetaInspector error"
213
+ puts 'Curation::Page metatags error'
111
214
  end
112
215
  end
113
216
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.2"
2
+ VERSION = "1.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.2'
4
+ version: '1.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector