curation 1.2 → 1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 21a638c75deffb70db8694ecb99d94b1b711fba922ecbd0826d3a69597031df6
4
- data.tar.gz: 621519940fe6bc44302201f64211eba932d52ed366ebe0c6f11c02688b5ae0ba
3
+ metadata.gz: e3e4d559a2583393e2d58cba1bfe30ef8870733ce013c00780b29706b9bfce68
4
+ data.tar.gz: 5802c61d564cfc2568f9173f6b270e3e288f4dbdfcfec624b02205bd01a81aaf
5
5
  SHA512:
6
- metadata.gz: bf5f63d05793d71ac052b94dcaf5f4e34943355799e042361ae704ca9423b501ab3e157ae8b6509da642e6fa680768205030a24ddc1eb4e24c31863284289575
7
- data.tar.gz: d09140b59ba119e85e9c955a40586bd1ce6410fe392a59b5e4a92e2a1ae80b4608fd458c0b84b24be8ba31267bcee9550432264e6ca4068be47b90db46228ed1
6
+ metadata.gz: 7cd290b2f9b5d9188c9db0bd07a9f88ce564faac927c3590af5c88cf11be0a848dd209fd7c0dd48e5af103bc8f05822911e7ffbe94c9a35dab0732e9e7d70a21
7
+ data.tar.gz: 0bcd9304b78f9ddb362ef7809831fe5065454adc64aa117df302d8a0ec384d92d7aac64e2dc1e8ebc3caf12267d78dbf7150f9fe12254a2b1d31509644e1e62e
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.7)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -10,6 +10,9 @@ GEM
10
10
  specs:
11
11
  addressable (2.7.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
18
  faraday (1.0.1)
@@ -37,12 +40,19 @@ GEM
37
40
  nesty (~> 1.0.2)
38
41
  nokogiri (~> 1.10.9)
39
42
  mini_portile2 (2.4.0)
43
+ minitest (5.14.1)
44
+ minitest-reporters (1.4.2)
45
+ ansi
46
+ builder
47
+ minitest (>= 5.0)
48
+ ruby-progressbar
40
49
  multipart-post (2.1.1)
41
50
  nesty (1.0.2)
42
51
  nokogiri (1.10.10)
43
52
  mini_portile2 (~> 2.4.0)
44
53
  public_suffix (4.0.5)
45
54
  rake (12.3.3)
55
+ ruby-progressbar (1.10.1)
46
56
  unf (0.1.4)
47
57
  unf_ext
48
58
  unf_ext (0.0.7.7)
@@ -51,7 +61,10 @@ PLATFORMS
51
61
  ruby
52
62
 
53
63
  DEPENDENCIES
64
+ byebug
54
65
  curation!
66
+ minitest
67
+ minitest-reporters
55
68
  rake (~> 12.0)
56
69
 
57
70
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -25,25 +25,80 @@ module Curation
25
25
  end
26
26
 
27
27
  def title
28
+ @title ||= find_title
29
+ end
30
+
31
+ def image
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
36
+ @image
37
+ end
38
+
39
+ def text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
45
+ end
46
+
47
+ protected
48
+
49
+ def find_title
28
50
  if json_ld.any?
29
51
  json_ld.each do |ld|
30
52
  return ld['headline'] if ld.has_key? 'headline'
31
53
  end
32
54
  end
33
- metainspector.best_title unless metainspector.best_title.blank?
34
- metainspector.title
55
+ begin
56
+ [
57
+ metainspector.best_title,
58
+ metainspector.title,
59
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
60
+ nokogiri.css('title')&.first&.inner_text
61
+ ].each do |possibility|
62
+ return possibility unless possibility.to_s.empty?
63
+ end
64
+ rescue
65
+ puts 'Curation::Page find_title error'
66
+ end
67
+ return ''
35
68
  end
36
69
 
37
- def image
38
- @image = find_image
39
- @image = @image.to_s.gsub('http://', 'https://')
40
- @image
70
+ def find_image
71
+ if json_ld.any?
72
+ json_ld.each do |ld|
73
+ if ld.has_key? 'image'
74
+ image_data = ld['image']
75
+ return image_data if image_data.is_a? String
76
+ if image_data.is_a? Array
77
+ first = image_data.first
78
+ return first if first.is_a? String
79
+ return first['url'] if first.is_a? Hash
80
+ end
81
+ return image_data['url'] if image_data.is_a? Hash
82
+ end
83
+ end
84
+ end
85
+ begin
86
+ [
87
+ metainspector.images.best,
88
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
89
+ ].each do |possibility|
90
+ return possibility unless possibility.to_s.empty?
91
+ end
92
+ rescue
93
+ puts 'Curation::Page find_image error'
94
+ end
95
+ return ''
41
96
  end
42
97
 
43
- def text
98
+ def find_text
44
99
  if json_ld.any?
45
100
  json_ld.each do |ld|
46
- next unless ld['@type'] == 'NewsArticle'
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
47
102
  return ld['text'] if ld.has_key? 'text'
48
103
  return ld['articleBody'] if ld.has_key? 'articleBody'
49
104
  end
@@ -59,24 +114,44 @@ module Curation
59
114
  text
60
115
  end
61
116
 
62
- protected
63
-
64
- def find_image
117
+ def find_date
65
118
  if json_ld.any?
66
119
  json_ld.each do |ld|
67
- if ld.has_key? 'image'
68
- image_data = ld['image']
69
- return image_data if image_data.is_a? String
70
- return image_data.first if image_data.is_a? Array
71
- return image_data['url'] if image_data.is_a? Hash
72
- end
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
73
122
  end
74
123
  end
75
- metainspector.images.best
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
76
149
  end
77
150
 
151
+ private
152
+
78
153
  def json_ld
79
- unless @json_ld
154
+ unless defined?(@json_ld)
80
155
  @json_ld = []
81
156
  begin
82
157
  options = nokogiri.css('[type="application/ld+json"]')
@@ -85,29 +160,57 @@ module Curation
85
160
  hash = JSON.parse(string)
86
161
  @json_ld << hash
87
162
  end
163
+ # Some sites have tables in tables
164
+ @json_ld.flatten!
88
165
  rescue
89
- puts "JSON LD error"
166
+ puts 'Curation::Page json_ld error'
90
167
  end
91
168
  end
92
169
  @json_ld
93
170
  end
94
171
 
172
+ def file
173
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
174
+ rescue
175
+ puts "Curation::Page file error with url #{url}"
176
+ end
177
+
95
178
  def html
96
- @html ||= URI.open url
179
+ unless @html
180
+ file.rewind
181
+ @html = file.read
182
+ file.rewind
183
+ end
184
+ @html
97
185
  rescue
98
- puts "Impossible to open #{url}"
186
+ puts "Curation::Page html error"
99
187
  end
100
188
 
101
189
  def nokogiri
102
- @nokogiri ||= Nokogiri::HTML html
190
+ unless @nokogiri
191
+ file.rewind
192
+ @nokogiri = Nokogiri::HTML file
193
+ file.rewind
194
+ end
195
+ @nokogiri
103
196
  rescue
104
- puts "Nokogiri error"
197
+ puts 'Curation::Page nokogiri error'
105
198
  end
106
199
 
107
200
  def metainspector
108
- @metainspector ||= MetaInspector.new url, document: html
201
+ unless @metainspector
202
+ @metainspector = html.nil? ? MetaInspector.new(url)
203
+ : MetaInspector.new(url, document: html)
204
+ end
205
+ @metainspector
206
+ rescue
207
+ puts 'Curation::Page metainspector error'
208
+ end
209
+
210
+ def metatags
211
+ @metatags ||= metainspector.meta_tag['name']
109
212
  rescue
110
- puts "MetaInspector error"
213
+ puts 'Curation::Page metatags error'
111
214
  end
112
215
  end
113
216
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.2"
2
+ VERSION = "1.7"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.2'
4
+ version: '1.7'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector