curation 1.4 → 1.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71f7502a62d7bbb12799e869c1e5c64bd3e514c7ed97190790d42675451d48d1
4
- data.tar.gz: 5cec6d6556dce75884b2e1a5e92e0d34744b623fcfb81b8b52684974a980f8b8
3
+ metadata.gz: b8086e74a861e147bce3a73dcec3cebef7d5d98ede440c911f57757feaaac354
4
+ data.tar.gz: 3d6ff271bcfbf8599a76653d60b2c26ed3832c1c174885d114912f95a7707ee7
5
5
  SHA512:
6
- metadata.gz: 39dabe191b310e192e663836e00b8e4c28c716685cf05e0d86c807cc968b43d525480c8488b1d93e68c30b931d2d30560e445d3693458f78f937c76e38d788eb
7
- data.tar.gz: ac5786df34fca74a9b7e6f336b630ce89baebf1720c01e48ac87c3f933457259276035587dbe0d922fa3096acb55cdfb392a1869b8c412eebaba214f66cd4f63
6
+ metadata.gz: '084c4c3fbdf3491530cd4ca6ca0fdc736df69a22a49cce40b0d7c8d169f6a45e7c5f01f280957016e32bedca93d593bb2d9e60141e148ee4f110935877543168'
7
+ data.tar.gz: 3fe95dde59a8cb2268a1f93ff1e8b7ccbca9212a7b0cdc0093c16925549545df422945db25ab29f250fcc05242b5d7b01eb822f3c97b4e2b96d19653d70dff39
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
data/Gemfile.lock CHANGED
@@ -1,57 +1,94 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.4)
4
+ curation (1.8)
5
5
  metainspector
6
6
  nokogiri
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- addressable (2.7.0)
11
+ addressable (2.8.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
- faraday (1.0.1)
16
- multipart-post (>= 1.2, < 3)
17
- faraday-cookie_jar (0.0.6)
18
- faraday (>= 0.7.4)
18
+ faraday (1.9.3)
19
+ faraday-em_http (~> 1.0)
20
+ faraday-em_synchrony (~> 1.0)
21
+ faraday-excon (~> 1.1)
22
+ faraday-httpclient (~> 1.0)
23
+ faraday-multipart (~> 1.0)
24
+ faraday-net_http (~> 1.0)
25
+ faraday-net_http_persistent (~> 1.0)
26
+ faraday-patron (~> 1.0)
27
+ faraday-rack (~> 1.0)
28
+ faraday-retry (~> 1.0)
29
+ ruby2_keywords (>= 0.0.4)
30
+ faraday-cookie_jar (0.0.7)
31
+ faraday (>= 0.8.0)
19
32
  http-cookie (~> 1.0.0)
33
+ faraday-em_http (1.0.0)
34
+ faraday-em_synchrony (1.0.0)
20
35
  faraday-encoding (0.0.5)
21
36
  faraday
37
+ faraday-excon (1.1.0)
22
38
  faraday-http-cache (2.2.0)
23
39
  faraday (>= 0.8)
24
- faraday_middleware (1.0.0)
40
+ faraday-httpclient (1.0.1)
41
+ faraday-multipart (1.0.3)
42
+ multipart-post (>= 1.2, < 3)
43
+ faraday-net_http (1.0.1)
44
+ faraday-net_http_persistent (1.2.0)
45
+ faraday-patron (1.0.0)
46
+ faraday-rack (1.0.0)
47
+ faraday-retry (1.0.3)
48
+ faraday_middleware (1.2.0)
25
49
  faraday (~> 1.0)
26
- fastimage (2.1.7)
27
- http-cookie (1.0.3)
50
+ fastimage (2.2.6)
51
+ http-cookie (1.0.4)
28
52
  domain_name (~> 0.5)
29
- metainspector (5.10.1)
30
- addressable (~> 2.7.0)
31
- faraday (~> 1.0.0)
32
- faraday-cookie_jar (~> 0.0.6)
33
- faraday-encoding (~> 0.0.5)
34
- faraday-http-cache (~> 2.2.0)
35
- faraday_middleware (~> 1.0.0)
36
- fastimage (~> 2.1.7)
37
- nesty (~> 1.0.2)
38
- nokogiri (~> 1.10.9)
39
- mini_portile2 (2.4.0)
53
+ metainspector (5.11.2)
54
+ addressable (~> 2.7)
55
+ faraday (~> 1.4)
56
+ faraday-cookie_jar (~> 0.0)
57
+ faraday-encoding (~> 0.0)
58
+ faraday-http-cache (~> 2.2)
59
+ faraday_middleware (~> 1.0)
60
+ fastimage (~> 2.2)
61
+ nesty (~> 1.0)
62
+ nokogiri (~> 1.11)
63
+ mini_portile2 (2.7.1)
64
+ minitest (5.15.0)
65
+ minitest-reporters (1.5.0)
66
+ ansi
67
+ builder
68
+ minitest (>= 5.0)
69
+ ruby-progressbar
40
70
  multipart-post (2.1.1)
41
71
  nesty (1.0.2)
42
- nokogiri (1.10.10)
43
- mini_portile2 (~> 2.4.0)
44
- public_suffix (4.0.5)
72
+ nokogiri (1.13.1)
73
+ mini_portile2 (~> 2.7.0)
74
+ racc (~> 1.4)
75
+ public_suffix (4.0.6)
76
+ racc (1.6.0)
45
77
  rake (12.3.3)
78
+ ruby-progressbar (1.11.0)
79
+ ruby2_keywords (0.0.5)
46
80
  unf (0.1.4)
47
81
  unf_ext
48
- unf_ext (0.0.7.7)
82
+ unf_ext (0.0.8)
49
83
 
50
84
  PLATFORMS
51
85
  ruby
52
86
 
53
87
  DEPENDENCIES
88
+ byebug
54
89
  curation!
90
+ minitest
91
+ minitest-reporters
55
92
  rake (~> 12.0)
56
93
 
57
94
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.4"
2
+ VERSION = "1.8"
3
3
  end
data/lib/curation.rb CHANGED
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -20,37 +20,28 @@ module Curation
20
20
  ]
21
21
 
22
22
  def initialize(url, html = nil)
23
- @url = url
23
+ @url = url.to_s.gsub('http://', 'https://')
24
24
  @html = html
25
25
  end
26
26
 
27
27
  def title
28
- @title = find_title
28
+ @title ||= find_title
29
29
  end
30
30
 
31
31
  def image
32
- @image = find_image
33
- @image = @image.to_s.gsub('http://', 'https://')
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
34
36
  @image
35
37
  end
36
38
 
37
39
  def text
38
- if json_ld.any?
39
- json_ld.each do |ld|
40
- next unless ld['@type'] == 'NewsArticle'
41
- return ld['text'] if ld.has_key? 'text'
42
- return ld['articleBody'] if ld.has_key? 'articleBody'
43
- end
44
- end
45
- h = nokogiri.dup
46
- BLACKLIST.each do |tag|
47
- h.css(tag).remove
48
- end
49
- nodes = h.css('p')
50
- nodes.xpath('//style').remove
51
- text = nodes.to_html
52
- text.gsub!('<br><br>', '<br>')
53
- text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
54
45
  end
55
46
 
56
47
  protected
@@ -61,13 +52,17 @@ module Curation
61
52
  return ld['headline'] if ld.has_key? 'headline'
62
53
  end
63
54
  end
64
- [
65
- metainspector.best_title,
66
- metainspector.title,
67
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
68
- nokogiri.css('title')&.first&.inner_text
69
- ].each do |possibility|
70
- return possibility unless possibility.blank?
55
+ begin
56
+ [
57
+ metainspector.best_title,
58
+ metainspector.title,
59
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
60
+ nokogiri.css('title')&.first&.inner_text
61
+ ].each do |possibility|
62
+ return possibility unless possibility.to_s.empty?
63
+ end
64
+ rescue
65
+ puts 'Curation::Page find_title error'
71
66
  end
72
67
  return ''
73
68
  end
@@ -87,48 +82,140 @@ module Curation
87
82
  end
88
83
  end
89
84
  end
90
- [
91
- metainspector.images.best,
92
- nokogiri.css('[property="og:image"]').first&.attributes["content"].value
93
- ].each do |possibility|
94
- return possibility unless possibility.blank?
85
+ begin
86
+ [
87
+ metainspector.images.best,
88
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
89
+ ].each do |possibility|
90
+ return possibility unless possibility.to_s.empty?
91
+ end
92
+ rescue
93
+ puts 'Curation::Page find_image error'
95
94
  end
96
95
  return ''
97
96
  end
98
97
 
98
+ def find_text
99
+ if json_ld.any?
100
+ json_ld.each do |ld|
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
102
+ return ld['text'] if ld.has_key? 'text'
103
+ return ld['articleBody'] if ld.has_key? 'articleBody'
104
+ end
105
+ end
106
+ h = nokogiri.dup
107
+ BLACKLIST.each do |tag|
108
+ h.css(tag).remove
109
+ end
110
+ nodes = h.css('p')
111
+ nodes.xpath('//style').remove
112
+ text = nodes.to_html
113
+ text.gsub!('<br><br>', '<br>')
114
+ text
115
+ end
116
+
117
+ def find_date
118
+ if json_ld.any?
119
+ json_ld.each do |ld|
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
122
+ end
123
+ end
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
149
+ end
150
+
151
+ private
152
+
99
153
  def json_ld
100
- unless @json_ld
154
+ unless defined?(@json_ld)
101
155
  @json_ld = []
102
156
  begin
103
157
  options = nokogiri.css('[type="application/ld+json"]')
104
158
  options.each do |option|
159
+ # require 'byebug'; byebug
105
160
  string = option.inner_text
106
161
  hash = JSON.parse(string)
107
162
  @json_ld << hash
108
163
  end
164
+ # Some sites have tables in tables
165
+ @json_ld.flatten!
109
166
  rescue
110
- puts "JSON LD error"
167
+ puts 'Curation::Page json_ld error'
111
168
  end
112
169
  end
113
170
  @json_ld
114
171
  end
115
172
 
173
+ def file
174
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
175
+ rescue
176
+ puts "Curation::Page file error with url #{url}"
177
+ end
178
+
116
179
  def html
117
- @html ||= URI.open url
180
+ unless @html
181
+ file.rewind
182
+ @html = file.read
183
+ file.rewind
184
+ end
185
+ @html
118
186
  rescue
119
- puts "Impossible to open #{url}"
187
+ puts "Curation::Page html error"
120
188
  end
121
189
 
122
190
  def nokogiri
123
- @nokogiri ||= Nokogiri::HTML html
191
+ unless @nokogiri
192
+ if file.nil?
193
+ @nokogiri = metainspector.parsed
194
+ else
195
+ file.rewind
196
+ @nokogiri = Nokogiri::HTML file
197
+ file.rewind
198
+ end
199
+ end
200
+ @nokogiri
124
201
  rescue
125
- puts "Nokogiri error"
202
+ puts 'Curation::Page nokogiri error'
126
203
  end
127
204
 
128
205
  def metainspector
129
- @metainspector ||= MetaInspector.new url, document: html
206
+ unless @metainspector
207
+ @metainspector = html.nil? ? MetaInspector.new(url)
208
+ : MetaInspector.new(url, document: html)
209
+ end
210
+ @metainspector
211
+ rescue
212
+ puts 'Curation::Page metainspector error'
213
+ end
214
+
215
+ def metatags
216
+ @metatags ||= metainspector.meta_tag['name']
130
217
  rescue
131
- puts "MetaInspector error"
218
+ puts 'Curation::Page metatags error'
132
219
  end
133
220
  end
134
221
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.4'
4
+ version: '1.8'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2022-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -65,7 +65,7 @@ licenses:
65
65
  metadata:
66
66
  homepage_uri: https://github.com/arnaudlevy/curation
67
67
  source_code_uri: https://github.com/arnaudlevy/curation
68
- post_install_message:
68
+ post_install_message:
69
69
  rdoc_options: []
70
70
  require_paths:
71
71
  - lib
@@ -80,8 +80,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
80
  - !ruby/object:Gem::Version
81
81
  version: '0'
82
82
  requirements: []
83
- rubygems_version: 3.0.3
84
- signing_key:
83
+ rubygems_version: 3.1.6
84
+ signing_key:
85
85
  specification_version: 4
86
86
  summary: Curation of content
87
87
  test_files: []