curation 1.4 → 1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 71f7502a62d7bbb12799e869c1e5c64bd3e514c7ed97190790d42675451d48d1
4
- data.tar.gz: 5cec6d6556dce75884b2e1a5e92e0d34744b623fcfb81b8b52684974a980f8b8
3
+ metadata.gz: b8086e74a861e147bce3a73dcec3cebef7d5d98ede440c911f57757feaaac354
4
+ data.tar.gz: 3d6ff271bcfbf8599a76653d60b2c26ed3832c1c174885d114912f95a7707ee7
5
5
  SHA512:
6
- metadata.gz: 39dabe191b310e192e663836e00b8e4c28c716685cf05e0d86c807cc968b43d525480c8488b1d93e68c30b931d2d30560e445d3693458f78f937c76e38d788eb
7
- data.tar.gz: ac5786df34fca74a9b7e6f336b630ce89baebf1720c01e48ac87c3f933457259276035587dbe0d922fa3096acb55cdfb392a1869b8c412eebaba214f66cd4f63
6
+ metadata.gz: '084c4c3fbdf3491530cd4ca6ca0fdc736df69a22a49cce40b0d7c8d169f6a45e7c5f01f280957016e32bedca93d593bb2d9e60141e148ee4f110935877543168'
7
+ data.tar.gz: 3fe95dde59a8cb2268a1f93ff1e8b7ccbca9212a7b0cdc0093c16925549545df422945db25ab29f250fcc05242b5d7b01eb822f3c97b4e2b96d19653d70dff39
data/.gitignore CHANGED
@@ -6,3 +6,4 @@
6
6
  /pkg/
7
7
  /spec/reports/
8
8
  /tmp/
9
+ .byebug_history
data/Gemfile CHANGED
@@ -3,4 +3,7 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in curation.gemspec
4
4
  gemspec
5
5
 
6
- gem "rake", "~> 12.0"
6
+ gem 'rake', '~> 12.0'
7
+ gem 'minitest'
8
+ gem 'minitest-reporters'
9
+ gem 'byebug'
data/Gemfile.lock CHANGED
@@ -1,57 +1,94 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.4)
4
+ curation (1.8)
5
5
  metainspector
6
6
  nokogiri
7
7
 
8
8
  GEM
9
9
  remote: https://rubygems.org/
10
10
  specs:
11
- addressable (2.7.0)
11
+ addressable (2.8.0)
12
12
  public_suffix (>= 2.0.2, < 5.0)
13
+ ansi (1.5.0)
14
+ builder (3.2.4)
15
+ byebug (11.1.3)
13
16
  domain_name (0.5.20190701)
14
17
  unf (>= 0.0.5, < 1.0.0)
15
- faraday (1.0.1)
16
- multipart-post (>= 1.2, < 3)
17
- faraday-cookie_jar (0.0.6)
18
- faraday (>= 0.7.4)
18
+ faraday (1.9.3)
19
+ faraday-em_http (~> 1.0)
20
+ faraday-em_synchrony (~> 1.0)
21
+ faraday-excon (~> 1.1)
22
+ faraday-httpclient (~> 1.0)
23
+ faraday-multipart (~> 1.0)
24
+ faraday-net_http (~> 1.0)
25
+ faraday-net_http_persistent (~> 1.0)
26
+ faraday-patron (~> 1.0)
27
+ faraday-rack (~> 1.0)
28
+ faraday-retry (~> 1.0)
29
+ ruby2_keywords (>= 0.0.4)
30
+ faraday-cookie_jar (0.0.7)
31
+ faraday (>= 0.8.0)
19
32
  http-cookie (~> 1.0.0)
33
+ faraday-em_http (1.0.0)
34
+ faraday-em_synchrony (1.0.0)
20
35
  faraday-encoding (0.0.5)
21
36
  faraday
37
+ faraday-excon (1.1.0)
22
38
  faraday-http-cache (2.2.0)
23
39
  faraday (>= 0.8)
24
- faraday_middleware (1.0.0)
40
+ faraday-httpclient (1.0.1)
41
+ faraday-multipart (1.0.3)
42
+ multipart-post (>= 1.2, < 3)
43
+ faraday-net_http (1.0.1)
44
+ faraday-net_http_persistent (1.2.0)
45
+ faraday-patron (1.0.0)
46
+ faraday-rack (1.0.0)
47
+ faraday-retry (1.0.3)
48
+ faraday_middleware (1.2.0)
25
49
  faraday (~> 1.0)
26
- fastimage (2.1.7)
27
- http-cookie (1.0.3)
50
+ fastimage (2.2.6)
51
+ http-cookie (1.0.4)
28
52
  domain_name (~> 0.5)
29
- metainspector (5.10.1)
30
- addressable (~> 2.7.0)
31
- faraday (~> 1.0.0)
32
- faraday-cookie_jar (~> 0.0.6)
33
- faraday-encoding (~> 0.0.5)
34
- faraday-http-cache (~> 2.2.0)
35
- faraday_middleware (~> 1.0.0)
36
- fastimage (~> 2.1.7)
37
- nesty (~> 1.0.2)
38
- nokogiri (~> 1.10.9)
39
- mini_portile2 (2.4.0)
53
+ metainspector (5.11.2)
54
+ addressable (~> 2.7)
55
+ faraday (~> 1.4)
56
+ faraday-cookie_jar (~> 0.0)
57
+ faraday-encoding (~> 0.0)
58
+ faraday-http-cache (~> 2.2)
59
+ faraday_middleware (~> 1.0)
60
+ fastimage (~> 2.2)
61
+ nesty (~> 1.0)
62
+ nokogiri (~> 1.11)
63
+ mini_portile2 (2.7.1)
64
+ minitest (5.15.0)
65
+ minitest-reporters (1.5.0)
66
+ ansi
67
+ builder
68
+ minitest (>= 5.0)
69
+ ruby-progressbar
40
70
  multipart-post (2.1.1)
41
71
  nesty (1.0.2)
42
- nokogiri (1.10.10)
43
- mini_portile2 (~> 2.4.0)
44
- public_suffix (4.0.5)
72
+ nokogiri (1.13.1)
73
+ mini_portile2 (~> 2.7.0)
74
+ racc (~> 1.4)
75
+ public_suffix (4.0.6)
76
+ racc (1.6.0)
45
77
  rake (12.3.3)
78
+ ruby-progressbar (1.11.0)
79
+ ruby2_keywords (0.0.5)
46
80
  unf (0.1.4)
47
81
  unf_ext
48
- unf_ext (0.0.7.7)
82
+ unf_ext (0.0.8)
49
83
 
50
84
  PLATFORMS
51
85
  ruby
52
86
 
53
87
  DEPENDENCIES
88
+ byebug
54
89
  curation!
90
+ minitest
91
+ minitest-reporters
55
92
  rake (~> 12.0)
56
93
 
57
94
  BUNDLED WITH
data/Rakefile CHANGED
@@ -1,2 +1,10 @@
1
1
  require "bundler/gem_tasks"
2
- task :default => :spec
2
+ require "rake/testtask"
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.libs << "test"
6
+ t.test_files = FileList["test/**/*_spec.rb"]
7
+ t.warning = false
8
+ end
9
+
10
+ task default: :test
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.4"
2
+ VERSION = "1.8"
3
3
  end
data/lib/curation.rb CHANGED
@@ -6,7 +6,7 @@ module Curation
6
6
  class Error < StandardError; end
7
7
 
8
8
  class Page
9
- attr_reader :url, :title, :text, :image
9
+ attr_reader :url
10
10
 
11
11
  BLACKLIST = [
12
12
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
@@ -20,37 +20,28 @@ module Curation
20
20
  ]
21
21
 
22
22
  def initialize(url, html = nil)
23
- @url = url
23
+ @url = url.to_s.gsub('http://', 'https://')
24
24
  @html = html
25
25
  end
26
26
 
27
27
  def title
28
- @title = find_title
28
+ @title ||= find_title
29
29
  end
30
30
 
31
31
  def image
32
- @image = find_image
33
- @image = @image.to_s.gsub('http://', 'https://')
32
+ unless @image
33
+ @image = find_image
34
+ @image = @image.to_s.gsub('http://', 'https://')
35
+ end
34
36
  @image
35
37
  end
36
38
 
37
39
  def text
38
- if json_ld.any?
39
- json_ld.each do |ld|
40
- next unless ld['@type'] == 'NewsArticle'
41
- return ld['text'] if ld.has_key? 'text'
42
- return ld['articleBody'] if ld.has_key? 'articleBody'
43
- end
44
- end
45
- h = nokogiri.dup
46
- BLACKLIST.each do |tag|
47
- h.css(tag).remove
48
- end
49
- nodes = h.css('p')
50
- nodes.xpath('//style').remove
51
- text = nodes.to_html
52
- text.gsub!('<br><br>', '<br>')
53
- text
40
+ @text ||= find_text
41
+ end
42
+
43
+ def date
44
+ @date ||= find_date
54
45
  end
55
46
 
56
47
  protected
@@ -61,13 +52,17 @@ module Curation
61
52
  return ld['headline'] if ld.has_key? 'headline'
62
53
  end
63
54
  end
64
- [
65
- metainspector.best_title,
66
- metainspector.title,
67
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
68
- nokogiri.css('title')&.first&.inner_text
69
- ].each do |possibility|
70
- return possibility unless possibility.blank?
55
+ begin
56
+ [
57
+ metainspector.best_title,
58
+ metainspector.title,
59
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
60
+ nokogiri.css('title')&.first&.inner_text
61
+ ].each do |possibility|
62
+ return possibility unless possibility.to_s.empty?
63
+ end
64
+ rescue
65
+ puts 'Curation::Page find_title error'
71
66
  end
72
67
  return ''
73
68
  end
@@ -87,48 +82,140 @@ module Curation
87
82
  end
88
83
  end
89
84
  end
90
- [
91
- metainspector.images.best,
92
- nokogiri.css('[property="og:image"]').first&.attributes["content"].value
93
- ].each do |possibility|
94
- return possibility unless possibility.blank?
85
+ begin
86
+ [
87
+ metainspector.images.best,
88
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
89
+ ].each do |possibility|
90
+ return possibility unless possibility.to_s.empty?
91
+ end
92
+ rescue
93
+ puts 'Curation::Page find_image error'
95
94
  end
96
95
  return ''
97
96
  end
98
97
 
98
+ def find_text
99
+ if json_ld.any?
100
+ json_ld.each do |ld|
101
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
102
+ return ld['text'] if ld.has_key? 'text'
103
+ return ld['articleBody'] if ld.has_key? 'articleBody'
104
+ end
105
+ end
106
+ h = nokogiri.dup
107
+ BLACKLIST.each do |tag|
108
+ h.css(tag).remove
109
+ end
110
+ nodes = h.css('p')
111
+ nodes.xpath('//style').remove
112
+ text = nodes.to_html
113
+ text.gsub!('<br><br>', '<br>')
114
+ text
115
+ end
116
+
117
+ def find_date
118
+ if json_ld.any?
119
+ json_ld.each do |ld|
120
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
121
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
122
+ end
123
+ end
124
+ return Date.parse metatags['date'] rescue nil
125
+ return Date.parse metatags['pubdate'] rescue nil
126
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
127
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
128
+ chunks = html.split('DisplayDate')
129
+ if chunks.count > 1
130
+ value = chunks[1]
131
+ value = value.split(',').first
132
+ value = value.gsub('"', '')
133
+ value = value[1..-1] if value[0] == ':'
134
+ return Date.parse value rescue nil
135
+ end
136
+ begin
137
+ value = nokogiri.css('.postDate').first
138
+ value = value.inner_text
139
+ value = value.gsub(' — ', '')
140
+ return Date.parse value
141
+ rescue
142
+ end
143
+ begin
144
+ value = nokogiri.css('.gta_post_date').first
145
+ value = value.inner_text
146
+ return Date.parse value
147
+ rescue
148
+ end
149
+ end
150
+
151
+ private
152
+
99
153
  def json_ld
100
- unless @json_ld
154
+ unless defined?(@json_ld)
101
155
  @json_ld = []
102
156
  begin
103
157
  options = nokogiri.css('[type="application/ld+json"]')
104
158
  options.each do |option|
159
+ # require 'byebug'; byebug
105
160
  string = option.inner_text
106
161
  hash = JSON.parse(string)
107
162
  @json_ld << hash
108
163
  end
164
+ # Some sites have tables in tables
165
+ @json_ld.flatten!
109
166
  rescue
110
- puts "JSON LD error"
167
+ puts 'Curation::Page json_ld error'
111
168
  end
112
169
  end
113
170
  @json_ld
114
171
  end
115
172
 
173
+ def file
174
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
175
+ rescue
176
+ puts "Curation::Page file error with url #{url}"
177
+ end
178
+
116
179
  def html
117
- @html ||= URI.open url
180
+ unless @html
181
+ file.rewind
182
+ @html = file.read
183
+ file.rewind
184
+ end
185
+ @html
118
186
  rescue
119
- puts "Impossible to open #{url}"
187
+ puts "Curation::Page html error"
120
188
  end
121
189
 
122
190
  def nokogiri
123
- @nokogiri ||= Nokogiri::HTML html
191
+ unless @nokogiri
192
+ if file.nil?
193
+ @nokogiri = metainspector.parsed
194
+ else
195
+ file.rewind
196
+ @nokogiri = Nokogiri::HTML file
197
+ file.rewind
198
+ end
199
+ end
200
+ @nokogiri
124
201
  rescue
125
- puts "Nokogiri error"
202
+ puts 'Curation::Page nokogiri error'
126
203
  end
127
204
 
128
205
  def metainspector
129
- @metainspector ||= MetaInspector.new url, document: html
206
+ unless @metainspector
207
+ @metainspector = html.nil? ? MetaInspector.new(url)
208
+ : MetaInspector.new(url, document: html)
209
+ end
210
+ @metainspector
211
+ rescue
212
+ puts 'Curation::Page metainspector error'
213
+ end
214
+
215
+ def metatags
216
+ @metatags ||= metainspector.meta_tag['name']
130
217
  rescue
131
- puts "MetaInspector error"
218
+ puts 'Curation::Page metatags error'
132
219
  end
133
220
  end
134
221
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.4'
4
+ version: '1.8'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-22 00:00:00.000000000 Z
11
+ date: 2022-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -65,7 +65,7 @@ licenses:
65
65
  metadata:
66
66
  homepage_uri: https://github.com/arnaudlevy/curation
67
67
  source_code_uri: https://github.com/arnaudlevy/curation
68
- post_install_message:
68
+ post_install_message:
69
69
  rdoc_options: []
70
70
  require_paths:
71
71
  - lib
@@ -80,8 +80,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
80
80
  - !ruby/object:Gem::Version
81
81
  version: '0'
82
82
  requirements: []
83
- rubygems_version: 3.0.3
84
- signing_key:
83
+ rubygems_version: 3.1.6
84
+ signing_key:
85
85
  specification_version: 4
86
86
  summary: Curation of content
87
87
  test_files: []