curation 1.4 → 1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +61 -24
- data/Rakefile +9 -1
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +128 -41
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b8086e74a861e147bce3a73dcec3cebef7d5d98ede440c911f57757feaaac354
|
|
4
|
+
data.tar.gz: 3d6ff271bcfbf8599a76653d60b2c26ed3832c1c174885d114912f95a7707ee7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '084c4c3fbdf3491530cd4ca6ca0fdc736df69a22a49cce40b0d7c8d169f6a45e7c5f01f280957016e32bedca93d593bb2d9e60141e148ee4f110935877543168'
|
|
7
|
+
data.tar.gz: 3fe95dde59a8cb2268a1f93ff1e8b7ccbca9212a7b0cdc0093c16925549545df422945db25ab29f250fcc05242b5d7b01eb822f3c97b4e2b96d19653d70dff39
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,57 +1,94 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
curation (1.
|
|
4
|
+
curation (1.8)
|
|
5
5
|
metainspector
|
|
6
6
|
nokogiri
|
|
7
7
|
|
|
8
8
|
GEM
|
|
9
9
|
remote: https://rubygems.org/
|
|
10
10
|
specs:
|
|
11
|
-
addressable (2.
|
|
11
|
+
addressable (2.8.0)
|
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
|
13
|
+
ansi (1.5.0)
|
|
14
|
+
builder (3.2.4)
|
|
15
|
+
byebug (11.1.3)
|
|
13
16
|
domain_name (0.5.20190701)
|
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
|
15
|
-
faraday (1.
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
faraday (
|
|
18
|
+
faraday (1.9.3)
|
|
19
|
+
faraday-em_http (~> 1.0)
|
|
20
|
+
faraday-em_synchrony (~> 1.0)
|
|
21
|
+
faraday-excon (~> 1.1)
|
|
22
|
+
faraday-httpclient (~> 1.0)
|
|
23
|
+
faraday-multipart (~> 1.0)
|
|
24
|
+
faraday-net_http (~> 1.0)
|
|
25
|
+
faraday-net_http_persistent (~> 1.0)
|
|
26
|
+
faraday-patron (~> 1.0)
|
|
27
|
+
faraday-rack (~> 1.0)
|
|
28
|
+
faraday-retry (~> 1.0)
|
|
29
|
+
ruby2_keywords (>= 0.0.4)
|
|
30
|
+
faraday-cookie_jar (0.0.7)
|
|
31
|
+
faraday (>= 0.8.0)
|
|
19
32
|
http-cookie (~> 1.0.0)
|
|
33
|
+
faraday-em_http (1.0.0)
|
|
34
|
+
faraday-em_synchrony (1.0.0)
|
|
20
35
|
faraday-encoding (0.0.5)
|
|
21
36
|
faraday
|
|
37
|
+
faraday-excon (1.1.0)
|
|
22
38
|
faraday-http-cache (2.2.0)
|
|
23
39
|
faraday (>= 0.8)
|
|
24
|
-
|
|
40
|
+
faraday-httpclient (1.0.1)
|
|
41
|
+
faraday-multipart (1.0.3)
|
|
42
|
+
multipart-post (>= 1.2, < 3)
|
|
43
|
+
faraday-net_http (1.0.1)
|
|
44
|
+
faraday-net_http_persistent (1.2.0)
|
|
45
|
+
faraday-patron (1.0.0)
|
|
46
|
+
faraday-rack (1.0.0)
|
|
47
|
+
faraday-retry (1.0.3)
|
|
48
|
+
faraday_middleware (1.2.0)
|
|
25
49
|
faraday (~> 1.0)
|
|
26
|
-
fastimage (2.
|
|
27
|
-
http-cookie (1.0.
|
|
50
|
+
fastimage (2.2.6)
|
|
51
|
+
http-cookie (1.0.4)
|
|
28
52
|
domain_name (~> 0.5)
|
|
29
|
-
metainspector (5.
|
|
30
|
-
addressable (~> 2.7
|
|
31
|
-
faraday (~> 1.
|
|
32
|
-
faraday-cookie_jar (~> 0.0
|
|
33
|
-
faraday-encoding (~> 0.0
|
|
34
|
-
faraday-http-cache (~> 2.2
|
|
35
|
-
faraday_middleware (~> 1.0
|
|
36
|
-
fastimage (~> 2.
|
|
37
|
-
nesty (~> 1.0
|
|
38
|
-
nokogiri (~> 1.
|
|
39
|
-
mini_portile2 (2.
|
|
53
|
+
metainspector (5.11.2)
|
|
54
|
+
addressable (~> 2.7)
|
|
55
|
+
faraday (~> 1.4)
|
|
56
|
+
faraday-cookie_jar (~> 0.0)
|
|
57
|
+
faraday-encoding (~> 0.0)
|
|
58
|
+
faraday-http-cache (~> 2.2)
|
|
59
|
+
faraday_middleware (~> 1.0)
|
|
60
|
+
fastimage (~> 2.2)
|
|
61
|
+
nesty (~> 1.0)
|
|
62
|
+
nokogiri (~> 1.11)
|
|
63
|
+
mini_portile2 (2.7.1)
|
|
64
|
+
minitest (5.15.0)
|
|
65
|
+
minitest-reporters (1.5.0)
|
|
66
|
+
ansi
|
|
67
|
+
builder
|
|
68
|
+
minitest (>= 5.0)
|
|
69
|
+
ruby-progressbar
|
|
40
70
|
multipart-post (2.1.1)
|
|
41
71
|
nesty (1.0.2)
|
|
42
|
-
nokogiri (1.
|
|
43
|
-
mini_portile2 (~> 2.
|
|
44
|
-
|
|
72
|
+
nokogiri (1.13.1)
|
|
73
|
+
mini_portile2 (~> 2.7.0)
|
|
74
|
+
racc (~> 1.4)
|
|
75
|
+
public_suffix (4.0.6)
|
|
76
|
+
racc (1.6.0)
|
|
45
77
|
rake (12.3.3)
|
|
78
|
+
ruby-progressbar (1.11.0)
|
|
79
|
+
ruby2_keywords (0.0.5)
|
|
46
80
|
unf (0.1.4)
|
|
47
81
|
unf_ext
|
|
48
|
-
unf_ext (0.0.
|
|
82
|
+
unf_ext (0.0.8)
|
|
49
83
|
|
|
50
84
|
PLATFORMS
|
|
51
85
|
ruby
|
|
52
86
|
|
|
53
87
|
DEPENDENCIES
|
|
88
|
+
byebug
|
|
54
89
|
curation!
|
|
90
|
+
minitest
|
|
91
|
+
minitest-reporters
|
|
55
92
|
rake (~> 12.0)
|
|
56
93
|
|
|
57
94
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
|
@@ -6,7 +6,7 @@ module Curation
|
|
|
6
6
|
class Error < StandardError; end
|
|
7
7
|
|
|
8
8
|
class Page
|
|
9
|
-
attr_reader :url
|
|
9
|
+
attr_reader :url
|
|
10
10
|
|
|
11
11
|
BLACKLIST = [
|
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
|
@@ -20,37 +20,28 @@ module Curation
|
|
|
20
20
|
]
|
|
21
21
|
|
|
22
22
|
def initialize(url, html = nil)
|
|
23
|
-
@url = url
|
|
23
|
+
@url = url.to_s.gsub('http://', 'https://')
|
|
24
24
|
@html = html
|
|
25
25
|
end
|
|
26
26
|
|
|
27
27
|
def title
|
|
28
|
-
@title
|
|
28
|
+
@title ||= find_title
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
def image
|
|
32
|
-
@image
|
|
33
|
-
|
|
32
|
+
unless @image
|
|
33
|
+
@image = find_image
|
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
|
35
|
+
end
|
|
34
36
|
@image
|
|
35
37
|
end
|
|
36
38
|
|
|
37
39
|
def text
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
h = nokogiri.dup
|
|
46
|
-
BLACKLIST.each do |tag|
|
|
47
|
-
h.css(tag).remove
|
|
48
|
-
end
|
|
49
|
-
nodes = h.css('p')
|
|
50
|
-
nodes.xpath('//style').remove
|
|
51
|
-
text = nodes.to_html
|
|
52
|
-
text.gsub!('<br><br>', '<br>')
|
|
53
|
-
text
|
|
40
|
+
@text ||= find_text
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def date
|
|
44
|
+
@date ||= find_date
|
|
54
45
|
end
|
|
55
46
|
|
|
56
47
|
protected
|
|
@@ -61,13 +52,17 @@ module Curation
|
|
|
61
52
|
return ld['headline'] if ld.has_key? 'headline'
|
|
62
53
|
end
|
|
63
54
|
end
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
55
|
+
begin
|
|
56
|
+
[
|
|
57
|
+
metainspector.best_title,
|
|
58
|
+
metainspector.title,
|
|
59
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
|
60
|
+
nokogiri.css('title')&.first&.inner_text
|
|
61
|
+
].each do |possibility|
|
|
62
|
+
return possibility unless possibility.to_s.empty?
|
|
63
|
+
end
|
|
64
|
+
rescue
|
|
65
|
+
puts 'Curation::Page find_title error'
|
|
71
66
|
end
|
|
72
67
|
return ''
|
|
73
68
|
end
|
|
@@ -87,48 +82,140 @@ module Curation
|
|
|
87
82
|
end
|
|
88
83
|
end
|
|
89
84
|
end
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
85
|
+
begin
|
|
86
|
+
[
|
|
87
|
+
metainspector.images.best,
|
|
88
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
|
89
|
+
].each do |possibility|
|
|
90
|
+
return possibility unless possibility.to_s.empty?
|
|
91
|
+
end
|
|
92
|
+
rescue
|
|
93
|
+
puts 'Curation::Page find_image error'
|
|
95
94
|
end
|
|
96
95
|
return ''
|
|
97
96
|
end
|
|
98
97
|
|
|
98
|
+
def find_text
|
|
99
|
+
if json_ld.any?
|
|
100
|
+
json_ld.each do |ld|
|
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
|
102
|
+
return ld['text'] if ld.has_key? 'text'
|
|
103
|
+
return ld['articleBody'] if ld.has_key? 'articleBody'
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
h = nokogiri.dup
|
|
107
|
+
BLACKLIST.each do |tag|
|
|
108
|
+
h.css(tag).remove
|
|
109
|
+
end
|
|
110
|
+
nodes = h.css('p')
|
|
111
|
+
nodes.xpath('//style').remove
|
|
112
|
+
text = nodes.to_html
|
|
113
|
+
text.gsub!('<br><br>', '<br>')
|
|
114
|
+
text
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def find_date
|
|
118
|
+
if json_ld.any?
|
|
119
|
+
json_ld.each do |ld|
|
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
|
122
|
+
end
|
|
123
|
+
end
|
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
|
128
|
+
chunks = html.split('DisplayDate')
|
|
129
|
+
if chunks.count > 1
|
|
130
|
+
value = chunks[1]
|
|
131
|
+
value = value.split(',').first
|
|
132
|
+
value = value.gsub('"', '')
|
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
|
134
|
+
return Date.parse value rescue nil
|
|
135
|
+
end
|
|
136
|
+
begin
|
|
137
|
+
value = nokogiri.css('.postDate').first
|
|
138
|
+
value = value.inner_text
|
|
139
|
+
value = value.gsub(' — ', '')
|
|
140
|
+
return Date.parse value
|
|
141
|
+
rescue
|
|
142
|
+
end
|
|
143
|
+
begin
|
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
|
145
|
+
value = value.inner_text
|
|
146
|
+
return Date.parse value
|
|
147
|
+
rescue
|
|
148
|
+
end
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
private
|
|
152
|
+
|
|
99
153
|
def json_ld
|
|
100
|
-
unless @json_ld
|
|
154
|
+
unless defined?(@json_ld)
|
|
101
155
|
@json_ld = []
|
|
102
156
|
begin
|
|
103
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
|
104
158
|
options.each do |option|
|
|
159
|
+
# require 'byebug'; byebug
|
|
105
160
|
string = option.inner_text
|
|
106
161
|
hash = JSON.parse(string)
|
|
107
162
|
@json_ld << hash
|
|
108
163
|
end
|
|
164
|
+
# Some sites have tables in tables
|
|
165
|
+
@json_ld.flatten!
|
|
109
166
|
rescue
|
|
110
|
-
puts
|
|
167
|
+
puts 'Curation::Page json_ld error'
|
|
111
168
|
end
|
|
112
169
|
end
|
|
113
170
|
@json_ld
|
|
114
171
|
end
|
|
115
172
|
|
|
173
|
+
def file
|
|
174
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
|
175
|
+
rescue
|
|
176
|
+
puts "Curation::Page file error with url #{url}"
|
|
177
|
+
end
|
|
178
|
+
|
|
116
179
|
def html
|
|
117
|
-
@html
|
|
180
|
+
unless @html
|
|
181
|
+
file.rewind
|
|
182
|
+
@html = file.read
|
|
183
|
+
file.rewind
|
|
184
|
+
end
|
|
185
|
+
@html
|
|
118
186
|
rescue
|
|
119
|
-
puts "
|
|
187
|
+
puts "Curation::Page html error"
|
|
120
188
|
end
|
|
121
189
|
|
|
122
190
|
def nokogiri
|
|
123
|
-
@nokogiri
|
|
191
|
+
unless @nokogiri
|
|
192
|
+
if file.nil?
|
|
193
|
+
@nokogiri = metainspector.parsed
|
|
194
|
+
else
|
|
195
|
+
file.rewind
|
|
196
|
+
@nokogiri = Nokogiri::HTML file
|
|
197
|
+
file.rewind
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
@nokogiri
|
|
124
201
|
rescue
|
|
125
|
-
puts
|
|
202
|
+
puts 'Curation::Page nokogiri error'
|
|
126
203
|
end
|
|
127
204
|
|
|
128
205
|
def metainspector
|
|
129
|
-
@metainspector
|
|
206
|
+
unless @metainspector
|
|
207
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
|
208
|
+
: MetaInspector.new(url, document: html)
|
|
209
|
+
end
|
|
210
|
+
@metainspector
|
|
211
|
+
rescue
|
|
212
|
+
puts 'Curation::Page metainspector error'
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
def metatags
|
|
216
|
+
@metatags ||= metainspector.meta_tag['name']
|
|
130
217
|
rescue
|
|
131
|
-
puts
|
|
218
|
+
puts 'Curation::Page metatags error'
|
|
132
219
|
end
|
|
133
220
|
end
|
|
134
221
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: curation
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: '1.
|
|
4
|
+
version: '1.8'
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Arnaud Levy
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2022-02-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: metainspector
|
|
@@ -65,7 +65,7 @@ licenses:
|
|
|
65
65
|
metadata:
|
|
66
66
|
homepage_uri: https://github.com/arnaudlevy/curation
|
|
67
67
|
source_code_uri: https://github.com/arnaudlevy/curation
|
|
68
|
-
post_install_message:
|
|
68
|
+
post_install_message:
|
|
69
69
|
rdoc_options: []
|
|
70
70
|
require_paths:
|
|
71
71
|
- lib
|
|
@@ -80,8 +80,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
80
80
|
- !ruby/object:Gem::Version
|
|
81
81
|
version: '0'
|
|
82
82
|
requirements: []
|
|
83
|
-
rubygems_version: 3.
|
|
84
|
-
signing_key:
|
|
83
|
+
rubygems_version: 3.1.6
|
|
84
|
+
signing_key:
|
|
85
85
|
specification_version: 4
|
|
86
86
|
summary: Curation of content
|
|
87
87
|
test_files: []
|