curation 1.4 → 1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +4 -1
- data/Gemfile.lock +61 -24
- data/Rakefile +9 -1
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +128 -41
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8086e74a861e147bce3a73dcec3cebef7d5d98ede440c911f57757feaaac354
|
4
|
+
data.tar.gz: 3d6ff271bcfbf8599a76653d60b2c26ed3832c1c174885d114912f95a7707ee7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '084c4c3fbdf3491530cd4ca6ca0fdc736df69a22a49cce40b0d7c8d169f6a45e7c5f01f280957016e32bedca93d593bb2d9e60141e148ee4f110935877543168'
|
7
|
+
data.tar.gz: 3fe95dde59a8cb2268a1f93ff1e8b7ccbca9212a7b0cdc0093c16925549545df422945db25ab29f250fcc05242b5d7b01eb822f3c97b4e2b96d19653d70dff39
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,57 +1,94 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.
|
4
|
+
curation (1.8)
|
5
5
|
metainspector
|
6
6
|
nokogiri
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
addressable (2.
|
11
|
+
addressable (2.8.0)
|
12
12
|
public_suffix (>= 2.0.2, < 5.0)
|
13
|
+
ansi (1.5.0)
|
14
|
+
builder (3.2.4)
|
15
|
+
byebug (11.1.3)
|
13
16
|
domain_name (0.5.20190701)
|
14
17
|
unf (>= 0.0.5, < 1.0.0)
|
15
|
-
faraday (1.
|
16
|
-
|
17
|
-
|
18
|
-
faraday (
|
18
|
+
faraday (1.9.3)
|
19
|
+
faraday-em_http (~> 1.0)
|
20
|
+
faraday-em_synchrony (~> 1.0)
|
21
|
+
faraday-excon (~> 1.1)
|
22
|
+
faraday-httpclient (~> 1.0)
|
23
|
+
faraday-multipart (~> 1.0)
|
24
|
+
faraday-net_http (~> 1.0)
|
25
|
+
faraday-net_http_persistent (~> 1.0)
|
26
|
+
faraday-patron (~> 1.0)
|
27
|
+
faraday-rack (~> 1.0)
|
28
|
+
faraday-retry (~> 1.0)
|
29
|
+
ruby2_keywords (>= 0.0.4)
|
30
|
+
faraday-cookie_jar (0.0.7)
|
31
|
+
faraday (>= 0.8.0)
|
19
32
|
http-cookie (~> 1.0.0)
|
33
|
+
faraday-em_http (1.0.0)
|
34
|
+
faraday-em_synchrony (1.0.0)
|
20
35
|
faraday-encoding (0.0.5)
|
21
36
|
faraday
|
37
|
+
faraday-excon (1.1.0)
|
22
38
|
faraday-http-cache (2.2.0)
|
23
39
|
faraday (>= 0.8)
|
24
|
-
|
40
|
+
faraday-httpclient (1.0.1)
|
41
|
+
faraday-multipart (1.0.3)
|
42
|
+
multipart-post (>= 1.2, < 3)
|
43
|
+
faraday-net_http (1.0.1)
|
44
|
+
faraday-net_http_persistent (1.2.0)
|
45
|
+
faraday-patron (1.0.0)
|
46
|
+
faraday-rack (1.0.0)
|
47
|
+
faraday-retry (1.0.3)
|
48
|
+
faraday_middleware (1.2.0)
|
25
49
|
faraday (~> 1.0)
|
26
|
-
fastimage (2.
|
27
|
-
http-cookie (1.0.
|
50
|
+
fastimage (2.2.6)
|
51
|
+
http-cookie (1.0.4)
|
28
52
|
domain_name (~> 0.5)
|
29
|
-
metainspector (5.
|
30
|
-
addressable (~> 2.7
|
31
|
-
faraday (~> 1.
|
32
|
-
faraday-cookie_jar (~> 0.0
|
33
|
-
faraday-encoding (~> 0.0
|
34
|
-
faraday-http-cache (~> 2.2
|
35
|
-
faraday_middleware (~> 1.0
|
36
|
-
fastimage (~> 2.
|
37
|
-
nesty (~> 1.0
|
38
|
-
nokogiri (~> 1.
|
39
|
-
mini_portile2 (2.
|
53
|
+
metainspector (5.11.2)
|
54
|
+
addressable (~> 2.7)
|
55
|
+
faraday (~> 1.4)
|
56
|
+
faraday-cookie_jar (~> 0.0)
|
57
|
+
faraday-encoding (~> 0.0)
|
58
|
+
faraday-http-cache (~> 2.2)
|
59
|
+
faraday_middleware (~> 1.0)
|
60
|
+
fastimage (~> 2.2)
|
61
|
+
nesty (~> 1.0)
|
62
|
+
nokogiri (~> 1.11)
|
63
|
+
mini_portile2 (2.7.1)
|
64
|
+
minitest (5.15.0)
|
65
|
+
minitest-reporters (1.5.0)
|
66
|
+
ansi
|
67
|
+
builder
|
68
|
+
minitest (>= 5.0)
|
69
|
+
ruby-progressbar
|
40
70
|
multipart-post (2.1.1)
|
41
71
|
nesty (1.0.2)
|
42
|
-
nokogiri (1.
|
43
|
-
mini_portile2 (~> 2.
|
44
|
-
|
72
|
+
nokogiri (1.13.1)
|
73
|
+
mini_portile2 (~> 2.7.0)
|
74
|
+
racc (~> 1.4)
|
75
|
+
public_suffix (4.0.6)
|
76
|
+
racc (1.6.0)
|
45
77
|
rake (12.3.3)
|
78
|
+
ruby-progressbar (1.11.0)
|
79
|
+
ruby2_keywords (0.0.5)
|
46
80
|
unf (0.1.4)
|
47
81
|
unf_ext
|
48
|
-
unf_ext (0.0.
|
82
|
+
unf_ext (0.0.8)
|
49
83
|
|
50
84
|
PLATFORMS
|
51
85
|
ruby
|
52
86
|
|
53
87
|
DEPENDENCIES
|
88
|
+
byebug
|
54
89
|
curation!
|
90
|
+
minitest
|
91
|
+
minitest-reporters
|
55
92
|
rake (~> 12.0)
|
56
93
|
|
57
94
|
BUNDLED WITH
|
data/Rakefile
CHANGED
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
@@ -6,7 +6,7 @@ module Curation
|
|
6
6
|
class Error < StandardError; end
|
7
7
|
|
8
8
|
class Page
|
9
|
-
attr_reader :url
|
9
|
+
attr_reader :url
|
10
10
|
|
11
11
|
BLACKLIST = [
|
12
12
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
@@ -20,37 +20,28 @@ module Curation
|
|
20
20
|
]
|
21
21
|
|
22
22
|
def initialize(url, html = nil)
|
23
|
-
@url = url
|
23
|
+
@url = url.to_s.gsub('http://', 'https://')
|
24
24
|
@html = html
|
25
25
|
end
|
26
26
|
|
27
27
|
def title
|
28
|
-
@title
|
28
|
+
@title ||= find_title
|
29
29
|
end
|
30
30
|
|
31
31
|
def image
|
32
|
-
@image
|
33
|
-
|
32
|
+
unless @image
|
33
|
+
@image = find_image
|
34
|
+
@image = @image.to_s.gsub('http://', 'https://')
|
35
|
+
end
|
34
36
|
@image
|
35
37
|
end
|
36
38
|
|
37
39
|
def text
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
end
|
44
|
-
end
|
45
|
-
h = nokogiri.dup
|
46
|
-
BLACKLIST.each do |tag|
|
47
|
-
h.css(tag).remove
|
48
|
-
end
|
49
|
-
nodes = h.css('p')
|
50
|
-
nodes.xpath('//style').remove
|
51
|
-
text = nodes.to_html
|
52
|
-
text.gsub!('<br><br>', '<br>')
|
53
|
-
text
|
40
|
+
@text ||= find_text
|
41
|
+
end
|
42
|
+
|
43
|
+
def date
|
44
|
+
@date ||= find_date
|
54
45
|
end
|
55
46
|
|
56
47
|
protected
|
@@ -61,13 +52,17 @@ module Curation
|
|
61
52
|
return ld['headline'] if ld.has_key? 'headline'
|
62
53
|
end
|
63
54
|
end
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
55
|
+
begin
|
56
|
+
[
|
57
|
+
metainspector.best_title,
|
58
|
+
metainspector.title,
|
59
|
+
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
60
|
+
nokogiri.css('title')&.first&.inner_text
|
61
|
+
].each do |possibility|
|
62
|
+
return possibility unless possibility.to_s.empty?
|
63
|
+
end
|
64
|
+
rescue
|
65
|
+
puts 'Curation::Page find_title error'
|
71
66
|
end
|
72
67
|
return ''
|
73
68
|
end
|
@@ -87,48 +82,140 @@ module Curation
|
|
87
82
|
end
|
88
83
|
end
|
89
84
|
end
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
85
|
+
begin
|
86
|
+
[
|
87
|
+
metainspector.images.best,
|
88
|
+
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
89
|
+
].each do |possibility|
|
90
|
+
return possibility unless possibility.to_s.empty?
|
91
|
+
end
|
92
|
+
rescue
|
93
|
+
puts 'Curation::Page find_image error'
|
95
94
|
end
|
96
95
|
return ''
|
97
96
|
end
|
98
97
|
|
98
|
+
def find_text
|
99
|
+
if json_ld.any?
|
100
|
+
json_ld.each do |ld|
|
101
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
102
|
+
return ld['text'] if ld.has_key? 'text'
|
103
|
+
return ld['articleBody'] if ld.has_key? 'articleBody'
|
104
|
+
end
|
105
|
+
end
|
106
|
+
h = nokogiri.dup
|
107
|
+
BLACKLIST.each do |tag|
|
108
|
+
h.css(tag).remove
|
109
|
+
end
|
110
|
+
nodes = h.css('p')
|
111
|
+
nodes.xpath('//style').remove
|
112
|
+
text = nodes.to_html
|
113
|
+
text.gsub!('<br><br>', '<br>')
|
114
|
+
text
|
115
|
+
end
|
116
|
+
|
117
|
+
def find_date
|
118
|
+
if json_ld.any?
|
119
|
+
json_ld.each do |ld|
|
120
|
+
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
121
|
+
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
122
|
+
end
|
123
|
+
end
|
124
|
+
return Date.parse metatags['date'] rescue nil
|
125
|
+
return Date.parse metatags['pubdate'] rescue nil
|
126
|
+
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
127
|
+
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
128
|
+
chunks = html.split('DisplayDate')
|
129
|
+
if chunks.count > 1
|
130
|
+
value = chunks[1]
|
131
|
+
value = value.split(',').first
|
132
|
+
value = value.gsub('"', '')
|
133
|
+
value = value[1..-1] if value[0] == ':'
|
134
|
+
return Date.parse value rescue nil
|
135
|
+
end
|
136
|
+
begin
|
137
|
+
value = nokogiri.css('.postDate').first
|
138
|
+
value = value.inner_text
|
139
|
+
value = value.gsub(' — ', '')
|
140
|
+
return Date.parse value
|
141
|
+
rescue
|
142
|
+
end
|
143
|
+
begin
|
144
|
+
value = nokogiri.css('.gta_post_date').first
|
145
|
+
value = value.inner_text
|
146
|
+
return Date.parse value
|
147
|
+
rescue
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
99
153
|
def json_ld
|
100
|
-
unless @json_ld
|
154
|
+
unless defined?(@json_ld)
|
101
155
|
@json_ld = []
|
102
156
|
begin
|
103
157
|
options = nokogiri.css('[type="application/ld+json"]')
|
104
158
|
options.each do |option|
|
159
|
+
# require 'byebug'; byebug
|
105
160
|
string = option.inner_text
|
106
161
|
hash = JSON.parse(string)
|
107
162
|
@json_ld << hash
|
108
163
|
end
|
164
|
+
# Some sites have tables in tables
|
165
|
+
@json_ld.flatten!
|
109
166
|
rescue
|
110
|
-
puts
|
167
|
+
puts 'Curation::Page json_ld error'
|
111
168
|
end
|
112
169
|
end
|
113
170
|
@json_ld
|
114
171
|
end
|
115
172
|
|
173
|
+
def file
|
174
|
+
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
175
|
+
rescue
|
176
|
+
puts "Curation::Page file error with url #{url}"
|
177
|
+
end
|
178
|
+
|
116
179
|
def html
|
117
|
-
@html
|
180
|
+
unless @html
|
181
|
+
file.rewind
|
182
|
+
@html = file.read
|
183
|
+
file.rewind
|
184
|
+
end
|
185
|
+
@html
|
118
186
|
rescue
|
119
|
-
puts "
|
187
|
+
puts "Curation::Page html error"
|
120
188
|
end
|
121
189
|
|
122
190
|
def nokogiri
|
123
|
-
@nokogiri
|
191
|
+
unless @nokogiri
|
192
|
+
if file.nil?
|
193
|
+
@nokogiri = metainspector.parsed
|
194
|
+
else
|
195
|
+
file.rewind
|
196
|
+
@nokogiri = Nokogiri::HTML file
|
197
|
+
file.rewind
|
198
|
+
end
|
199
|
+
end
|
200
|
+
@nokogiri
|
124
201
|
rescue
|
125
|
-
puts
|
202
|
+
puts 'Curation::Page nokogiri error'
|
126
203
|
end
|
127
204
|
|
128
205
|
def metainspector
|
129
|
-
@metainspector
|
206
|
+
unless @metainspector
|
207
|
+
@metainspector = html.nil? ? MetaInspector.new(url)
|
208
|
+
: MetaInspector.new(url, document: html)
|
209
|
+
end
|
210
|
+
@metainspector
|
211
|
+
rescue
|
212
|
+
puts 'Curation::Page metainspector error'
|
213
|
+
end
|
214
|
+
|
215
|
+
def metatags
|
216
|
+
@metatags ||= metainspector.meta_tag['name']
|
130
217
|
rescue
|
131
|
-
puts
|
218
|
+
puts 'Curation::Page metatags error'
|
132
219
|
end
|
133
220
|
end
|
134
221
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.
|
4
|
+
version: '1.8'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|
@@ -65,7 +65,7 @@ licenses:
|
|
65
65
|
metadata:
|
66
66
|
homepage_uri: https://github.com/arnaudlevy/curation
|
67
67
|
source_code_uri: https://github.com/arnaudlevy/curation
|
68
|
-
post_install_message:
|
68
|
+
post_install_message:
|
69
69
|
rdoc_options: []
|
70
70
|
require_paths:
|
71
71
|
- lib
|
@@ -80,8 +80,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
80
80
|
- !ruby/object:Gem::Version
|
81
81
|
version: '0'
|
82
82
|
requirements: []
|
83
|
-
rubygems_version: 3.
|
84
|
-
signing_key:
|
83
|
+
rubygems_version: 3.1.6
|
84
|
+
signing_key:
|
85
85
|
specification_version: 4
|
86
86
|
summary: Curation of content
|
87
87
|
test_files: []
|