curation 1.11 → 2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -5
- data/LICENSE.txt +1 -1
- data/README.md +1 -1
- data/curation.gemspec +2 -2
- data/lib/curation/finders/image.rb +51 -0
- data/lib/curation/finders/publication_date.rb +43 -0
- data/lib/curation/finders/text.rb +78 -0
- data/lib/curation/finders/title.rb +57 -0
- data/lib/curation/tools/jsonld.rb +26 -0
- data/lib/curation/tools/metainspector.rb +18 -0
- data/lib/curation/tools/nokogiri.rb +17 -0
- data/lib/curation/tools/raw.rb +19 -0
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +20 -257
- metadata +14 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8bfaa5cfe719f5a5f5c9f153a358e74f11eb4217f4bbddca880b8f45829fc913
|
4
|
+
data.tar.gz: 4abb219144b696da0f33ab12b251036bd67c4ca7250a4576febdcaaa8e409dc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d19e2a775ea069a3e6b0dc50b66f30d52f0c4e1e101cea2c9f0dde5cedde611043cf4267a4764994be8105e422919bf708f850935d531d9bb8f907025393a05
|
7
|
+
data.tar.gz: 4f0dd67b073cc2003cf798603f7221ba32977645180b671bcde0b1464fc53085cf9a076c2f30f4ce3775a8524246386d576e1bd0d1021c8388db58d13c16068f
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (1.11)
|
4
|
+
curation (2.0)
|
5
5
|
htmlentities
|
6
6
|
metainspector
|
7
7
|
nokogiri
|
@@ -12,11 +12,13 @@ GEM
|
|
12
12
|
addressable (2.8.5)
|
13
13
|
public_suffix (>= 2.0.2, < 6.0)
|
14
14
|
ansi (1.5.0)
|
15
|
+
base64 (0.2.0)
|
15
16
|
builder (3.2.4)
|
16
17
|
byebug (11.1.3)
|
17
18
|
domain_name (0.5.20190701)
|
18
19
|
unf (>= 0.0.5, < 1.0.0)
|
19
|
-
faraday (2.7.
|
20
|
+
faraday (2.7.11)
|
21
|
+
base64
|
20
22
|
faraday-net_http (>= 2.0, < 3.1)
|
21
23
|
ruby2_keywords (>= 0.0.4)
|
22
24
|
faraday-cookie_jar (0.0.7)
|
@@ -50,7 +52,7 @@ GEM
|
|
50
52
|
fastimage (~> 2.2)
|
51
53
|
nesty (~> 1.0)
|
52
54
|
nokogiri (~> 1.13)
|
53
|
-
minitest (5.
|
55
|
+
minitest (5.20.0)
|
54
56
|
minitest-reporters (1.6.1)
|
55
57
|
ansi
|
56
58
|
builder
|
@@ -60,13 +62,13 @@ GEM
|
|
60
62
|
nokogiri (1.15.4-x86_64-darwin)
|
61
63
|
racc (~> 1.4)
|
62
64
|
public_suffix (5.0.3)
|
63
|
-
racc (1.7.
|
65
|
+
racc (1.7.3)
|
64
66
|
rake (12.3.3)
|
65
67
|
ruby-progressbar (1.13.0)
|
66
68
|
ruby2_keywords (0.0.5)
|
67
69
|
unf (0.1.4)
|
68
70
|
unf_ext
|
69
|
-
unf_ext (0.0.
|
71
|
+
unf_ext (0.0.9)
|
70
72
|
zlib (2.1.1)
|
71
73
|
|
72
74
|
PLATFORMS
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -37,7 +37,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
37
37
|
|
38
38
|
## Contributing
|
39
39
|
|
40
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
40
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/noesya/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/noesya/curation/blob/master/CODE_OF_CONDUCT.md).
|
41
41
|
|
42
42
|
|
43
43
|
## License
|
data/curation.gemspec
CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
|
|
8
8
|
|
9
9
|
spec.summary = 'Curation of content'
|
10
10
|
spec.description = %q{When you build content curation tools, you need to extract the content of pages (title, text, image...). This requires different strategies and some fine tuning to work efficiently.}
|
11
|
-
spec.homepage = "https://github.com/
|
11
|
+
spec.homepage = "https://github.com/noesya/curation"
|
12
12
|
spec.license = "MIT"
|
13
13
|
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
14
|
|
15
15
|
spec.metadata["homepage_uri"] = spec.homepage
|
16
|
-
spec.metadata["source_code_uri"] = "https://github.com/
|
16
|
+
spec.metadata["source_code_uri"] = "https://github.com/noesya/curation"
|
17
17
|
|
18
18
|
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
19
19
|
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Finder mixin that locates the main image of a page.
#
# The including class must provide +json_ld+, +metainspector+, +nokogiri+,
# +url+ and +log+.
module Image

  # Returns the URL of the page's main image, with any "http://" rewritten to
  # "https://". The result is memoized; an empty String means no image was found.
  def image
    @image ||= find_image.to_s.gsub('http://', 'https://')
  end

  protected

  # Tries each strategy in turn (JSON-LD, MetaInspector, og:image meta tag) and
  # returns the first non-empty candidate, or '' when every strategy fails.
  # Strategies are evaluated lazily so a failure in one cannot discard the
  # result of another.
  def find_image
    log "Curation::Page find_image #{url}"
    find_image_with_json_ld ||
      find_image_with_metainspector ||
      find_image_with_nokogiri ||
      ''
  end

  # Returns the first usable "image" value found in the JSON-LD entries, or nil.
  def find_image_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      # Entries that are not JSON objects cannot carry an "image" key.
      next unless ld.is_a?(Hash) && ld.key?('image')
      candidate = image_from_json_ld(ld['image'])
      return candidate unless candidate.to_s.empty?
    end
    nil
  end

  # Extracts a URL from a JSON-LD "image" value, which may be a String, a Hash
  # with a "url" key, or an Array whose first element is one of those.
  # Returns nil for any other shape.
  def image_from_json_ld(image_data)
    from_array = image_data.is_a?(Array)
    image_data = image_data.first if from_array
    case image_data
    when String
      log "Curation::Page find_image json_ld #{from_array ? 'array' : 'string'}"
      image_data
    when Hash
      log "Curation::Page find_image json_ld #{from_array ? 'array url' : 'url'}"
      image_data['url']
    end
  end

  # Returns MetaInspector's best image guess, or nil when absent or on error.
  def find_image_with_metainspector
    best = metainspector.images.best
    best unless best.to_s.empty?
  rescue StandardError
    log 'Curation::Page find_image error'
    nil
  end

  # Returns the content of the first [property="og:image"] node, or nil when
  # the node is missing, has no content, or the document cannot be queried.
  def find_image_with_nokogiri
    node = nokogiri.css('[property="og:image"]').first
    content = node && node['content']
    content unless content.to_s.empty?
  rescue StandardError
    log 'Curation::Page find_image error'
    nil
  end

end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'date'

# Finder mixin that locates the publication date of a page.
#
# The including class must provide +json_ld+, +metatags+, +nokogiri+ and +html+.
module PublicationDate

  # JSON-LD @type values whose "datePublished" is trusted.
  ARTICLE_JSON_LD_TYPES = %w[NewsArticle ReportageNewsArticle].freeze

  # <meta property="..."> names inspected, in priority order.
  META_DATE_PROPERTIES = %w[
    article:published article:published_time og:article:published_time
  ].freeze

  # Returns the publication date as a Date, or nil when none can be found.
  # A found date is memoized.
  def date
    @date ||= find_date
  end

  protected

  # Runs every strategy in priority order and returns the first Date obtained.
  # A value that is missing or cannot be parsed makes the strategy yield nil,
  # so the next one gets a chance instead of the whole lookup raising.
  def find_date
    find_date_with_json_ld ||
      find_date_with_metatags ||
      find_date_with_meta_properties ||
      find_date_with_display_date ||
      find_date_with_css('.postDate') { |value| value.gsub(' — ', '') } ||
      find_date_with_css('.gta_post_date')
  end

  # Returns the first parsable "datePublished" among article-typed JSON-LD
  # entries, or nil.
  def find_date_with_json_ld
    json_ld.each do |ld|
      next unless ld.is_a?(Hash)
      # "@type" may be a single String or a list of Strings.
      next if (Array(ld['@type']) & ARTICLE_JSON_LD_TYPES).empty?
      found = parse_date(ld['datePublished'])
      return found if found
    end
    nil
  end

  # Looks at the "date" then "pubdate" entries of +metatags+.
  def find_date_with_metatags
    tags = metatags
    return nil unless tags.respond_to?(:[])
    parse_date(tags['date']) || parse_date(tags['pubdate'])
  rescue StandardError
    nil
  end

  # Looks at the content attribute of the known <meta property> tags.
  def find_date_with_meta_properties
    META_DATE_PROPERTIES.each do |property|
      node = nokogiri.css(%(meta[property="#{property}"])).first
      found = parse_date(node && node['content'])
      return found if found
    end
    nil
  rescue StandardError
    nil
  end

  # Looks for a value following the literal text "DisplayDate" in the raw HTML:
  # takes what follows up to the first comma, drops double quotes and a leading
  # colon, then parses the remainder.
  def find_date_with_display_date
    chunks = html.to_s.split('DisplayDate')
    return nil unless chunks.count > 1
    value = chunks[1].split(',').first.to_s.delete('"')
    value = value[1..-1] if value[0] == ':'
    parse_date(value)
  end

  # Parses the inner text of the first node matching +selector+. An optional
  # block may transform the text before parsing.
  def find_date_with_css(selector)
    node = nokogiri.css(selector).first
    return nil if node.nil?
    value = node.inner_text
    value = yield(value) if block_given?
    parse_date(value)
  rescue StandardError
    nil
  end

  # Returns Date.parse(value), or nil when +value+ is not a String or is not
  # a recognisable date.
  def parse_date(value)
    return nil unless value.is_a?(String)
    Date.parse(value)
  rescue StandardError
    nil
  end

end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# Finder mixin that extracts the main text of a page.
#
# The including class must provide +json_ld+ and +nokogiri+; HTMLEntities must
# be loaded.
module Text

  # CSS selectors removed from the document before paragraphs are collected.
  BLACKLIST = [
    'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
    '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
    '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
    '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
    '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
    '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
    '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
    '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
  ].freeze

  # Character pairs that appear when UTF-8 bytes were decoded as ISO-8859-1 and
  # encoded again (double encoding). Written as escapes so the markers survive
  # any re-encoding of this source file.
  DOUBLE_ENCODING_MARKERS = [
    "\u00C3\u00A9", # e acute read as Latin-1
    "\u00C3\u00A8", # e grave read as Latin-1
    "\u00C3\u00AE", # i circumflex read as Latin-1
    "\u00C3\u00AA"  # e circumflex read as Latin-1
  ].freeze

  # Returns the page text as a String (memoized).
  def text
    @text ||= find_text
  end

  protected

  # Takes the JSON-LD article text when available, otherwise the paragraphs
  # found in the document, collapses doubled <br> tags and repairs
  # double-encoded characters.
  def find_text
    text = find_text_with_json_ld || find_text_with_nokogiri
    # Non-destructive gsub: the previous dup.gsub! discarded its result.
    text = text.to_s.gsub('<br><br>', '<br>')
    clean_encoding text
  end

  # Returns the "text" (preferred) or "articleBody" of the first
  # NewsArticle / ReportageNewsArticle JSON-LD entry that has one, else nil.
  def find_text_with_json_ld
    json_ld.each do |ld|
      next unless ld.is_a?(Hash)
      next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
      return ld['text'] if ld.key? 'text'
      return ld['articleBody'] if ld.key? 'articleBody'
    end
    nil
  end

  # Returns the HTML of every <p> left after blacklisted nodes are removed
  # from a copy of the document. When no paragraph survives, falls back to the
  # plain text of the untouched document.
  def find_text_with_nokogiri
    h = nokogiri.dup
    h.xpath('//style').remove
    BLACKLIST.each do |tag|
      h.css(tag).remove
    end
    nodes = h.css('p')
    if nodes.any?
      nodes.to_html
    else
      # Cleanup was too hard, let's try softer
      nokogiri.text
    end
  end

  # Repairs double-encoded text, turning the two-character sequence
  # U+00C3 U+00A9 back into a single U+00E9, for example.
  #
  # Detection runs on the entity-decoded text. When a marker is present, the
  # decoded text is converted to ISO-8859-1 bytes and reinterpreted as UTF-8;
  # otherwise the original +text+ is returned unchanged.
  def clean_encoding(text)
    clean_text = HTMLEntities.new.decode text
    double_encoding = DOUBLE_ENCODING_MARKERS.any? { |marker| clean_text.include? marker }
    return text unless double_encoding
    clean_text.encode('iso-8859-1', undef: :replace)
              .force_encoding('utf-8')
  end

end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# Finder mixin that locates the title of a page.
#
# The including class must provide +json_ld+, +metainspector+, +nokogiri+ and
# +log+.
module Title

  # Returns the page title with surrounding whitespace removed and inner
  # whitespace runs collapsed to one space. Memoized; '' when nothing is found.
  def title
    @title ||= find_title.to_s.strip.gsub(/\s+/, ' ')
  end

  protected

  # Tries JSON-LD, then MetaInspector, then the raw document.
  def find_title
    find_title_with_json_ld ||
      find_title_with_metainspector ||
      find_title_with_nokogiri ||
      ''
  end

  # Returns the first non-blank String "headline" among the JSON-LD entries,
  # or nil.
  def find_title_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      next unless ld.is_a?(Hash)
      headline = ld['headline']
      return headline if headline.is_a?(String) && !headline.strip.empty?
    end
    nil
  end

  # Chooses between MetaInspector's best_title and title.
  #
  # A tag such as <meta property="title" content="Run 0" /> makes MetaInspector
  # believe it is the page title. Since the <title> normally contains the best
  # title (often with extra site information), the best title is only trusted
  # when it is contained in the title; otherwise the title itself is used.
  # Returns nil when the title is blank.
  #
  # Blank checks use plain Ruby so the method works without ActiveSupport.
  def find_title_with_metainspector
    best_title = metainspector.best_title.to_s
    page_title = metainspector.title.to_s
    return nil if page_title.strip.empty?
    return best_title if !best_title.strip.empty? && page_title.include?(best_title)
    page_title
  end

  # Returns the text of the first [itemprop="headline"] node, else of the first
  # <title> node, else nil. Also returns nil when the document cannot be queried.
  def find_title_with_nokogiri
    ['[itemprop="headline"]', 'title'].each do |selector|
      candidate = nokogiri.css(selector)&.first&.inner_text
      return candidate unless candidate.to_s.empty?
    end
    # Explicit nil: Array#each would otherwise return the (truthy) selector list.
    nil
  rescue StandardError
    log 'Curation::Page find_title_with_nokogiri error'
    nil
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'json'

# Tool mixin that exposes the JSON-LD data embedded in a page.
#
# The including class must provide +nokogiri+ and +log+.
module Jsonld

  # Returns an Array of Hashes, one per JSON-LD object found in the nodes
  # matching [type="application/ld+json"]. Memoized; an empty Array when the
  # document has none or cannot be queried.
  def json_ld
    return @json_ld if defined?(@json_ld)
    @json_ld = []
    begin
      scripts = nokogiri.css('[type="application/ld+json"]')
      parsed = scripts.map { |script| json_ld_from_object(script) }
      # Some sites have tables in tables, hence the flatten. Only JSON objects
      # are kept: consumers query entries by key, and valid JSON such as null
      # or a bare string would otherwise make them raise.
      @json_ld = parsed.flatten.grep(Hash)
    rescue StandardError
      log 'Curation::Page json_ld error'
    end
    @json_ld
  end

  # Parses the inner text of +object+ as JSON. Returns an empty Hash when the
  # text is not valid JSON.
  def json_ld_from_object(object)
    JSON.parse object.inner_text
  rescue StandardError
    {}
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# Tool mixin giving access to a MetaInspector page and its meta tags.
#
# Relies on +url+, +html+ and +log+ from the including class.
module Metainspector

  # Returns the memoized MetaInspector page. It is built from +url+ alone when
  # +html+ is nil, and from +url+ plus the +html+ document otherwise.
  # On error, the message is logged and the result of +log+ is returned.
  def metainspector
    return @metainspector if @metainspector
    document = html
    if document.nil?
      @metainspector = MetaInspector.new(url)
    else
      @metainspector = MetaInspector.new(url, document: document)
    end
    @metainspector
  rescue
    log 'Curation::Page metainspector error'
  end

  # Returns the memoized 'name' entry of the page's meta_tag collection.
  # On error, the message is logged and the result of +log+ is returned.
  def metatags
    return @metatags if @metatags
    @metatags = metainspector.meta_tag['name']
  rescue
    log 'Curation::Page metatags error'
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Tool mixin exposing the parsed HTML document.
#
# Relies on +file+, +metainspector+ and +log+ from the including class.
#
# NOTE(review): this module has the same constant name as the namespace used
# below (Nokogiri::HTML), so it appears to reopen that namespace rather than
# define a separate one. Confirm this is intended.
module Nokogiri

  # Returns the memoized parsed document. When +file+ is nil the document
  # comes from metainspector.parsed; otherwise +file+ is rewound, parsed as
  # HTML and rewound again.
  # On error, the message is logged and the result of +log+ is returned.
  def nokogiri
    return @nokogiri if @nokogiri
    source = file
    if source.nil?
      @nokogiri = metainspector.parsed
    else
      source.rewind
      @nokogiri = Nokogiri::HTML source
      source.rewind
    end
    @nokogiri
  rescue
    log 'Curation::Page nokogiri error'
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Tool mixin giving access to the raw page content.
#
# Relies on +url+ and +log+ from the including class, and on open-uri.
module Raw

  # Returns the memoized IO obtained by opening +url+ with a "Mozilla/5.0"
  # User-Agent header.
  # On error, the message is logged and the result of +log+ is returned.
  def file
    @file ||= URI.open(url, 'User-Agent' => "Mozilla/5.0")
  rescue
    log "Curation::Page file error with url #{url}"
  end

  # Returns the memoized HTML source. When @html is not already set, +file+ is
  # rewound, read in full, then rewound again.
  # On error, the message is logged and the result of +log+ is returned.
  def html
    return @html if @html
    handle = file
    handle.rewind
    @html = handle.read
    handle.rewind
    @html
  rescue
    log "Curation::Page html error"
  end
end
|
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
@@ -1,4 +1,12 @@
|
|
1
1
|
require "curation/version"
|
2
|
+
require "curation/tools/raw"
|
3
|
+
require "curation/tools/nokogiri"
|
4
|
+
require "curation/tools/jsonld"
|
5
|
+
require "curation/tools/metainspector"
|
6
|
+
require "curation/finders/image"
|
7
|
+
require "curation/finders/publication_date"
|
8
|
+
require "curation/finders/text"
|
9
|
+
require "curation/finders/title"
|
2
10
|
require "metainspector"
|
3
11
|
require "open-uri"
|
4
12
|
require "htmlentities"
|
@@ -7,274 +15,29 @@ module Curation
|
|
7
15
|
class Error < StandardError; end
|
8
16
|
|
9
17
|
class Page
|
18
|
+
# Tools
|
19
|
+
include Raw
|
20
|
+
include Jsonld
|
21
|
+
include Metainspector
|
22
|
+
include Nokogiri
|
23
|
+
|
24
|
+
# Finders
|
25
|
+
include Title
|
26
|
+
include Image
|
27
|
+
include PublicationDate
|
28
|
+
include Text
|
29
|
+
|
10
30
|
attr_reader :url
|
11
31
|
attr_accessor :verbose
|
12
32
|
|
13
|
-
BLACKLIST = [
|
14
|
-
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
15
|
-
'.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
|
16
|
-
'#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
|
17
|
-
'.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
|
18
|
-
'.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
|
19
|
-
'.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
|
20
|
-
'[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
|
21
|
-
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
22
|
-
]
|
23
|
-
|
24
33
|
def initialize(url, html = nil)
|
25
34
|
@url = url.to_s.gsub('http://', 'https://')
|
26
35
|
@html = html
|
27
36
|
@verbose = false
|
28
37
|
end
|
29
38
|
|
30
|
-
def title
|
31
|
-
@title ||= find_title
|
32
|
-
end
|
33
|
-
|
34
|
-
def image
|
35
|
-
unless @image
|
36
|
-
@image = find_image
|
37
|
-
@image = @image.to_s.gsub('http://', 'https://')
|
38
|
-
end
|
39
|
-
@image
|
40
|
-
end
|
41
|
-
|
42
|
-
def text
|
43
|
-
# require 'byebug'; byebug
|
44
|
-
@text ||= find_text
|
45
|
-
end
|
46
|
-
|
47
|
-
def date
|
48
|
-
@date ||= find_date
|
49
|
-
end
|
50
|
-
|
51
39
|
protected
|
52
40
|
|
53
|
-
def find_title
|
54
|
-
if json_ld.any?
|
55
|
-
json_ld.each do |ld|
|
56
|
-
# require 'byebug'; byebug
|
57
|
-
ld = ld.first if ld.is_a?(Array)
|
58
|
-
return ld['headline'] if ld.has_key? 'headline'
|
59
|
-
end
|
60
|
-
end
|
61
|
-
begin
|
62
|
-
[
|
63
|
-
metainspector.best_title,
|
64
|
-
metainspector.title,
|
65
|
-
nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
|
66
|
-
nokogiri.css('title')&.first&.inner_text
|
67
|
-
].each do |possibility|
|
68
|
-
return possibility unless possibility.to_s.empty?
|
69
|
-
end
|
70
|
-
rescue
|
71
|
-
log 'Curation::Page find_title error'
|
72
|
-
end
|
73
|
-
return ''
|
74
|
-
end
|
75
|
-
|
76
|
-
def find_image
|
77
|
-
log "Curation::Page find_image #{url}"
|
78
|
-
if json_ld.any?
|
79
|
-
json_ld.each do |ld|
|
80
|
-
ld = ld.first if ld.is_a?(Array)
|
81
|
-
if ld.has_key? 'image'
|
82
|
-
image_data = ld['image']
|
83
|
-
if image_data.is_a? String
|
84
|
-
log "Curation::Page find_image json_ld string"
|
85
|
-
return image_data
|
86
|
-
end
|
87
|
-
if image_data.is_a? Array
|
88
|
-
first = image_data.first
|
89
|
-
if first.is_a? String
|
90
|
-
log "Curation::Page find_image json_ld array"
|
91
|
-
return first
|
92
|
-
end
|
93
|
-
if first.is_a? Hash
|
94
|
-
log "Curation::Page find_image json_ld array url"
|
95
|
-
return first['url']
|
96
|
-
end
|
97
|
-
end
|
98
|
-
if image_data.is_a? Hash
|
99
|
-
log "Curation::Page find_image json_ld url"
|
100
|
-
return image_data['url']
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
begin
|
106
|
-
[
|
107
|
-
metainspector.images.best,
|
108
|
-
nokogiri.css('[property="og:image"]').first&.attributes['content'].value
|
109
|
-
].each do |possibility|
|
110
|
-
return possibility unless possibility.to_s.empty?
|
111
|
-
end
|
112
|
-
rescue
|
113
|
-
puts 'Curation::Page find_image error'
|
114
|
-
end
|
115
|
-
return ''
|
116
|
-
end
|
117
|
-
|
118
|
-
def find_text
|
119
|
-
text = find_text_with_json_ld || find_text_with_nokogiri
|
120
|
-
text.to_s.gsub!('<br><br>', '<br>')
|
121
|
-
# require 'byebug'; byebug
|
122
|
-
text = clean_encoding text
|
123
|
-
text
|
124
|
-
end
|
125
|
-
|
126
|
-
def find_text_with_json_ld
|
127
|
-
if json_ld.any?
|
128
|
-
json_ld.each do |ld|
|
129
|
-
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
130
|
-
return ld['text'] if ld.has_key? 'text'
|
131
|
-
return ld['articleBody'] if ld.has_key? 'articleBody'
|
132
|
-
end
|
133
|
-
end
|
134
|
-
nil
|
135
|
-
end
|
136
|
-
|
137
|
-
def find_text_with_nokogiri
|
138
|
-
h = nokogiri.dup
|
139
|
-
BLACKLIST.each do |tag|
|
140
|
-
h.css(tag).remove
|
141
|
-
end
|
142
|
-
nodes = h.css('p')
|
143
|
-
nodes.xpath('//style').remove
|
144
|
-
text = nodes.to_html
|
145
|
-
text
|
146
|
-
end
|
147
|
-
|
148
|
-
def find_date
|
149
|
-
if json_ld.any?
|
150
|
-
json_ld.each do |ld|
|
151
|
-
next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
|
152
|
-
return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
|
153
|
-
end
|
154
|
-
end
|
155
|
-
return Date.parse metatags['date'] rescue nil
|
156
|
-
return Date.parse metatags['pubdate'] rescue nil
|
157
|
-
return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
|
158
|
-
return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
|
159
|
-
return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
|
160
|
-
chunks = html.split('DisplayDate')
|
161
|
-
if chunks.count > 1
|
162
|
-
value = chunks[1]
|
163
|
-
value = value.split(',').first
|
164
|
-
value = value.gsub('"', '')
|
165
|
-
value = value[1..-1] if value[0] == ':'
|
166
|
-
return Date.parse value rescue nil
|
167
|
-
end
|
168
|
-
begin
|
169
|
-
value = nokogiri.css('.postDate').first
|
170
|
-
value = value.inner_text
|
171
|
-
value = value.gsub(' — ', '')
|
172
|
-
return Date.parse value
|
173
|
-
rescue
|
174
|
-
end
|
175
|
-
begin
|
176
|
-
value = nokogiri.css('.gta_post_date').first
|
177
|
-
value = value.inner_text
|
178
|
-
return Date.parse value
|
179
|
-
rescue
|
180
|
-
end
|
181
|
-
end
|
182
|
-
|
183
|
-
private
|
184
|
-
|
185
|
-
def json_ld
|
186
|
-
unless defined?(@json_ld)
|
187
|
-
@json_ld = []
|
188
|
-
begin
|
189
|
-
options = nokogiri.css('[type="application/ld+json"]')
|
190
|
-
options.each do |option|
|
191
|
-
@json_ld << json_ld_from_object(option)
|
192
|
-
end
|
193
|
-
# Some sites have tables in tables
|
194
|
-
@json_ld.flatten!
|
195
|
-
# require 'byebug'; byebug
|
196
|
-
rescue
|
197
|
-
log 'Curation::Page json_ld error'
|
198
|
-
end
|
199
|
-
end
|
200
|
-
@json_ld
|
201
|
-
end
|
202
|
-
|
203
|
-
def json_ld_from_object(object)
|
204
|
-
JSON.parse object.inner_text
|
205
|
-
rescue
|
206
|
-
{}
|
207
|
-
end
|
208
|
-
|
209
|
-
def file
|
210
|
-
@file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
|
211
|
-
rescue
|
212
|
-
log "Curation::Page file error with url #{url}"
|
213
|
-
end
|
214
|
-
|
215
|
-
def html
|
216
|
-
unless @html
|
217
|
-
file.rewind
|
218
|
-
@html = file.read
|
219
|
-
file.rewind
|
220
|
-
end
|
221
|
-
@html
|
222
|
-
rescue
|
223
|
-
log "Curation::Page html error"
|
224
|
-
end
|
225
|
-
|
226
|
-
def nokogiri
|
227
|
-
unless @nokogiri
|
228
|
-
if file.nil?
|
229
|
-
@nokogiri = metainspector.parsed
|
230
|
-
else
|
231
|
-
file.rewind
|
232
|
-
@nokogiri = Nokogiri::HTML file
|
233
|
-
file.rewind
|
234
|
-
end
|
235
|
-
end
|
236
|
-
@nokogiri
|
237
|
-
rescue
|
238
|
-
log 'Curation::Page nokogiri error'
|
239
|
-
end
|
240
|
-
|
241
|
-
def metainspector
|
242
|
-
unless @metainspector
|
243
|
-
@metainspector = html.nil? ? MetaInspector.new(url)
|
244
|
-
: MetaInspector.new(url, document: html)
|
245
|
-
end
|
246
|
-
@metainspector
|
247
|
-
rescue
|
248
|
-
log 'Curation::Page metainspector error'
|
249
|
-
end
|
250
|
-
|
251
|
-
def metatags
|
252
|
-
@metatags ||= metainspector.meta_tag['name']
|
253
|
-
rescue
|
254
|
-
log 'Curation::Page metatags error'
|
255
|
-
end
|
256
|
-
|
257
|
-
# réforme -> réforme
|
258
|
-
def clean_encoding(text)
|
259
|
-
clean_text = HTMLEntities.new.decode text
|
260
|
-
double_encoding = false
|
261
|
-
[
|
262
|
-
'é', # é
|
263
|
-
'è', # è
|
264
|
-
'î', # î
|
265
|
-
'ê', # ê
|
266
|
-
].each do |string|
|
267
|
-
# require 'byebug'; byebug
|
268
|
-
double_encoding = true if clean_text.include? string
|
269
|
-
end
|
270
|
-
if double_encoding
|
271
|
-
clean_text.encode('iso-8859-1', undef: :replace)
|
272
|
-
.force_encoding('utf-8')
|
273
|
-
else
|
274
|
-
text
|
275
|
-
end
|
276
|
-
end
|
277
|
-
|
278
41
|
def log(message)
|
279
42
|
puts message if verbose
|
280
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.11'
|
4
|
+
version: '2.0'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|
@@ -72,13 +72,21 @@ files:
|
|
72
72
|
- bin/setup
|
73
73
|
- curation.gemspec
|
74
74
|
- lib/curation.rb
|
75
|
+
- lib/curation/finders/image.rb
|
76
|
+
- lib/curation/finders/publication_date.rb
|
77
|
+
- lib/curation/finders/text.rb
|
78
|
+
- lib/curation/finders/title.rb
|
79
|
+
- lib/curation/tools/jsonld.rb
|
80
|
+
- lib/curation/tools/metainspector.rb
|
81
|
+
- lib/curation/tools/nokogiri.rb
|
82
|
+
- lib/curation/tools/raw.rb
|
75
83
|
- lib/curation/version.rb
|
76
|
-
homepage: https://github.com/
|
84
|
+
homepage: https://github.com/noesya/curation
|
77
85
|
licenses:
|
78
86
|
- MIT
|
79
87
|
metadata:
|
80
|
-
homepage_uri: https://github.com/
|
81
|
-
source_code_uri: https://github.com/
|
88
|
+
homepage_uri: https://github.com/noesya/curation
|
89
|
+
source_code_uri: https://github.com/noesya/curation
|
82
90
|
post_install_message:
|
83
91
|
rdoc_options: []
|
84
92
|
require_paths:
|
@@ -94,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
94
102
|
- !ruby/object:Gem::Version
|
95
103
|
version: '0'
|
96
104
|
requirements: []
|
97
|
-
rubygems_version: 3.4.
|
105
|
+
rubygems_version: 3.4.10
|
98
106
|
signing_key:
|
99
107
|
specification_version: 4
|
100
108
|
summary: Curation of content
|