curation 2.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/Gemfile.lock +9 -1
- data/curation.gemspec +1 -0
- data/lib/curation/finders/text.rb +34 -22
- data/lib/curation/finders/title.rb +2 -1
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4c4661f04dfa6c3442537bf91ded409452311936ec3e2dedce8fa48ac6ba5d1
|
4
|
+
data.tar.gz: d48f91295be205e59d3028907f1f6dfa9e0d83bc5322834b4b8b21ce824e62a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcc3db9b67f0b91773a1e72626d17c9e3ded02054f6b348bc5264005c09518d568e851f080e780316e32455941ee92146daa35c61c6fc3866806d698a949672
|
7
|
+
data.tar.gz: 0c03a44f6f97cf5f0180db1832989c9e0eadd0e7250d3f44a5c4cd6a10d9f42d437081a53449fe40a2c63b4e4b398eedc763fd29f7e3173bab3ee10fd7ca157a
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (2.0)
|
4
|
+
curation (2.0.2)
|
5
5
|
htmlentities
|
6
6
|
metainspector
|
7
7
|
nokogiri
|
8
|
+
rails-html-sanitizer
|
8
9
|
|
9
10
|
GEM
|
10
11
|
remote: https://rubygems.org/
|
@@ -15,6 +16,7 @@ GEM
|
|
15
16
|
base64 (0.2.0)
|
16
17
|
builder (3.2.4)
|
17
18
|
byebug (11.1.3)
|
19
|
+
crass (1.0.6)
|
18
20
|
domain_name (0.5.20190701)
|
19
21
|
unf (>= 0.0.5, < 1.0.0)
|
20
22
|
faraday (2.7.11)
|
@@ -40,6 +42,9 @@ GEM
|
|
40
42
|
htmlentities (4.3.4)
|
41
43
|
http-cookie (1.0.5)
|
42
44
|
domain_name (~> 0.5)
|
45
|
+
loofah (2.21.4)
|
46
|
+
crass (~> 1.0.2)
|
47
|
+
nokogiri (>= 1.12.0)
|
43
48
|
metainspector (5.15.0)
|
44
49
|
addressable (~> 2.8.4)
|
45
50
|
faraday (~> 2.5)
|
@@ -63,6 +68,9 @@ GEM
|
|
63
68
|
racc (~> 1.4)
|
64
69
|
public_suffix (5.0.3)
|
65
70
|
racc (1.7.3)
|
71
|
+
rails-html-sanitizer (1.6.0)
|
72
|
+
loofah (~> 2.21)
|
73
|
+
nokogiri (~> 1.14)
|
66
74
|
rake (12.3.3)
|
67
75
|
ruby-progressbar (1.13.0)
|
68
76
|
ruby2_keywords (0.0.5)
|
data/curation.gemspec
CHANGED
@@ -1,6 +1,12 @@
|
|
1
1
|
module Text
|
2
2
|
|
3
|
-
|
3
|
+
def text
|
4
|
+
@text ||= find_text_and_clean
|
5
|
+
end
|
6
|
+
|
7
|
+
protected
|
8
|
+
|
9
|
+
BLACKLIST_HARD = [
|
4
10
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
5
11
|
'.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
|
6
12
|
'#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
|
@@ -11,19 +17,23 @@ module Text
|
|
11
17
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
12
18
|
]
|
13
19
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
20
|
+
BLACKLIST_SOFT = [
|
21
|
+
'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
|
22
|
+
]
|
18
23
|
|
19
|
-
|
24
|
+
def find_text_and_clean
|
25
|
+
text = find_text.to_s.dup
|
26
|
+
text = text.gsub('<br><br>', '<br>')
|
27
|
+
text = text.gsub(/\s+/, ' ')
|
28
|
+
text = clean_encoding(text)
|
29
|
+
text = Rails::HTML5::FullSanitizer.new.sanitize(text)
|
30
|
+
text
|
31
|
+
end
|
20
32
|
|
21
33
|
def find_text
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
text = clean_encoding text
|
26
|
-
text
|
34
|
+
find_text_with_json_ld ||
|
35
|
+
find_text_with_nokogiri_hard ||
|
36
|
+
find_text_with_nokogiri_soft
|
27
37
|
end
|
28
38
|
|
29
39
|
def find_text_with_json_ld
|
@@ -34,24 +44,27 @@ module Text
|
|
34
44
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
35
45
|
end
|
36
46
|
end
|
37
|
-
|
47
|
+
false
|
38
48
|
end
|
39
49
|
|
40
|
-
def
|
50
|
+
def find_text_with_nokogiri_hard
|
41
51
|
h = nokogiri.dup
|
42
52
|
h.xpath('//style').remove
|
43
|
-
|
53
|
+
BLACKLIST_HARD.each do |tag|
|
44
54
|
h.css(tag).remove
|
45
55
|
end
|
46
56
|
nodes = h.css('p')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
text = nodes.to_html
|
58
|
+
text.present? ? text : false
|
59
|
+
end
|
60
|
+
|
61
|
+
def find_text_with_nokogiri_soft
|
62
|
+
h = nokogiri.dup
|
63
|
+
h.xpath('//style').remove
|
64
|
+
BLACKLIST_SOFT.each do |tag|
|
65
|
+
h.css(tag).remove
|
54
66
|
end
|
67
|
+
h.text
|
55
68
|
end
|
56
69
|
|
57
70
|
# rÃ©forme -> réforme
|
@@ -64,7 +77,6 @@ module Text
|
|
64
77
|
'Ã®', # î
|
65
78
|
'Ãª', # ê
|
66
79
|
].each do |string|
|
67
|
-
# require 'byebug'; byebug
|
68
80
|
double_encoding = true if clean_text.include? string
|
69
81
|
end
|
70
82
|
if double_encoding
|
@@ -21,7 +21,7 @@ module Title
|
|
21
21
|
return ld['headline'] if ld.has_key? 'headline'
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
false
|
25
25
|
end
|
26
26
|
|
27
27
|
def find_title_with_metainspector
|
@@ -39,6 +39,7 @@ module Title
|
|
39
39
|
elsif metainspector_title.present?
|
40
40
|
return metainspector_title
|
41
41
|
end
|
42
|
+
false
|
42
43
|
end
|
43
44
|
|
44
45
|
def find_title_with_nokogiri
|
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rails-html-sanitizer
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: When you build content curation tools, you need to extract the content
|
56
70
|
of pages (title, text, image...). This requires different strategies and some fine
|
57
71
|
tuning to work efficiently.
|