curation 2.0 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -0
- data/Gemfile.lock +9 -1
- data/curation.gemspec +1 -0
- data/lib/curation/finders/text.rb +34 -22
- data/lib/curation/finders/title.rb +2 -1
- data/lib/curation/version.rb +1 -1
- data/lib/curation.rb +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4c4661f04dfa6c3442537bf91ded409452311936ec3e2dedce8fa48ac6ba5d1
|
4
|
+
data.tar.gz: d48f91295be205e59d3028907f1f6dfa9e0d83bc5322834b4b8b21ce824e62a2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dbcc3db9b67f0b91773a1e72626d17c9e3ded02054f6b348bc5264005c09518d568e851f080e780316e32455941ee92146daa35c61c6fc3866806d698a949672
|
7
|
+
data.tar.gz: 0c03a44f6f97cf5f0180db1832989c9e0eadd0e7250d3f44a5c4cd6a10d9f42d437081a53449fe40a2c63b4e4b398eedc763fd29f7e3173bab3ee10fd7ca157a
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
curation (2.0)
|
4
|
+
curation (2.0.2)
|
5
5
|
htmlentities
|
6
6
|
metainspector
|
7
7
|
nokogiri
|
8
|
+
rails-html-sanitizer
|
8
9
|
|
9
10
|
GEM
|
10
11
|
remote: https://rubygems.org/
|
@@ -15,6 +16,7 @@ GEM
|
|
15
16
|
base64 (0.2.0)
|
16
17
|
builder (3.2.4)
|
17
18
|
byebug (11.1.3)
|
19
|
+
crass (1.0.6)
|
18
20
|
domain_name (0.5.20190701)
|
19
21
|
unf (>= 0.0.5, < 1.0.0)
|
20
22
|
faraday (2.7.11)
|
@@ -40,6 +42,9 @@ GEM
|
|
40
42
|
htmlentities (4.3.4)
|
41
43
|
http-cookie (1.0.5)
|
42
44
|
domain_name (~> 0.5)
|
45
|
+
loofah (2.21.4)
|
46
|
+
crass (~> 1.0.2)
|
47
|
+
nokogiri (>= 1.12.0)
|
43
48
|
metainspector (5.15.0)
|
44
49
|
addressable (~> 2.8.4)
|
45
50
|
faraday (~> 2.5)
|
@@ -63,6 +68,9 @@ GEM
|
|
63
68
|
racc (~> 1.4)
|
64
69
|
public_suffix (5.0.3)
|
65
70
|
racc (1.7.3)
|
71
|
+
rails-html-sanitizer (1.6.0)
|
72
|
+
loofah (~> 2.21)
|
73
|
+
nokogiri (~> 1.14)
|
66
74
|
rake (12.3.3)
|
67
75
|
ruby-progressbar (1.13.0)
|
68
76
|
ruby2_keywords (0.0.5)
|
data/curation.gemspec
CHANGED
data/lib/curation/finders/text.rb
CHANGED
@@ -1,6 +1,12 @@
|
|
1
1
|
module Text
|
2
2
|
|
3
|
-
|
3
|
+
def text
|
4
|
+
@text ||= find_text_and_clean
|
5
|
+
end
|
6
|
+
|
7
|
+
protected
|
8
|
+
|
9
|
+
BLACKLIST_HARD = [
|
4
10
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
5
11
|
'.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
|
6
12
|
'#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
|
@@ -11,19 +17,23 @@ module Text
|
|
11
17
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
12
18
|
]
|
13
19
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
20
|
+
BLACKLIST_SOFT = [
|
21
|
+
'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
|
22
|
+
]
|
18
23
|
|
19
|
-
|
24
|
+
def find_text_and_clean
|
25
|
+
text = find_text.to_s.dup
|
26
|
+
text = text.gsub('<br><br>', '<br>')
|
27
|
+
text = text.gsub(/\s+/, ' ')
|
28
|
+
text = clean_encoding(text)
|
29
|
+
text = Rails::HTML5::FullSanitizer.new.sanitize(text)
|
30
|
+
text
|
31
|
+
end
|
20
32
|
|
21
33
|
def find_text
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
text = clean_encoding text
|
26
|
-
text
|
34
|
+
find_text_with_json_ld ||
|
35
|
+
find_text_with_nokogiri_hard ||
|
36
|
+
find_text_with_nokogiri_soft
|
27
37
|
end
|
28
38
|
|
29
39
|
def find_text_with_json_ld
|
@@ -34,24 +44,27 @@ module Text
|
|
34
44
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
35
45
|
end
|
36
46
|
end
|
37
|
-
|
47
|
+
false
|
38
48
|
end
|
39
49
|
|
40
|
-
def
|
50
|
+
def find_text_with_nokogiri_hard
|
41
51
|
h = nokogiri.dup
|
42
52
|
h.xpath('//style').remove
|
43
|
-
|
53
|
+
BLACKLIST_HARD.each do |tag|
|
44
54
|
h.css(tag).remove
|
45
55
|
end
|
46
56
|
nodes = h.css('p')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
text = nodes.to_html
|
58
|
+
text.present? ? text : false
|
59
|
+
end
|
60
|
+
|
61
|
+
def find_text_with_nokogiri_soft
|
62
|
+
h = nokogiri.dup
|
63
|
+
h.xpath('//style').remove
|
64
|
+
BLACKLIST_SOFT.each do |tag|
|
65
|
+
h.css(tag).remove
|
54
66
|
end
|
67
|
+
h.text
|
55
68
|
end
|
56
69
|
|
57
70
|
# rÃ©forme -> réforme
|
@@ -64,7 +77,6 @@ module Text
|
|
64
77
|
'î', # î
|
65
78
|
'ê', # ê
|
66
79
|
].each do |string|
|
67
|
-
# require 'byebug'; byebug
|
68
80
|
double_encoding = true if clean_text.include? string
|
69
81
|
end
|
70
82
|
if double_encoding
|
data/lib/curation/finders/title.rb
CHANGED
@@ -21,7 +21,7 @@ module Title
|
|
21
21
|
return ld['headline'] if ld.has_key? 'headline'
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
false
|
25
25
|
end
|
26
26
|
|
27
27
|
def find_title_with_metainspector
|
@@ -39,6 +39,7 @@ module Title
|
|
39
39
|
elsif metainspector_title.present?
|
40
40
|
return metainspector_title
|
41
41
|
end
|
42
|
+
false
|
42
43
|
end
|
43
44
|
|
44
45
|
def find_title_with_nokogiri
|
data/lib/curation/version.rb
CHANGED
data/lib/curation.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: curation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arnaud Levy
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: metainspector
|
@@ -52,6 +52,20 @@ dependencies:
|
|
52
52
|
- - ">="
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rails-html-sanitizer
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
55
69
|
description: When you build content curation tools, you need to extract the content
|
56
70
|
of pages (title, text, image...). This requires different strategies and some fine
|
57
71
|
tuning to work efficiently.
|