curation 2.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation/finders/text.rb +34 -22
- data/lib/curation/finders/title.rb +2 -1
- data/lib/curation/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
|
4
|
+
data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
|
7
|
+
data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
|
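data/Gemfile.lock
CHANGED
(diff content for this file is missing from this listing)
data/lib/curation/finders/text.rb
CHANGED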
@@ -1,6 +1,12 @@
|
|
1
1
|
module Text
|
2
2
|
|
3
|
-
|
3
|
+
def text
|
4
|
+
@text ||= find_text_and_clean
|
5
|
+
end
|
6
|
+
|
7
|
+
protected
|
8
|
+
|
9
|
+
BLACKLIST_HARD = [
|
4
10
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
5
11
|
'.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
|
6
12
|
'#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
|
@@ -11,19 +17,23 @@ module Text
|
|
11
17
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
12
18
|
]
|
13
19
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
20
|
+
BLACKLIST_SOFT = [
|
21
|
+
'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
|
22
|
+
]
|
18
23
|
|
19
|
-
|
24
|
+
def find_text_and_clean
|
25
|
+
text = find_text.to_s.dup
|
26
|
+
text = text.gsub('<br><br>', '<br>')
|
27
|
+
text = text.gsub(/\s+/, ' ')
|
28
|
+
text = clean_encoding(text)
|
29
|
+
text
|
30
|
+
|
31
|
+
end
|
20
32
|
|
21
33
|
def find_text
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
text = clean_encoding text
|
26
|
-
text
|
34
|
+
find_text_with_json_ld ||
|
35
|
+
find_text_with_nokogiri_hard ||
|
36
|
+
find_text_with_nokogiri_soft
|
27
37
|
end
|
28
38
|
|
29
39
|
def find_text_with_json_ld
|
@@ -34,24 +44,27 @@ module Text
|
|
34
44
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
35
45
|
end
|
36
46
|
end
|
37
|
-
|
47
|
+
false
|
38
48
|
end
|
39
49
|
|
40
|
-
def
|
50
|
+
def find_text_with_nokogiri_hard
|
41
51
|
h = nokogiri.dup
|
42
52
|
h.xpath('//style').remove
|
43
|
-
|
53
|
+
BLACKLIST_HARD.each do |tag|
|
44
54
|
h.css(tag).remove
|
45
55
|
end
|
46
56
|
nodes = h.css('p')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
text = nodes.to_html
|
58
|
+
text.present? ? text : false
|
59
|
+
end
|
60
|
+
|
61
|
+
def find_text_with_nokogiri_soft
|
62
|
+
h = nokogiri.dup
|
63
|
+
h.xpath('//style').remove
|
64
|
+
BLACKLIST_SOFT.each do |tag|
|
65
|
+
h.css(tag).remove
|
54
66
|
end
|
67
|
+
h.text
|
55
68
|
end
|
56
69
|
|
57
70
|
# rÃ©forme -> réforme
|
@@ -64,7 +77,6 @@ module Text
|
|
64
77
|
'Ã®', # î
|
65
78
|
'Ãª', # ê
|
66
79
|
].each do |string|
|
67
|
-
# require 'byebug'; byebug
|
68
80
|
double_encoding = true if clean_text.include? string
|
69
81
|
end
|
70
82
|
if double_encoding
|
data/lib/curation/finders/title.rb
CHANGED
@@ -21,7 +21,7 @@ module Title
|
|
21
21
|
return ld['headline'] if ld.has_key? 'headline'
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
false
|
25
25
|
end
|
26
26
|
|
27
27
|
def find_title_with_metainspector
|
@@ -39,6 +39,7 @@ module Title
|
|
39
39
|
elsif metainspector_title.present?
|
40
40
|
return metainspector_title
|
41
41
|
end
|
42
|
+
false
|
42
43
|
end
|
43
44
|
|
44
45
|
def find_title_with_nokogiri
|
data/lib/curation/version.rb
CHANGED