curation 2.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/curation/finders/text.rb +34 -22
- data/lib/curation/finders/title.rb +2 -1
- data/lib/curation/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
|
4
|
+
data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
|
7
|
+
data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
|
data/Gemfile.lock
CHANGED
data/lib/curation/finders/text.rb
CHANGED
@@ -1,6 +1,12 @@
|
|
1
1
|
module Text
|
2
2
|
|
3
|
-
|
3
|
+
def text
|
4
|
+
@text ||= find_text_and_clean
|
5
|
+
end
|
6
|
+
|
7
|
+
protected
|
8
|
+
|
9
|
+
BLACKLIST_HARD = [
|
4
10
|
'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
|
5
11
|
'.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
|
6
12
|
'#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
|
@@ -11,19 +17,23 @@ module Text
|
|
11
17
|
'[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
|
12
18
|
]
|
13
19
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
end
|
20
|
+
BLACKLIST_SOFT = [
|
21
|
+
'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
|
22
|
+
]
|
18
23
|
|
19
|
-
|
24
|
+
def find_text_and_clean
|
25
|
+
text = find_text.to_s.dup
|
26
|
+
text = text.gsub('<br><br>', '<br>')
|
27
|
+
text = text.gsub(/\s+/, ' ')
|
28
|
+
text = clean_encoding(text)
|
29
|
+
text
|
30
|
+
|
31
|
+
end
|
20
32
|
|
21
33
|
def find_text
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
text = clean_encoding text
|
26
|
-
text
|
34
|
+
find_text_with_json_ld ||
|
35
|
+
find_text_with_nokogiri_hard ||
|
36
|
+
find_text_with_nokogiri_soft
|
27
37
|
end
|
28
38
|
|
29
39
|
def find_text_with_json_ld
|
@@ -34,24 +44,27 @@ module Text
|
|
34
44
|
return ld['articleBody'] if ld.has_key? 'articleBody'
|
35
45
|
end
|
36
46
|
end
|
37
|
-
|
47
|
+
false
|
38
48
|
end
|
39
49
|
|
40
|
-
def
|
50
|
+
def find_text_with_nokogiri_hard
|
41
51
|
h = nokogiri.dup
|
42
52
|
h.xpath('//style').remove
|
43
|
-
|
53
|
+
BLACKLIST_HARD.each do |tag|
|
44
54
|
h.css(tag).remove
|
45
55
|
end
|
46
56
|
nodes = h.css('p')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
57
|
+
text = nodes.to_html
|
58
|
+
text.present? ? text : false
|
59
|
+
end
|
60
|
+
|
61
|
+
def find_text_with_nokogiri_soft
|
62
|
+
h = nokogiri.dup
|
63
|
+
h.xpath('//style').remove
|
64
|
+
BLACKLIST_SOFT.each do |tag|
|
65
|
+
h.css(tag).remove
|
54
66
|
end
|
67
|
+
h.text
|
55
68
|
end
|
56
69
|
|
57
70
|
# rÃ©forme -> réforme
|
@@ -64,7 +77,6 @@ module Text
|
|
64
77
|
'Ã®', # î
|
65
78
|
'Ãª', # ê
|
66
79
|
].each do |string|
|
67
|
-
# require 'byebug'; byebug
|
68
80
|
double_encoding = true if clean_text.include? string
|
69
81
|
end
|
70
82
|
if double_encoding
|
data/lib/curation/finders/title.rb
CHANGED
@@ -21,7 +21,7 @@ module Title
|
|
21
21
|
return ld['headline'] if ld.has_key? 'headline'
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
false
|
25
25
|
end
|
26
26
|
|
27
27
|
def find_title_with_metainspector
|
@@ -39,6 +39,7 @@ module Title
|
|
39
39
|
elsif metainspector_title.present?
|
40
40
|
return metainspector_title
|
41
41
|
end
|
42
|
+
false
|
42
43
|
end
|
43
44
|
|
44
45
|
def find_title_with_nokogiri
|
data/lib/curation/version.rb
CHANGED