curation 2.0 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8bfaa5cfe719f5a5f5c9f153a358e74f11eb4217f4bbddca880b8f45829fc913
4
- data.tar.gz: 4abb219144b696da0f33ab12b251036bd67c4ca7250a4576febdcaaa8e409dc0
3
+ metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
4
+ data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
5
5
  SHA512:
6
- metadata.gz: 6d19e2a775ea069a3e6b0dc50b66f30d52f0c4e1e101cea2c9f0dde5cedde611043cf4267a4764994be8105e422919bf708f850935d531d9bb8f907025393a05
7
- data.tar.gz: 4f0dd67b073cc2003cf798603f7221ba32977645180b671bcde0b1464fc53085cf9a076c2f30f4ce3775a8524246386d576e1bd0d1021c8388db58d13c16068f
6
+ metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
7
+ data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (2.0)
4
+ curation (2.0.1)
5
5
  htmlentities
6
6
  metainspector
7
7
  nokogiri
@@ -1,6 +1,12 @@
1
1
  module Text
2
2
 
3
- BLACKLIST = [
3
+ def text
4
+ @text ||= find_text_and_clean
5
+ end
6
+
7
+ protected
8
+
9
+ BLACKLIST_HARD = [
4
10
  'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
5
11
  '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
6
12
  '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
@@ -11,19 +17,23 @@ module Text
11
17
  '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
12
18
  ]
13
19
 
14
- def text
15
- # require 'byebug'; byebug
16
- @text ||= find_text
17
- end
20
+ BLACKLIST_SOFT = [
21
+ 'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
22
+ ]
18
23
 
19
- protected
24
+ def find_text_and_clean
25
+ text = find_text.to_s.dup
26
+ text = text.gsub('<br><br>', '<br>')
27
+ text = text.gsub(/\s+/, ' ')
28
+ text = clean_encoding(text)
29
+ text
30
+
31
+ end
20
32
 
21
33
  def find_text
22
- text = find_text_with_json_ld || find_text_with_nokogiri
23
- text.to_s.dup.gsub!('<br><br>', '<br>')
24
- # require 'byebug'; byebug
25
- text = clean_encoding text
26
- text
34
+ find_text_with_json_ld ||
35
+ find_text_with_nokogiri_hard ||
36
+ find_text_with_nokogiri_soft
27
37
  end
28
38
 
29
39
  def find_text_with_json_ld
@@ -34,24 +44,27 @@ module Text
34
44
  return ld['articleBody'] if ld.has_key? 'articleBody'
35
45
  end
36
46
  end
37
- nil
47
+ false
38
48
  end
39
49
 
40
- def find_text_with_nokogiri
50
+ def find_text_with_nokogiri_hard
41
51
  h = nokogiri.dup
42
52
  h.xpath('//style').remove
43
- BLACKLIST.each do |tag|
53
+ BLACKLIST_HARD.each do |tag|
44
54
  h.css(tag).remove
45
55
  end
46
56
  nodes = h.css('p')
47
- if nodes.any?
48
- text = nodes.to_html
49
- text
50
- else
51
- # Cleanup was too hard, let's try softer
52
- h = nokogiri.dup
53
- h.text
57
+ text = nodes.to_html
58
+ text.present? ? text : false
59
+ end
60
+
61
+ def find_text_with_nokogiri_soft
62
+ h = nokogiri.dup
63
+ h.xpath('//style').remove
64
+ BLACKLIST_SOFT.each do |tag|
65
+ h.css(tag).remove
54
66
  end
67
+ h.text
55
68
  end
56
69
 
57
70
  # r&Atilde;&copy;forme -> réforme
@@ -64,7 +77,6 @@ module Text
64
77
  'î', # î
65
78
  'ê', # ê
66
79
  ].each do |string|
67
- # require 'byebug'; byebug
68
80
  double_encoding = true if clean_text.include? string
69
81
  end
70
82
  if double_encoding
@@ -21,7 +21,7 @@ module Title
21
21
  return ld['headline'] if ld.has_key? 'headline'
22
22
  end
23
23
  end
24
- nil
24
+ false
25
25
  end
26
26
 
27
27
  def find_title_with_metainspector
@@ -39,6 +39,7 @@ module Title
39
39
  elsif metainspector_title.present?
40
40
  return metainspector_title
41
41
  end
42
+ false
42
43
  end
43
44
 
44
45
  def find_title_with_nokogiri
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "2.0"
2
+ VERSION = "2.0.1"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '2.0'
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy