curation 2.0.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
4
- data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
3
+ metadata.gz: 7385bf2f2c34ab16df36a865f82bb53effca4f13824122d788f22afab6d5f956
4
+ data.tar.gz: 63d681cb1c06c5aa34caa56deae021ae28292306df00bf85708341ad9006568a
5
5
  SHA512:
6
- metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
7
- data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
6
+ metadata.gz: 222924c992b61f62d8347d76a2f167773a83c7b2ac613031a3dc40d17a0fbbe1f1ec8485d4caa127f0bc89133503e4489b989ef1307676f0ef125d555f4b0e6f
7
+ data.tar.gz: 4668058ec845b325b46dac9a22c7ce6c3047f9d536655f5297d01b63a3bc087fca5dcb6494fd6d796720359e41eb61b572e2117878d4c730d6090dabe171332a
data/Gemfile CHANGED
@@ -7,3 +7,4 @@ gem 'rake', '~> 12.0'
7
7
  gem 'minitest'
8
8
  gem 'minitest-reporters'
9
9
  gem 'byebug'
10
+
data/Gemfile.lock CHANGED
@@ -1,10 +1,11 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (2.0.1)
4
+ curation (2.0.3)
5
5
  htmlentities
6
6
  metainspector
7
7
  nokogiri
8
+ rails-html-sanitizer
8
9
 
9
10
  GEM
10
11
  remote: https://rubygems.org/
@@ -15,6 +16,7 @@ GEM
15
16
  base64 (0.2.0)
16
17
  builder (3.2.4)
17
18
  byebug (11.1.3)
19
+ crass (1.0.6)
18
20
  domain_name (0.5.20190701)
19
21
  unf (>= 0.0.5, < 1.0.0)
20
22
  faraday (2.7.11)
@@ -40,6 +42,9 @@ GEM
40
42
  htmlentities (4.3.4)
41
43
  http-cookie (1.0.5)
42
44
  domain_name (~> 0.5)
45
+ loofah (2.21.4)
46
+ crass (~> 1.0.2)
47
+ nokogiri (>= 1.12.0)
43
48
  metainspector (5.15.0)
44
49
  addressable (~> 2.8.4)
45
50
  faraday (~> 2.5)
@@ -52,6 +57,7 @@ GEM
52
57
  fastimage (~> 2.2)
53
58
  nesty (~> 1.0)
54
59
  nokogiri (~> 1.13)
60
+ mini_portile2 (2.8.9)
55
61
  minitest (5.20.0)
56
62
  minitest-reporters (1.6.1)
57
63
  ansi
@@ -59,10 +65,14 @@ GEM
59
65
  minitest (>= 5.0)
60
66
  ruby-progressbar
61
67
  nesty (1.0.2)
62
- nokogiri (1.15.4-x86_64-darwin)
68
+ nokogiri (1.15.4)
69
+ mini_portile2 (~> 2.8.2)
63
70
  racc (~> 1.4)
64
71
  public_suffix (5.0.3)
65
72
  racc (1.7.3)
73
+ rails-html-sanitizer (1.6.0)
74
+ loofah (~> 2.21)
75
+ nokogiri (~> 1.14)
66
76
  rake (12.3.3)
67
77
  ruby-progressbar (1.13.0)
68
78
  ruby2_keywords (0.0.5)
@@ -72,6 +82,7 @@ GEM
72
82
  zlib (2.1.1)
73
83
 
74
84
  PLATFORMS
85
+ arm64-darwin-24
75
86
  x86_64-darwin-21
76
87
  x86_64-darwin-22
77
88
 
data/curation.gemspec CHANGED
@@ -24,4 +24,5 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "metainspector"
25
25
  spec.add_dependency "nokogiri"
26
26
  spec.add_dependency "htmlentities"
27
+ spec.add_dependency "rails-html-sanitizer"
27
28
  end
@@ -26,8 +26,8 @@ module Text
26
26
  text = text.gsub('<br><br>', '<br>')
27
27
  text = text.gsub(/\s+/, ' ')
28
28
  text = clean_encoding(text)
29
+ text = Rails::HTML5::FullSanitizer.new.sanitize(text)
29
30
  text
30
-
31
31
  end
32
32
 
33
33
  def find_text
@@ -9,7 +9,7 @@ module Title
9
9
  def find_title
10
10
  find_title_with_json_ld ||
11
11
  find_title_with_metainspector ||
12
- find_title_with_nokogiri ||
12
+ find_title_with_nokogiri ||
13
13
  ''
14
14
  end
15
15
 
@@ -29,11 +29,11 @@ module Title
29
29
  metainspector_title = metainspector.title
30
30
  # Problème avec une balise <meta property="title" content="Run 0" />,
31
31
  # metainspector croit que c'est le titre de la page.
32
- # Comme le title contient le best title, avec souvent des infos en plus sur le site,
32
+ # Comme le title contient le best title, avec souvent des infos en plus sur le site,
33
33
  # on vérifie si le best title est bien contenu dans le title
34
- if metainspector_title.present? &&
34
+ if metainspector_title.present? &&
35
35
  metainspector_title.present? &&
36
- metainspector_best_title.present? &&
36
+ metainspector_best_title.present? &&
37
37
  metainspector_title.include?(metainspector_best_title)
38
38
  return metainspector_best_title
39
39
  elsif metainspector_title.present?
@@ -55,4 +55,4 @@ module Title
55
55
  end
56
56
  end
57
57
 
58
- end
58
+ end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "2.0.1"
2
+ VERSION = "2.0.3"
3
3
  end
data/lib/curation.rb CHANGED
@@ -10,6 +10,7 @@ require "curation/finders/title"
10
10
  require "metainspector"
11
11
  require "open-uri"
12
12
  require "htmlentities"
13
+ require "rails-html-sanitizer"
13
14
 
14
15
  module Curation
15
16
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2023-11-11 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: metainspector
@@ -52,6 +51,20 @@ dependencies:
52
51
  - - ">="
53
52
  - !ruby/object:Gem::Version
54
53
  version: '0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rails-html-sanitizer
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
55
68
  description: When you build content curation tools, you need to extract the content
56
69
  of pages (title, text, image...). This requires different strategies and some fine
57
70
  tuning to work efficiently.
@@ -87,7 +100,6 @@ licenses:
87
100
  metadata:
88
101
  homepage_uri: https://github.com/noesya/curation
89
102
  source_code_uri: https://github.com/noesya/curation
90
- post_install_message:
91
103
  rdoc_options: []
92
104
  require_paths:
93
105
  - lib
@@ -102,8 +114,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
102
114
  - !ruby/object:Gem::Version
103
115
  version: '0'
104
116
  requirements: []
105
- rubygems_version: 3.4.10
106
- signing_key:
117
+ rubygems_version: 3.6.7
107
118
  specification_version: 4
108
119
  summary: Curation of content
109
120
  test_files: []