curation 1.11 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bcbcb0a8ecb81b659e2ae2aa35e27acac6006265cf2743b6394f81203c634425
4
- data.tar.gz: 25b25c7f30be8f9b004cec7efbd41caae4e69e87000ff585b92c8377b7855439
3
+ metadata.gz: 8bfaa5cfe719f5a5f5c9f153a358e74f11eb4217f4bbddca880b8f45829fc913
4
+ data.tar.gz: 4abb219144b696da0f33ab12b251036bd67c4ca7250a4576febdcaaa8e409dc0
5
5
  SHA512:
6
- metadata.gz: 420056127e2c0ca86a4ad2e08fb37aa3da31a394264e4d7807a844dd9db61bc44124d6e978a076d9f50a1df37f7a5f80cc782145f54d11c7c3ce00566feca80d
7
- data.tar.gz: a3b387e1c7345968c0eaaf1d2e8fbdbf752aa8b7a8b1e3b3807f206c43e7b096419b61b907622065f690d00a55b933038f0f3ed1b4fefa4d970f272afc1438d5
6
+ metadata.gz: 6d19e2a775ea069a3e6b0dc50b66f30d52f0c4e1e101cea2c9f0dde5cedde611043cf4267a4764994be8105e422919bf708f850935d531d9bb8f907025393a05
7
+ data.tar.gz: 4f0dd67b073cc2003cf798603f7221ba32977645180b671bcde0b1464fc53085cf9a076c2f30f4ce3775a8524246386d576e1bd0d1021c8388db58d13c16068f
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.11)
4
+ curation (2.0)
5
5
  htmlentities
6
6
  metainspector
7
7
  nokogiri
@@ -12,11 +12,13 @@ GEM
12
12
  addressable (2.8.5)
13
13
  public_suffix (>= 2.0.2, < 6.0)
14
14
  ansi (1.5.0)
15
+ base64 (0.2.0)
15
16
  builder (3.2.4)
16
17
  byebug (11.1.3)
17
18
  domain_name (0.5.20190701)
18
19
  unf (>= 0.0.5, < 1.0.0)
19
- faraday (2.7.10)
20
+ faraday (2.7.11)
21
+ base64
20
22
  faraday-net_http (>= 2.0, < 3.1)
21
23
  ruby2_keywords (>= 0.0.4)
22
24
  faraday-cookie_jar (0.0.7)
@@ -50,7 +52,7 @@ GEM
50
52
  fastimage (~> 2.2)
51
53
  nesty (~> 1.0)
52
54
  nokogiri (~> 1.13)
53
- minitest (5.19.0)
55
+ minitest (5.20.0)
54
56
  minitest-reporters (1.6.1)
55
57
  ansi
56
58
  builder
@@ -60,13 +62,13 @@ GEM
60
62
  nokogiri (1.15.4-x86_64-darwin)
61
63
  racc (~> 1.4)
62
64
  public_suffix (5.0.3)
63
- racc (1.7.1)
65
+ racc (1.7.3)
64
66
  rake (12.3.3)
65
67
  ruby-progressbar (1.13.0)
66
68
  ruby2_keywords (0.0.5)
67
69
  unf (0.1.4)
68
70
  unf_ext
69
- unf_ext (0.0.8.2)
71
+ unf_ext (0.0.9)
70
72
  zlib (2.1.1)
71
73
 
72
74
  PLATFORMS
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2020 Arnaud Levy
3
+ Copyright (c) 2020 noesya
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -37,7 +37,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
37
37
 
38
38
  ## Contributing
39
39
 
40
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/curation/blob/master/CODE_OF_CONDUCT.md).
40
+ Bug reports and pull requests are welcome on GitHub at https://github.com/noesya/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/noesya/curation/blob/master/CODE_OF_CONDUCT.md).
41
41
 
42
42
 
43
43
  ## License
data/curation.gemspec CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
 
9
9
  spec.summary = 'Curation of content'
10
10
  spec.description = %q{When you build content curation tools, you need to extract the content of pages (title, text, image...). This requires different strategies and some fine tuning to work efficiently.}
11
- spec.homepage = "https://github.com/arnaudlevy/curation"
11
+ spec.homepage = "https://github.com/noesya/curation"
12
12
  spec.license = "MIT"
13
13
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
14
 
15
15
  spec.metadata["homepage_uri"] = spec.homepage
16
- spec.metadata["source_code_uri"] = "https://github.com/arnaudlevy/curation"
16
+ spec.metadata["source_code_uri"] = "https://github.com/noesya/curation"
17
17
 
18
18
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
19
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -0,0 +1,51 @@
1
# Finds the main illustration of a page.
#
# The including class is expected to provide +json_ld+, +metainspector+,
# +nokogiri+, +url+ and +log+.
module Image

  # Returns the image URL as a String, with 'http://' rewritten to 'https://'.
  # Returns '' when no image could be found. The result is memoized.
  def image
    @image ||= find_image.to_s.gsub('http://', 'https://')
  end

  protected

  # Tries JSON-LD first, then the page metadata; '' when nothing matches.
  def find_image
    log "Curation::Page find_image #{url}"
    find_image_with_json_ld || find_image_with_metadata || ''
  end

  # Returns the first non-empty image found in the JSON-LD entries, or nil.
  def find_image_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      # A JSON-LD script may decode to something other than an object.
      next unless ld.is_a?(Hash) && ld.has_key?('image')
      candidate = image_from_json_ld_value(ld['image'])
      return candidate unless candidate.to_s.empty?
    end
    nil
  end

  # Extracts a URL from the shapes an 'image' value takes here:
  # a String, a Hash with a 'url' key, or an Array starting with either.
  # Returns nil for any other shape.
  def image_from_json_ld_value(image_data)
    case image_data
    when String
      log "Curation::Page find_image json_ld string"
      image_data
    when Array
      first = image_data.first
      case first
      when String
        log "Curation::Page find_image json_ld array"
        first
      when Hash
        log "Curation::Page find_image json_ld array url"
        first['url']
      end
    when Hash
      log "Curation::Page find_image json_ld url"
      image_data['url']
    end
  end

  # Falls back on metainspector's best image, then on the og:image tag.
  # Each source is evaluated lazily and rescued on its own, so a missing
  # og:image tag can no longer discard a valid metainspector result.
  def find_image_with_metadata
    [
      -> { metainspector.images.best },
      -> { nokogiri.css('[property="og:image"]').first&.[]('content') }
    ].each do |finder|
      begin
        candidate = finder.call
        return candidate unless candidate.to_s.empty?
      rescue StandardError
        log 'Curation::Page find_image error'
      end
    end
    nil
  end

end
@@ -0,0 +1,43 @@
1
# Finds the publication date of a page.
#
# The including class is expected to provide +json_ld+, +metatags+,
# +nokogiri+ and +html+.
module PublicationDate

  # Returns the publication date as a Date, or nil when none could be parsed.
  def date
    @date ||= find_date
  end

  protected

  # Tries JSON-LD first, then each textual source in order.
  # Every attempt is isolated, so a missing or malformed value moves on to
  # the next source instead of raising.
  def find_date
    json_ld_date = find_date_with_json_ld
    return json_ld_date if json_ld_date
    date_value_finders.each do |finder|
      parsed = parse_date_candidate(&finder)
      return parsed if parsed
    end
    nil
  end

  # Returns the first parsable 'datePublished' of a news article entry, or nil.
  def find_date_with_json_ld
    json_ld.each do |ld|
      # A JSON-LD script may decode to something other than an object.
      next unless ld.is_a?(Hash)
      next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
      next unless ld.has_key? 'datePublished'
      parsed = parse_date_candidate { ld['datePublished'] }
      return parsed if parsed
    end
    nil
  end

  # Ordered list of lambdas, each returning a raw date string or raising.
  def date_value_finders
    [
      -> { metatags['date'] },
      -> { metatags['pubdate'] },
      -> { nokogiri.css('meta[property="article:published"]').first['content'] },
      -> { nokogiri.css('meta[property="article:published_time"]').first['content'] },
      -> { nokogiri.css('meta[property="og:article:published_time"]').first['content'] },
      -> { display_date_value },
      -> { nokogiri.css('.postDate').first.inner_text.gsub(' — ', '') },
      -> { nokogiri.css('.gta_post_date').first.inner_text }
    ]
  end

  # Extracts the text following the first 'DisplayDate' marker in the raw
  # HTML, up to the next comma, without quotes or a leading colon.
  # Returns nil when the marker is absent.
  def display_date_value
    chunks = html.to_s.split('DisplayDate')
    return nil unless chunks.count > 1
    value = chunks[1].split(',').first.to_s.gsub('"', '')
    value = value[1..-1] if value[0] == ':'
    value
  end

  # Parses the value produced by the block; nil when the block raises or
  # when the value is not a parsable date.
  def parse_date_candidate
    Date.parse(yield)
  rescue StandardError
    nil
  end
end
@@ -0,0 +1,78 @@
1
# Extracts the main text of a page.
#
# The including class is expected to provide +json_ld+ and +nokogiri+.
module Text

  # CSS selectors removed from the document before paragraphs are collected.
  BLACKLIST = [
    'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
    '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
    '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
    '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
    '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
    '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
    '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
    '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
  ].freeze

  # Character pairs produced when UTF-8 text is read as ISO-8859-1.
  # Written as escapes so the markers survive any re-encoding of this file.
  DOUBLE_ENCODING_MARKERS = [
    "\u00C3\u00A9", # é
    "\u00C3\u00A8", # è
    "\u00C3\u00AE", # î
    "\u00C3\u00AA"  # ê
  ].freeze

  # Returns the page text as a String. The result is memoized.
  def text
    @text ||= find_text
  end

  protected

  # Takes the JSON-LD text when there is one, the paragraphs otherwise,
  # collapses doubled line breaks and repairs double encoding.
  def find_text
    text = find_text_with_json_ld || find_text_with_nokogiri
    # gsub (not gsub! on a copy) so the replacement is actually kept.
    text = text.to_s.gsub('<br><br>', '<br>')
    clean_encoding text
  end

  # Returns the 'text' or 'articleBody' of the first news article entry
  # that has one, or nil.
  def find_text_with_json_ld
    json_ld.each do |ld|
      # A JSON-LD script may decode to something other than an object.
      next unless ld.is_a?(Hash)
      next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
      return ld['text'] if ld.has_key? 'text'
      return ld['articleBody'] if ld.has_key? 'articleBody'
    end
    nil
  end

  # Returns the HTML of the <p> nodes left once the blacklisted selectors
  # are removed from a copy of the document. When no paragraph is left,
  # returns the text of the untouched document instead.
  def find_text_with_nokogiri
    h = nokogiri.dup
    h.xpath('//style').remove
    BLACKLIST.each do |tag|
      h.css(tag).remove
    end
    nodes = h.css('p')
    return nodes.to_html if nodes.any?
    # Cleanup was too hard, let's try softer
    nokogiri.dup.text
  end

  # r&Atilde;&copy;forme -> réforme
  #
  # Decodes the HTML entities and looks for double-encoding markers.
  # When one is found, returns the decoded text re-read as UTF-8;
  # otherwise returns +text+ unchanged, entities included.
  def clean_encoding(text)
    clean_text = HTMLEntities.new.decode text
    double_encoding = DOUBLE_ENCODING_MARKERS.any? { |marker| clean_text.include? marker }
    if double_encoding
      clean_text.encode('iso-8859-1', undef: :replace)
                .force_encoding('utf-8')
    else
      text
    end
  end

end
@@ -0,0 +1,57 @@
1
# Finds the title of a page.
#
# The including class is expected to provide +json_ld+, +metainspector+,
# +nokogiri+ and +log+.
module Title

  # Returns the title as a String, stripped and with every run of
  # whitespace collapsed to one space. Returns '' when nothing was found.
  def title
    @title ||= find_title.to_s.strip.gsub(/\s+/, ' ')
  end

  protected

  # Tries each strategy in order; '' when none yields a title.
  def find_title
    find_title_with_json_ld ||
      find_title_with_metainspector ||
      find_title_with_nokogiri ||
      ''
  end

  # Returns the first non-blank 'headline' of the JSON-LD entries, or nil.
  def find_title_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      # A JSON-LD script may decode to something other than an object.
      next unless ld.is_a?(Hash)
      headline = ld['headline']
      return headline if headline.is_a?(String) && !headline.strip.empty?
    end
    nil
  end

  # Returns metainspector's best title when the page title contains it,
  # the page title otherwise, or nil when the page title is blank.
  #
  # A tag such as <meta property="title" content="Run 0" /> makes
  # metainspector take that content for the best title. The page title
  # usually contains the best title plus site information, so the best
  # title is only trusted when it is contained in the page title.
  #
  # Blank checks use plain Ruby so the method does not depend on
  # ActiveSupport's present?.
  def find_title_with_metainspector
    best_title = metainspector.best_title.to_s
    page_title = metainspector.title.to_s
    return nil if page_title.strip.empty?
    if !best_title.strip.empty? && page_title.include?(best_title)
      best_title
    else
      page_title
    end
  rescue StandardError
    log 'Curation::Page find_title_with_metainspector error'
    nil
  end

  # Returns the text of the first [itemprop="headline"] node, else of the
  # first <title> node, or nil. The explicit nil matters: returning the
  # result of #each would hand an Array back to #title.
  def find_title_with_nokogiri
    ['[itemprop="headline"]', 'title'].each do |selector|
      possibility = nokogiri.css(selector)&.first&.inner_text
      return possibility unless possibility.to_s.empty?
    end
    nil
  rescue StandardError
    log 'Curation::Page find_title_with_nokogiri error'
    nil
  end

end
@@ -0,0 +1,26 @@
1
# Reads the JSON-LD data embedded in a page.
#
# The including class is expected to provide +nokogiri+ and +log+.
module Jsonld

  # Returns an Array of Hashes, one per JSON-LD object found in the
  # [type="application/ld+json"] nodes. Returns [] when reading the
  # document fails. The result is memoized.
  def json_ld
    return @json_ld if defined?(@json_ld)
    @json_ld = begin
      parsed = nokogiri.css('[type="application/ld+json"]').map do |option|
        json_ld_from_object(option)
      end
      # Some sites have tables in tables.
      # Only objects are kept: callers look entries up by key, so a script
      # that decodes to a string, a number or null would break them.
      parsed.flatten.select { |entry| entry.is_a?(Hash) }
    rescue StandardError
      log 'Curation::Page json_ld error'
      []
    end
  end

  # Parses the text of +object+ as JSON; {} when it is not valid JSON.
  def json_ld_from_object(object)
    JSON.parse object.inner_text
  rescue StandardError
    {}
  end
end
@@ -0,0 +1,18 @@
1
# Gives access to the MetaInspector view of a page.
#
# The including class is expected to provide +url+, +html+ and +log+.
module Metainspector

  # Returns a MetaInspector built from +url+, reusing +html+ as the document
  # when it is not nil. Returns nil when the construction raises.
  #
  # The outcome is memoized even when it is nil, so a failing page is not
  # built again on every call.
  def metainspector
    return @metainspector if defined?(@metainspector)
    @metainspector = begin
      document = html
      if document.nil?
        MetaInspector.new(url)
      else
        MetaInspector.new(url, document: document)
      end
    rescue StandardError
      log 'Curation::Page metainspector error'
      nil
    end
  end

  # Returns the 'name' entry of metainspector's meta_tag, or nil when it
  # cannot be read.
  def metatags
    @metatags ||= metainspector.meta_tag['name']
  rescue StandardError
    log 'Curation::Page metatags error'
    nil
  end
end
@@ -0,0 +1,17 @@
1
# Gives access to the parsed HTML document of a page.
#
# The including class is expected to provide +file+, +metainspector+
# and +log+.
#
# NOTE(review): this top-level module has the same name as the namespace
# that Nokogiri::HTML below is resolved from — confirm that sharing the
# constant with the parsing library is intended.
module Nokogiri

  # Returns the document parsed from +file+, or metainspector's parsed
  # document when +file+ is nil. Returns nil when parsing raises.
  #
  # The outcome is memoized even when it is nil, so a failing page is not
  # fetched and parsed again on every call.
  def nokogiri
    return @nokogiri if defined?(@nokogiri)
    @nokogiri = begin
      source = file
      if source.nil?
        metainspector.parsed
      else
        # Rewind before and after so other readers of the file start at 0.
        source.rewind
        document = Nokogiri::HTML(source)
        source.rewind
        document
      end
    rescue StandardError
      log 'Curation::Page nokogiri error'
      nil
    end
  end
end
@@ -0,0 +1,19 @@
1
# Gives access to the raw content of a page.
#
# The including class is expected to provide +url+ and +log+, and may set
# @html beforehand to supply the content itself.
module Raw

  # Returns the IO opened on +url+, or nil when opening raises.
  #
  # The outcome is memoized even when it is nil, so an unreachable URL is
  # requested once instead of once per call.
  #
  # NOTE(review): +url+ is handed to URI.open as is — confirm that callers
  # only pass URLs they trust.
  def file
    return @file if defined?(@file)
    @file = begin
      URI.open url, 'User-Agent' => "Mozilla/5.0"
    rescue StandardError
      log "Curation::Page file error with url #{url}"
      nil
    end
  end

  # Returns @html when it is already set, otherwise the whole content of
  # +file+. Returns nil when there is no file or when reading raises.
  def html
    return @html if @html
    source = file
    if source.nil?
      log "Curation::Page html error"
      return nil
    end
    # Rewind before and after so other readers of the file start at 0.
    source.rewind
    @html = source.read
    source.rewind
    @html
  rescue StandardError
    log "Curation::Page html error"
    nil
  end
end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.11"
2
+ VERSION = "2.0"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,4 +1,12 @@
1
1
  require "curation/version"
2
+ require "curation/tools/raw"
3
+ require "curation/tools/nokogiri"
4
+ require "curation/tools/jsonld"
5
+ require "curation/tools/metainspector"
6
+ require "curation/finders/image"
7
+ require "curation/finders/publication_date"
8
+ require "curation/finders/text"
9
+ require "curation/finders/title"
2
10
  require "metainspector"
3
11
  require "open-uri"
4
12
  require "htmlentities"
@@ -7,274 +15,29 @@ module Curation
7
15
  class Error < StandardError; end
8
16
 
9
17
  class Page
18
+ # Tools
19
+ include Raw
20
+ include Jsonld
21
+ include Metainspector
22
+ include Nokogiri
23
+
24
+ # Finders
25
+ include Title
26
+ include Image
27
+ include PublicationDate
28
+ include Text
29
+
10
30
  attr_reader :url
11
31
  attr_accessor :verbose
12
32
 
13
- BLACKLIST = [
14
- 'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
15
- '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
16
- '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
17
- '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
18
- '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
19
- '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
20
- '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
21
- '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
22
- ]
23
-
24
33
  def initialize(url, html = nil)
25
34
  @url = url.to_s.gsub('http://', 'https://')
26
35
  @html = html
27
36
  @verbose = false
28
37
  end
29
38
 
30
- def title
31
- @title ||= find_title
32
- end
33
-
34
- def image
35
- unless @image
36
- @image = find_image
37
- @image = @image.to_s.gsub('http://', 'https://')
38
- end
39
- @image
40
- end
41
-
42
- def text
43
- # require 'byebug'; byebug
44
- @text ||= find_text
45
- end
46
-
47
- def date
48
- @date ||= find_date
49
- end
50
-
51
39
  protected
52
40
 
53
- def find_title
54
- if json_ld.any?
55
- json_ld.each do |ld|
56
- # require 'byebug'; byebug
57
- ld = ld.first if ld.is_a?(Array)
58
- return ld['headline'] if ld.has_key? 'headline'
59
- end
60
- end
61
- begin
62
- [
63
- metainspector.best_title,
64
- metainspector.title,
65
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
66
- nokogiri.css('title')&.first&.inner_text
67
- ].each do |possibility|
68
- return possibility unless possibility.to_s.empty?
69
- end
70
- rescue
71
- log 'Curation::Page find_title error'
72
- end
73
- return ''
74
- end
75
-
76
- def find_image
77
- log "Curation::Page find_image #{url}"
78
- if json_ld.any?
79
- json_ld.each do |ld|
80
- ld = ld.first if ld.is_a?(Array)
81
- if ld.has_key? 'image'
82
- image_data = ld['image']
83
- if image_data.is_a? String
84
- log "Curation::Page find_image json_ld string"
85
- return image_data
86
- end
87
- if image_data.is_a? Array
88
- first = image_data.first
89
- if first.is_a? String
90
- log "Curation::Page find_image json_ld array"
91
- return first
92
- end
93
- if first.is_a? Hash
94
- log "Curation::Page find_image json_ld array url"
95
- return first['url']
96
- end
97
- end
98
- if image_data.is_a? Hash
99
- log "Curation::Page find_image json_ld url"
100
- return image_data['url']
101
- end
102
- end
103
- end
104
- end
105
- begin
106
- [
107
- metainspector.images.best,
108
- nokogiri.css('[property="og:image"]').first&.attributes['content'].value
109
- ].each do |possibility|
110
- return possibility unless possibility.to_s.empty?
111
- end
112
- rescue
113
- puts 'Curation::Page find_image error'
114
- end
115
- return ''
116
- end
117
-
118
- def find_text
119
- text = find_text_with_json_ld || find_text_with_nokogiri
120
- text.to_s.gsub!('<br><br>', '<br>')
121
- # require 'byebug'; byebug
122
- text = clean_encoding text
123
- text
124
- end
125
-
126
- def find_text_with_json_ld
127
- if json_ld.any?
128
- json_ld.each do |ld|
129
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
130
- return ld['text'] if ld.has_key? 'text'
131
- return ld['articleBody'] if ld.has_key? 'articleBody'
132
- end
133
- end
134
- nil
135
- end
136
-
137
- def find_text_with_nokogiri
138
- h = nokogiri.dup
139
- BLACKLIST.each do |tag|
140
- h.css(tag).remove
141
- end
142
- nodes = h.css('p')
143
- nodes.xpath('//style').remove
144
- text = nodes.to_html
145
- text
146
- end
147
-
148
- def find_date
149
- if json_ld.any?
150
- json_ld.each do |ld|
151
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
152
- return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
153
- end
154
- end
155
- return Date.parse metatags['date'] rescue nil
156
- return Date.parse metatags['pubdate'] rescue nil
157
- return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
158
- return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
159
- return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
160
- chunks = html.split('DisplayDate')
161
- if chunks.count > 1
162
- value = chunks[1]
163
- value = value.split(',').first
164
- value = value.gsub('"', '')
165
- value = value[1..-1] if value[0] == ':'
166
- return Date.parse value rescue nil
167
- end
168
- begin
169
- value = nokogiri.css('.postDate').first
170
- value = value.inner_text
171
- value = value.gsub(' — ', '')
172
- return Date.parse value
173
- rescue
174
- end
175
- begin
176
- value = nokogiri.css('.gta_post_date').first
177
- value = value.inner_text
178
- return Date.parse value
179
- rescue
180
- end
181
- end
182
-
183
- private
184
-
185
- def json_ld
186
- unless defined?(@json_ld)
187
- @json_ld = []
188
- begin
189
- options = nokogiri.css('[type="application/ld+json"]')
190
- options.each do |option|
191
- @json_ld << json_ld_from_object(option)
192
- end
193
- # Some sites have tables in tables
194
- @json_ld.flatten!
195
- # require 'byebug'; byebug
196
- rescue
197
- log 'Curation::Page json_ld error'
198
- end
199
- end
200
- @json_ld
201
- end
202
-
203
- def json_ld_from_object(object)
204
- JSON.parse object.inner_text
205
- rescue
206
- {}
207
- end
208
-
209
- def file
210
- @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
211
- rescue
212
- log "Curation::Page file error with url #{url}"
213
- end
214
-
215
- def html
216
- unless @html
217
- file.rewind
218
- @html = file.read
219
- file.rewind
220
- end
221
- @html
222
- rescue
223
- log "Curation::Page html error"
224
- end
225
-
226
- def nokogiri
227
- unless @nokogiri
228
- if file.nil?
229
- @nokogiri = metainspector.parsed
230
- else
231
- file.rewind
232
- @nokogiri = Nokogiri::HTML file
233
- file.rewind
234
- end
235
- end
236
- @nokogiri
237
- rescue
238
- log 'Curation::Page nokogiri error'
239
- end
240
-
241
- def metainspector
242
- unless @metainspector
243
- @metainspector = html.nil? ? MetaInspector.new(url)
244
- : MetaInspector.new(url, document: html)
245
- end
246
- @metainspector
247
- rescue
248
- log 'Curation::Page metainspector error'
249
- end
250
-
251
- def metatags
252
- @metatags ||= metainspector.meta_tag['name']
253
- rescue
254
- log 'Curation::Page metatags error'
255
- end
256
-
257
- # r&Atilde;&copy;forme -> réforme
258
- def clean_encoding(text)
259
- clean_text = HTMLEntities.new.decode text
260
- double_encoding = false
261
- [
262
- 'é', # é
263
- 'è', # è
264
- 'î', # î
265
- 'ê', # ê
266
- ].each do |string|
267
- # require 'byebug'; byebug
268
- double_encoding = true if clean_text.include? string
269
- end
270
- if double_encoding
271
- clean_text.encode('iso-8859-1', undef: :replace)
272
- .force_encoding('utf-8')
273
- else
274
- text
275
- end
276
- end
277
-
278
41
  def log(message)
279
42
  puts message if verbose
280
43
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.11'
4
+ version: '2.0'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-08-29 00:00:00.000000000 Z
11
+ date: 2023-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -72,13 +72,21 @@ files:
72
72
  - bin/setup
73
73
  - curation.gemspec
74
74
  - lib/curation.rb
75
+ - lib/curation/finders/image.rb
76
+ - lib/curation/finders/publication_date.rb
77
+ - lib/curation/finders/text.rb
78
+ - lib/curation/finders/title.rb
79
+ - lib/curation/tools/jsonld.rb
80
+ - lib/curation/tools/metainspector.rb
81
+ - lib/curation/tools/nokogiri.rb
82
+ - lib/curation/tools/raw.rb
75
83
  - lib/curation/version.rb
76
- homepage: https://github.com/arnaudlevy/curation
84
+ homepage: https://github.com/noesya/curation
77
85
  licenses:
78
86
  - MIT
79
87
  metadata:
80
- homepage_uri: https://github.com/arnaudlevy/curation
81
- source_code_uri: https://github.com/arnaudlevy/curation
88
+ homepage_uri: https://github.com/noesya/curation
89
+ source_code_uri: https://github.com/noesya/curation
82
90
  post_install_message:
83
91
  rdoc_options: []
84
92
  require_paths:
@@ -94,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
102
  - !ruby/object:Gem::Version
95
103
  version: '0'
96
104
  requirements: []
97
- rubygems_version: 3.4.6
105
+ rubygems_version: 3.4.10
98
106
  signing_key:
99
107
  specification_version: 4
100
108
  summary: Curation of content