curation 1.11 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bcbcb0a8ecb81b659e2ae2aa35e27acac6006265cf2743b6394f81203c634425
4
- data.tar.gz: 25b25c7f30be8f9b004cec7efbd41caae4e69e87000ff585b92c8377b7855439
3
+ metadata.gz: c2062c7ec7fb444d27f102d26658b386a01a964ea4e03aa0d81a472316012d11
4
+ data.tar.gz: 634f5216e61801b3ac42c8b340d5012c101024f4d5b5e0ba155c01a89f38cafb
5
5
  SHA512:
6
- metadata.gz: 420056127e2c0ca86a4ad2e08fb37aa3da31a394264e4d7807a844dd9db61bc44124d6e978a076d9f50a1df37f7a5f80cc782145f54d11c7c3ce00566feca80d
7
- data.tar.gz: a3b387e1c7345968c0eaaf1d2e8fbdbf752aa8b7a8b1e3b3807f206c43e7b096419b61b907622065f690d00a55b933038f0f3ed1b4fefa4d970f272afc1438d5
6
+ metadata.gz: 15008b92c6a51fdf9bd79b9f1da01d59323c13589b6e8ccd5d8754b96153173fe8a2cdffa5faa45a4d38c92b097a35012f8dff98fea6244ade8c9af22c13d1cc
7
+ data.tar.gz: 96a29d3c8482fce0101f91a3f24eafaec51a9115f2169831b67939edf79a47d7f5bb0c3223466de355cf8a367143a262cc54a1d5e4dd7e6bffe2d310879a9cae
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.11)
4
+ curation (2.0.1)
5
5
  htmlentities
6
6
  metainspector
7
7
  nokogiri
@@ -12,11 +12,13 @@ GEM
12
12
  addressable (2.8.5)
13
13
  public_suffix (>= 2.0.2, < 6.0)
14
14
  ansi (1.5.0)
15
+ base64 (0.2.0)
15
16
  builder (3.2.4)
16
17
  byebug (11.1.3)
17
18
  domain_name (0.5.20190701)
18
19
  unf (>= 0.0.5, < 1.0.0)
19
- faraday (2.7.10)
20
+ faraday (2.7.11)
21
+ base64
20
22
  faraday-net_http (>= 2.0, < 3.1)
21
23
  ruby2_keywords (>= 0.0.4)
22
24
  faraday-cookie_jar (0.0.7)
@@ -50,7 +52,7 @@ GEM
50
52
  fastimage (~> 2.2)
51
53
  nesty (~> 1.0)
52
54
  nokogiri (~> 1.13)
53
- minitest (5.19.0)
55
+ minitest (5.20.0)
54
56
  minitest-reporters (1.6.1)
55
57
  ansi
56
58
  builder
@@ -60,13 +62,13 @@ GEM
60
62
  nokogiri (1.15.4-x86_64-darwin)
61
63
  racc (~> 1.4)
62
64
  public_suffix (5.0.3)
63
- racc (1.7.1)
65
+ racc (1.7.3)
64
66
  rake (12.3.3)
65
67
  ruby-progressbar (1.13.0)
66
68
  ruby2_keywords (0.0.5)
67
69
  unf (0.1.4)
68
70
  unf_ext
69
- unf_ext (0.0.8.2)
71
+ unf_ext (0.0.9)
70
72
  zlib (2.1.1)
71
73
 
72
74
  PLATFORMS
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2020 Arnaud Levy
3
+ Copyright (c) 2020 noesya
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -37,7 +37,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
37
37
 
38
38
  ## Contributing
39
39
 
40
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/curation/blob/master/CODE_OF_CONDUCT.md).
40
+ Bug reports and pull requests are welcome on GitHub at https://github.com/noesya/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/noesya/curation/blob/master/CODE_OF_CONDUCT.md).
41
41
 
42
42
 
43
43
  ## License
data/curation.gemspec CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
 
9
9
  spec.summary = 'Curation of content'
10
10
  spec.description = %q{When you build content curation tools, you need to extract the content of pages (title, text, image...). This requires different strategies and some fine tuning to work efficiently.}
11
- spec.homepage = "https://github.com/arnaudlevy/curation"
11
+ spec.homepage = "https://github.com/noesya/curation"
12
12
  spec.license = "MIT"
13
13
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
14
 
15
15
  spec.metadata["homepage_uri"] = spec.homepage
16
- spec.metadata["source_code_uri"] = "https://github.com/arnaudlevy/curation"
16
+ spec.metadata["source_code_uri"] = "https://github.com/noesya/curation"
17
17
 
18
18
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
19
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -0,0 +1,51 @@
# Finds the main image of a page.
#
# Mixed into a host object that must respond to +json_ld+, +metainspector+,
# +nokogiri+, +url+ and +log+ (all of them are called below).
module Image

  # Returns the image URL as a String, or '' when no strategy finds one.
  # Only a leading "http://" scheme is upgraded to "https://", so URLs embedded
  # in the path or query string of the image URL are left untouched.
  def image
    @image ||= find_image.to_s.sub(%r{\Ahttp://}, 'https://')
  end

  protected

  # Tries JSON-LD first, then the metadata fallbacks, and ends with ''.
  def find_image
    log "Curation::Page find_image #{url}"
    find_image_with_json_ld || find_image_with_metadata || ''
  end

  # Returns the first non-empty image declared in a JSON-LD entry, or nil.
  def find_image_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      # JSON-LD scripts may contain bare strings, numbers or null.
      next unless ld.is_a?(Hash) && ld.key?('image')
      candidate = image_from_json_ld(ld['image'])
      # An unusable entry must not stop the search: keep looking.
      return candidate unless candidate.to_s.empty?
    end
    nil
  end

  # Extracts a URL from a JSON-LD "image" value, which may be a String, a Hash
  # with a "url" key, or an Array whose first item is one of those.
  def image_from_json_ld(image_data)
    from_array = image_data.is_a?(Array)
    image_data = image_data.first if from_array
    case image_data
    when String
      log "Curation::Page find_image json_ld #{from_array ? 'array' : 'string'}"
      image_data
    when Hash
      log "Curation::Page find_image json_ld #{from_array ? 'array url' : 'url'}"
      image_data['url']
    end
  end

  # Returns the first non-empty image found by the metadata fallbacks, or nil.
  # Candidates are evaluated one at a time so that a failure in a later
  # strategy cannot discard a value found by an earlier one.
  def find_image_with_metadata
    [
      -> { metainspector.images.best },
      lambda do
        node = nokogiri.css('[property="og:image"]').first
        node && node['content']
      end
    ].each do |finder|
      candidate = find_image_safely(finder)
      return candidate unless candidate.to_s.empty?
    end
    nil
  end

  # Runs one strategy, logging (through +log+, so only when verbose) and
  # returning nil if it raises.
  def find_image_safely(finder)
    finder.call
  rescue StandardError
    log 'Curation::Page find_image error'
    nil
  end

end
@@ -0,0 +1,43 @@
require 'date'

# Finds the publication date of a page.
#
# Mixed into a host object that must respond to +json_ld+, +metatags+,
# +nokogiri+ and +html+ (all of them are called below).
module PublicationDate

  # Returns the publication date as a Date, or nil when no strategy finds one.
  def date
    @date ||= find_date
  end

  protected

  # JSON-LD @type values whose datePublished is trusted.
  DATED_JSON_LD_TYPES = ['NewsArticle', 'ReportageNewsArticle'].freeze

  # Runs every strategy in priority order and returns the first Date found.
  # A strategy that fails (missing node, unparsable value) yields nil
  # and the next one is tried.
  def find_date
    find_date_with_json_ld ||
      date_from { metatags['date'] } ||
      date_from { metatags['pubdate'] } ||
      date_from { nokogiri.css('meta[property="article:published"]').first['content'] } ||
      date_from { nokogiri.css('meta[property="article:published_time"]').first['content'] } ||
      date_from { nokogiri.css('meta[property="og:article:published_time"]').first['content'] } ||
      find_date_with_display_date ||
      date_from { nokogiri.css('.postDate').first.inner_text.gsub(' — ', '') } ||
      date_from { nokogiri.css('.gta_post_date').first.inner_text }
  end

  # Returns the first parsable datePublished of a news-article JSON-LD entry.
  def find_date_with_json_ld
    json_ld.each do |ld|
      # JSON-LD scripts may contain bare strings, numbers or null.
      next unless ld.is_a?(Hash) && DATED_JSON_LD_TYPES.include?(ld['@type'])
      next unless ld.key?('datePublished')
      published = date_from { ld['datePublished'] }
      return published if published
    end
    nil
  end

  # Looks for a raw `DisplayDate` marker in the HTML source and parses the
  # value that follows it (up to the next comma, quotes and leading colon
  # removed).
  def find_date_with_display_date
    chunks = html.to_s.split('DisplayDate')
    return nil unless chunks.count > 1
    value = chunks[1].split(',').first.to_s.delete('"')
    value = value[1..-1] if value[0] == ':'
    date_from { value }
  end

  # Parses the value produced by the block, returning nil if producing or
  # parsing it raises.
  def date_from
    Date.parse(yield)
  rescue StandardError
    nil
  end

end
@@ -0,0 +1,90 @@
# Extracts the main text of a page.
#
# Mixed into a host object that must respond to +json_ld+ and +nokogiri+
# (both are called below). Entity decoding relies on the HTMLEntities class.
module Text

  # Returns the cleaned text (an HTML fragment or plain text, depending on the
  # strategy that succeeded) as a String; '' when nothing is found.
  def text
    @text ||= find_text_and_clean
  end

  protected

  # Selectors removed before collecting the paragraphs (strict strategy).
  BLACKLIST_HARD = [
    'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
    '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
    '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
    '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
    '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
    '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
    '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
    '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
  ].freeze

  # Selectors removed before taking the whole document text (lenient strategy).
  BLACKLIST_SOFT = [
    'head', 'script', 'noscript', 'style', 'iframe', 'nav', 'footer', 'aside', '[role="dialog"]'
  ].freeze

  # JSON-LD @type values whose text / articleBody is trusted.
  TEXT_JSON_LD_TYPES = ['NewsArticle', 'ReportageNewsArticle'].freeze

  # UTF-8 bytes that were wrongly read as ISO-8859-1 show up as these
  # two-character sequences (e.g. "r&Atilde;&copy;forme" for "réforme").
  DOUBLE_ENCODING_MARKERS = [
    "\u00C3\u00A9", # é read as ISO-8859-1
    "\u00C3\u00A8", # è read as ISO-8859-1
    "\u00C3\u00AE", # î read as ISO-8859-1
    "\u00C3\u00AA"  # ê read as ISO-8859-1
  ].freeze

  # Finds the text, collapses doubled <br> and runs of whitespace, then
  # repairs double-encoded content.
  def find_text_and_clean
    text = find_text.to_s.gsub('<br><br>', '<br>').gsub(/\s+/, ' ')
    clean_encoding(text)
  end

  # Strategies in priority order; each returns a falsy value when it fails.
  def find_text
    find_text_with_json_ld ||
      find_text_with_nokogiri_hard ||
      find_text_with_nokogiri_soft
  end

  # Returns the text / articleBody of the first news-article JSON-LD entry
  # that declares one, or false.
  def find_text_with_json_ld
    json_ld.each do |ld|
      # JSON-LD scripts may contain bare strings, numbers or null.
      next unless ld.is_a?(Hash) && TEXT_JSON_LD_TYPES.include?(ld['@type'])
      return ld['text'] if ld.key?('text')
      return ld['articleBody'] if ld.key?('articleBody')
    end
    false
  end

  # Returns the HTML of every <p> left after the strict blacklist, or false
  # when nothing but whitespace remains.
  def find_text_with_nokogiri_hard
    document = document_without(BLACKLIST_HARD)
    return false if document.nil?
    paragraphs = document.css('p').to_html
    paragraphs.to_s.strip.empty? ? false : paragraphs
  end

  # Returns the plain text left after the lenient blacklist (nil when the
  # document is unavailable).
  def find_text_with_nokogiri_soft
    document = document_without(BLACKLIST_SOFT)
    document && document.text
  end

  # Returns a copy of the parsed document with <style> and the given
  # selectors removed, or nil when the document could not be parsed.
  def document_without(selectors)
    source = nokogiri
    return nil if source.nil?
    document = source.dup
    document.xpath('//style').remove
    selectors.each { |selector| document.css(selector).remove }
    document
  end

  # r&Atilde;&copy;forme -> réforme
  #
  # Decodes HTML entities and, when the result contains a double-encoding
  # marker, converts it back to the bytes it was built from. The input is
  # returned unchanged when no marker is found or when the repair would not
  # yield valid UTF-8.
  def clean_encoding(text)
    decoded = HTMLEntities.new.decode(text)
    return text unless DOUBLE_ENCODING_MARKERS.any? { |marker| decoded.include?(marker) }
    repaired = decoded.encode('iso-8859-1', undef: :replace).force_encoding('utf-8')
    repaired.valid_encoding? ? repaired : text
  rescue EncodingError
    text
  end

end
@@ -0,0 +1,58 @@
# Finds the title of a page.
#
# Mixed into a host object that must respond to +json_ld+, +metainspector+,
# +nokogiri+ and +log+ (all of them are called below).
module Title

  # Returns the title as a String with surrounding whitespace removed and
  # inner whitespace collapsed; '' when no strategy finds one.
  def title
    @title ||= find_title.to_s.strip.gsub(/\s+/, ' ')
  end

  protected

  # Strategies in priority order; each returns false when it finds nothing.
  def find_title
    find_title_with_json_ld ||
      find_title_with_metainspector ||
      find_title_with_nokogiri ||
      ''
  end

  # Returns the first non-blank JSON-LD headline, or false.
  def find_title_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      # JSON-LD scripts may contain bare strings, numbers or null.
      next unless ld.is_a?(Hash)
      headline = ld['headline']
      return headline unless blank_title?(headline)
    end
    false
  end

  # Returns the title reported by metainspector, or false.
  #
  # A tag such as <meta property="title" content="Run 0" /> makes
  # metainspector take it for the page title. Since the <title> normally
  # contains the best title (often with extra information about the site),
  # the best title is only trusted when it is contained in the <title>.
  def find_title_with_metainspector
    inspector = metainspector
    return false if inspector.nil?
    page_title = inspector.title
    best_title = inspector.best_title
    return false if blank_title?(page_title)
    return best_title if !blank_title?(best_title) && page_title.include?(best_title)
    page_title
  end

  # Returns the text of the first non-blank headline or <title> node, or
  # false when there is none (or when the lookup raises).
  def find_title_with_nokogiri
    document = nokogiri
    return false if document.nil?
    ['[itemprop="headline"]', 'title'].each do |selector|
      candidate = document.css(selector)&.first&.inner_text
      return candidate unless blank_title?(candidate)
    end
    # Explicit result: `each` would otherwise return the selector list.
    false
  rescue StandardError
    log 'Curation::Page find_title_with_nokogiri error'
    false
  end

  # True for nil, '' and whitespace-only values.
  def blank_title?(value)
    value.to_s.strip.empty?
  end

end
@@ -0,0 +1,26 @@
require 'json'

# Reads the JSON-LD metadata embedded in a page.
#
# Mixed into a host object that must respond to +nokogiri+ and +log+.
module Jsonld

  # Returns the JSON-LD objects of the page as an Array of non-empty Hashes.
  # The result (including an empty one) is computed once and memoized.
  def json_ld
    @json_ld = load_json_ld unless defined?(@json_ld)
    @json_ld
  end

  # Parses the content of one script node; returns {} when it is not valid
  # JSON.
  def json_ld_from_object(object)
    JSON.parse(object.inner_text)
  rescue StandardError
    {}
  end

  private

  # Collects every application/ld+json node, flattens nested arrays (some
  # sites have tables in tables) and keeps only usable objects: JSON strings,
  # numbers, null and the {} produced by unparsable scripts are dropped so
  # callers can treat every entry as a Hash.
  def load_json_ld
    nokogiri.css('[type="application/ld+json"]')
            .map { |script| json_ld_from_object(script) }
            .flatten
            .select { |entry| entry.is_a?(Hash) && !entry.empty? }
  rescue StandardError
    log 'Curation::Page json_ld error'
    []
  end
end
@@ -0,0 +1,18 @@
# Gives access to the MetaInspector view of a page.
#
# Mixed into a host object that must respond to +url+, +html+ and +log+.
module Metainspector

  # Returns the MetaInspector instance for the page, or nil when it cannot be
  # built. The page HTML is handed over when available; otherwise
  # MetaInspector is given the URL only. The outcome, including a failure,
  # is memoized so a broken page is not retried on every call.
  def metainspector
    return @metainspector if defined?(@metainspector)
    @metainspector = begin
      document = html
      document.nil? ? MetaInspector.new(url) : MetaInspector.new(url, document: document)
    rescue StandardError
      log 'Curation::Page metainspector error'
      nil
    end
  end

  # Returns the "name" meta tags as reported by MetaInspector; an empty Hash
  # when they are unavailable, so lookups simply yield nil.
  def metatags
    @metatags ||= metainspector.meta_tag['name'] || {}
  rescue StandardError
    log 'Curation::Page metatags error'
    {}
  end
end
@@ -0,0 +1,17 @@
# Gives access to the parsed HTML document of a page.
#
# Mixed into a host object that must respond to +file+, +metainspector+ and
# +log+. Reads the host's @html and @file instance variables.
module Nokogiri

  # Returns the parsed document, or nil when it cannot be obtained.
  #
  # Sources, in order:
  # 1. HTML already held in @html while no file has been opened. Nothing has
  #    been downloaded at that point, so the HTML is parsed directly instead
  #    of fetching the URL.
  # 2. The downloaded file (rewound before and after parsing so other readers
  #    can use it).
  # 3. The document parsed by metainspector when no file is available.
  def nokogiri
    @nokogiri ||= begin
      if @html && !@file
        ::Nokogiri::HTML(@html)
      elsif file.nil?
        metainspector.parsed
      else
        file.rewind
        document = ::Nokogiri::HTML(file)
        file.rewind
        document
      end
    end
  rescue StandardError
    log 'Curation::Page nokogiri error'
    nil
  end
end
@@ -0,0 +1,19 @@
# Downloads the raw content of a page.
#
# Mixed into a host object that must respond to +url+ and +log+.
module Raw

  # Returns the IO opened on the page URL, or nil when it cannot be opened.
  # The outcome, including a failure, is memoized: callers consult this
  # method repeatedly, and an unreachable URL must not be fetched (and logged)
  # again on every call.
  def file
    return @file if defined?(@file)
    @file = begin
      URI.open(url, 'User-Agent' => 'Mozilla/5.0')
    rescue StandardError
      log "Curation::Page file error with url #{url}"
      nil
    end
  end

  # Returns the page source as a String, or nil when it is unavailable.
  # The file is rewound before and after reading so other readers can use it.
  def html
    return @html if @html
    source = file
    if source.nil?
      log 'Curation::Page html error'
      return nil
    end
    source.rewind
    @html = source.read
    source.rewind
    @html
  rescue StandardError
    log 'Curation::Page html error'
    nil
  end
end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.11"
2
+ VERSION = "2.0.1"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,4 +1,12 @@
1
1
  require "curation/version"
2
+ require "curation/tools/raw"
3
+ require "curation/tools/nokogiri"
4
+ require "curation/tools/jsonld"
5
+ require "curation/tools/metainspector"
6
+ require "curation/finders/image"
7
+ require "curation/finders/publication_date"
8
+ require "curation/finders/text"
9
+ require "curation/finders/title"
2
10
  require "metainspector"
3
11
  require "open-uri"
4
12
  require "htmlentities"
@@ -7,274 +15,29 @@ module Curation
7
15
  class Error < StandardError; end
8
16
 
9
17
  class Page
18
+ # Tools
19
+ include Raw
20
+ include Jsonld
21
+ include Metainspector
22
+ include Nokogiri
23
+
24
+ # Finders
25
+ include Title
26
+ include Image
27
+ include PublicationDate
28
+ include Text
29
+
10
30
  attr_reader :url
11
31
  attr_accessor :verbose
12
32
 
13
- BLACKLIST = [
14
- 'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
15
- '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
16
- '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
17
- '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
18
- '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
19
- '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
20
- '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
21
- '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
22
- ]
23
-
24
33
  def initialize(url, html = nil)
25
34
  @url = url.to_s.gsub('http://', 'https://')
26
35
  @html = html
27
36
  @verbose = false
28
37
  end
29
38
 
30
- def title
31
- @title ||= find_title
32
- end
33
-
34
- def image
35
- unless @image
36
- @image = find_image
37
- @image = @image.to_s.gsub('http://', 'https://')
38
- end
39
- @image
40
- end
41
-
42
- def text
43
- # require 'byebug'; byebug
44
- @text ||= find_text
45
- end
46
-
47
- def date
48
- @date ||= find_date
49
- end
50
-
51
39
  protected
52
40
 
53
- def find_title
54
- if json_ld.any?
55
- json_ld.each do |ld|
56
- # require 'byebug'; byebug
57
- ld = ld.first if ld.is_a?(Array)
58
- return ld['headline'] if ld.has_key? 'headline'
59
- end
60
- end
61
- begin
62
- [
63
- metainspector.best_title,
64
- metainspector.title,
65
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
66
- nokogiri.css('title')&.first&.inner_text
67
- ].each do |possibility|
68
- return possibility unless possibility.to_s.empty?
69
- end
70
- rescue
71
- log 'Curation::Page find_title error'
72
- end
73
- return ''
74
- end
75
-
76
- def find_image
77
- log "Curation::Page find_image #{url}"
78
- if json_ld.any?
79
- json_ld.each do |ld|
80
- ld = ld.first if ld.is_a?(Array)
81
- if ld.has_key? 'image'
82
- image_data = ld['image']
83
- if image_data.is_a? String
84
- log "Curation::Page find_image json_ld string"
85
- return image_data
86
- end
87
- if image_data.is_a? Array
88
- first = image_data.first
89
- if first.is_a? String
90
- log "Curation::Page find_image json_ld array"
91
- return first
92
- end
93
- if first.is_a? Hash
94
- log "Curation::Page find_image json_ld array url"
95
- return first['url']
96
- end
97
- end
98
- if image_data.is_a? Hash
99
- log "Curation::Page find_image json_ld url"
100
- return image_data['url']
101
- end
102
- end
103
- end
104
- end
105
- begin
106
- [
107
- metainspector.images.best,
108
- nokogiri.css('[property="og:image"]').first&.attributes['content'].value
109
- ].each do |possibility|
110
- return possibility unless possibility.to_s.empty?
111
- end
112
- rescue
113
- puts 'Curation::Page find_image error'
114
- end
115
- return ''
116
- end
117
-
118
- def find_text
119
- text = find_text_with_json_ld || find_text_with_nokogiri
120
- text.to_s.gsub!('<br><br>', '<br>')
121
- # require 'byebug'; byebug
122
- text = clean_encoding text
123
- text
124
- end
125
-
126
- def find_text_with_json_ld
127
- if json_ld.any?
128
- json_ld.each do |ld|
129
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
130
- return ld['text'] if ld.has_key? 'text'
131
- return ld['articleBody'] if ld.has_key? 'articleBody'
132
- end
133
- end
134
- nil
135
- end
136
-
137
- def find_text_with_nokogiri
138
- h = nokogiri.dup
139
- BLACKLIST.each do |tag|
140
- h.css(tag).remove
141
- end
142
- nodes = h.css('p')
143
- nodes.xpath('//style').remove
144
- text = nodes.to_html
145
- text
146
- end
147
-
148
- def find_date
149
- if json_ld.any?
150
- json_ld.each do |ld|
151
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
152
- return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
153
- end
154
- end
155
- return Date.parse metatags['date'] rescue nil
156
- return Date.parse metatags['pubdate'] rescue nil
157
- return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
158
- return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
159
- return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
160
- chunks = html.split('DisplayDate')
161
- if chunks.count > 1
162
- value = chunks[1]
163
- value = value.split(',').first
164
- value = value.gsub('"', '')
165
- value = value[1..-1] if value[0] == ':'
166
- return Date.parse value rescue nil
167
- end
168
- begin
169
- value = nokogiri.css('.postDate').first
170
- value = value.inner_text
171
- value = value.gsub(' — ', '')
172
- return Date.parse value
173
- rescue
174
- end
175
- begin
176
- value = nokogiri.css('.gta_post_date').first
177
- value = value.inner_text
178
- return Date.parse value
179
- rescue
180
- end
181
- end
182
-
183
- private
184
-
185
- def json_ld
186
- unless defined?(@json_ld)
187
- @json_ld = []
188
- begin
189
- options = nokogiri.css('[type="application/ld+json"]')
190
- options.each do |option|
191
- @json_ld << json_ld_from_object(option)
192
- end
193
- # Some sites have tables in tables
194
- @json_ld.flatten!
195
- # require 'byebug'; byebug
196
- rescue
197
- log 'Curation::Page json_ld error'
198
- end
199
- end
200
- @json_ld
201
- end
202
-
203
- def json_ld_from_object(object)
204
- JSON.parse object.inner_text
205
- rescue
206
- {}
207
- end
208
-
209
- def file
210
- @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
211
- rescue
212
- log "Curation::Page file error with url #{url}"
213
- end
214
-
215
- def html
216
- unless @html
217
- file.rewind
218
- @html = file.read
219
- file.rewind
220
- end
221
- @html
222
- rescue
223
- log "Curation::Page html error"
224
- end
225
-
226
- def nokogiri
227
- unless @nokogiri
228
- if file.nil?
229
- @nokogiri = metainspector.parsed
230
- else
231
- file.rewind
232
- @nokogiri = Nokogiri::HTML file
233
- file.rewind
234
- end
235
- end
236
- @nokogiri
237
- rescue
238
- log 'Curation::Page nokogiri error'
239
- end
240
-
241
- def metainspector
242
- unless @metainspector
243
- @metainspector = html.nil? ? MetaInspector.new(url)
244
- : MetaInspector.new(url, document: html)
245
- end
246
- @metainspector
247
- rescue
248
- log 'Curation::Page metainspector error'
249
- end
250
-
251
- def metatags
252
- @metatags ||= metainspector.meta_tag['name']
253
- rescue
254
- log 'Curation::Page metatags error'
255
- end
256
-
257
- # r&Atilde;&copy;forme -> réforme
258
- def clean_encoding(text)
259
- clean_text = HTMLEntities.new.decode text
260
- double_encoding = false
261
- [
262
- 'é', # é
263
- 'è', # è
264
- 'î', # î
265
- 'ê', # ê
266
- ].each do |string|
267
- # require 'byebug'; byebug
268
- double_encoding = true if clean_text.include? string
269
- end
270
- if double_encoding
271
- clean_text.encode('iso-8859-1', undef: :replace)
272
- .force_encoding('utf-8')
273
- else
274
- text
275
- end
276
- end
277
-
278
41
  def log(message)
279
42
  puts message if verbose
280
43
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.11'
4
+ version: 2.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-08-29 00:00:00.000000000 Z
11
+ date: 2023-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
@@ -72,13 +72,21 @@ files:
72
72
  - bin/setup
73
73
  - curation.gemspec
74
74
  - lib/curation.rb
75
+ - lib/curation/finders/image.rb
76
+ - lib/curation/finders/publication_date.rb
77
+ - lib/curation/finders/text.rb
78
+ - lib/curation/finders/title.rb
79
+ - lib/curation/tools/jsonld.rb
80
+ - lib/curation/tools/metainspector.rb
81
+ - lib/curation/tools/nokogiri.rb
82
+ - lib/curation/tools/raw.rb
75
83
  - lib/curation/version.rb
76
- homepage: https://github.com/arnaudlevy/curation
84
+ homepage: https://github.com/noesya/curation
77
85
  licenses:
78
86
  - MIT
79
87
  metadata:
80
- homepage_uri: https://github.com/arnaudlevy/curation
81
- source_code_uri: https://github.com/arnaudlevy/curation
88
+ homepage_uri: https://github.com/noesya/curation
89
+ source_code_uri: https://github.com/noesya/curation
82
90
  post_install_message:
83
91
  rdoc_options: []
84
92
  require_paths:
@@ -94,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
102
  - !ruby/object:Gem::Version
95
103
  version: '0'
96
104
  requirements: []
97
- rubygems_version: 3.4.6
105
+ rubygems_version: 3.4.10
98
106
  signing_key:
99
107
  specification_version: 4
100
108
  summary: Curation of content