curation 1.10 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a23e967e5d017ce61f9719647f45c20e1336aff067bc2323f64641dfac695f75
4
- data.tar.gz: 49427d9325a27034c1969d71875dcd4eacff4f5c3ac9625ccc6cdb554c4c2df4
3
+ metadata.gz: 8bfaa5cfe719f5a5f5c9f153a358e74f11eb4217f4bbddca880b8f45829fc913
4
+ data.tar.gz: 4abb219144b696da0f33ab12b251036bd67c4ca7250a4576febdcaaa8e409dc0
5
5
  SHA512:
6
- metadata.gz: b01f4209b09d6ec09917096159b98be71b31fd0952524a12c7310ba29f88ee8c888bc2927015ef2b1fb529cfe46964c08c743c818807518dee6bfa3cc32f6767
7
- data.tar.gz: 564d14e3afaa17f00ac7b034917c7b28612b54b6f71126a36de4d6d43def7b8c19814475ee5b13651bbea7e7eed752b873f918af10baf169bd153d66b8d05c7d
6
+ metadata.gz: 6d19e2a775ea069a3e6b0dc50b66f30d52f0c4e1e101cea2c9f0dde5cedde611043cf4267a4764994be8105e422919bf708f850935d531d9bb8f907025393a05
7
+ data.tar.gz: 4f0dd67b073cc2003cf798603f7221ba32977645180b671bcde0b1464fc53085cf9a076c2f30f4ce3775a8524246386d576e1bd0d1021c8388db58d13c16068f
data/Gemfile.lock CHANGED
@@ -1,88 +1,79 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.10)
4
+ curation (2.0)
5
5
  htmlentities
6
- metainspector (~> 5.12)
6
+ metainspector
7
7
  nokogiri
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.8.0)
13
- public_suffix (>= 2.0.2, < 5.0)
12
+ addressable (2.8.5)
13
+ public_suffix (>= 2.0.2, < 6.0)
14
14
  ansi (1.5.0)
15
+ base64 (0.2.0)
15
16
  builder (3.2.4)
16
17
  byebug (11.1.3)
17
18
  domain_name (0.5.20190701)
18
19
  unf (>= 0.0.5, < 1.0.0)
19
- faraday (1.10.0)
20
- faraday-em_http (~> 1.0)
21
- faraday-em_synchrony (~> 1.0)
22
- faraday-excon (~> 1.1)
23
- faraday-httpclient (~> 1.0)
24
- faraday-multipart (~> 1.0)
25
- faraday-net_http (~> 1.0)
26
- faraday-net_http_persistent (~> 1.0)
27
- faraday-patron (~> 1.0)
28
- faraday-rack (~> 1.0)
29
- faraday-retry (~> 1.0)
20
+ faraday (2.7.11)
21
+ base64
22
+ faraday-net_http (>= 2.0, < 3.1)
30
23
  ruby2_keywords (>= 0.0.4)
31
24
  faraday-cookie_jar (0.0.7)
32
25
  faraday (>= 0.8.0)
33
26
  http-cookie (~> 1.0.0)
34
- faraday-em_http (1.0.0)
35
- faraday-em_synchrony (1.0.0)
36
27
  faraday-encoding (0.0.5)
37
28
  faraday
38
- faraday-excon (1.1.0)
39
- faraday-http-cache (2.4.0)
29
+ faraday-follow_redirects (0.3.0)
30
+ faraday (>= 1, < 3)
31
+ faraday-gzip (1.0.0)
32
+ faraday (>= 1.0)
33
+ zlib (~> 2.1)
34
+ faraday-http-cache (2.5.0)
40
35
  faraday (>= 0.8)
41
- faraday-httpclient (1.0.1)
42
- faraday-multipart (1.0.4)
43
- multipart-post (~> 2)
44
- faraday-net_http (1.0.1)
45
- faraday-net_http_persistent (1.2.0)
46
- faraday-patron (1.0.0)
47
- faraday-rack (1.0.0)
48
- faraday-retry (1.0.3)
49
- faraday_middleware (1.2.0)
50
- faraday (~> 1.0)
51
- fastimage (2.2.6)
36
+ faraday-net_http (3.0.2)
37
+ faraday-retry (2.2.0)
38
+ faraday (~> 2.0)
39
+ fastimage (2.2.7)
52
40
  htmlentities (4.3.4)
53
41
  http-cookie (1.0.5)
54
42
  domain_name (~> 0.5)
55
- metainspector (5.12.1)
56
- addressable (~> 2.7)
57
- faraday (>= 1.4, < 3.0)
43
+ metainspector (5.15.0)
44
+ addressable (~> 2.8.4)
45
+ faraday (~> 2.5)
58
46
  faraday-cookie_jar (~> 0.0)
59
47
  faraday-encoding (~> 0.0)
60
- faraday-http-cache (~> 2.2)
61
- faraday_middleware (~> 1.0)
48
+ faraday-follow_redirects (~> 0.3)
49
+ faraday-gzip (>= 0.1, < 2.0)
50
+ faraday-http-cache (~> 2.5)
51
+ faraday-retry (~> 2.0)
62
52
  fastimage (~> 2.2)
63
53
  nesty (~> 1.0)
64
- nokogiri (~> 1.11)
65
- minitest (5.15.0)
66
- minitest-reporters (1.5.0)
54
+ nokogiri (~> 1.13)
55
+ minitest (5.20.0)
56
+ minitest-reporters (1.6.1)
67
57
  ansi
68
58
  builder
69
59
  minitest (>= 5.0)
70
60
  ruby-progressbar
71
- multipart-post (2.2.0)
72
61
  nesty (1.0.2)
73
- nokogiri (1.13.6-x86_64-darwin)
62
+ nokogiri (1.15.4-x86_64-darwin)
74
63
  racc (~> 1.4)
75
- public_suffix (4.0.7)
76
- racc (1.6.0)
64
+ public_suffix (5.0.3)
65
+ racc (1.7.3)
77
66
  rake (12.3.3)
78
- ruby-progressbar (1.11.0)
67
+ ruby-progressbar (1.13.0)
79
68
  ruby2_keywords (0.0.5)
80
69
  unf (0.1.4)
81
70
  unf_ext
82
- unf_ext (0.0.8.2)
71
+ unf_ext (0.0.9)
72
+ zlib (2.1.1)
83
73
 
84
74
  PLATFORMS
85
75
  x86_64-darwin-21
76
+ x86_64-darwin-22
86
77
 
87
78
  DEPENDENCIES
88
79
  byebug
@@ -92,4 +83,4 @@ DEPENDENCIES
92
83
  rake (~> 12.0)
93
84
 
94
85
  BUNDLED WITH
95
- 2.3.12
86
+ 2.4.6
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2020 Arnaud Levy
3
+ Copyright (c) 2020 noesya
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -37,7 +37,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
37
37
 
38
38
  ## Contributing
39
39
 
40
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/curation/blob/master/CODE_OF_CONDUCT.md).
40
+ Bug reports and pull requests are welcome on GitHub at https://github.com/noesya/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/noesya/curation/blob/master/CODE_OF_CONDUCT.md).
41
41
 
42
42
 
43
43
  ## License
data/curation.gemspec CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
 
9
9
  spec.summary = 'Curation of content'
10
10
  spec.description = %q{When you build content curation tools, you need to extract the content of pages (title, text, image...). This requires different strategies and some fine tuning to work efficiently.}
11
- spec.homepage = "https://github.com/arnaudlevy/curation"
11
+ spec.homepage = "https://github.com/noesya/curation"
12
12
  spec.license = "MIT"
13
13
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
14
 
15
15
  spec.metadata["homepage_uri"] = spec.homepage
16
- spec.metadata["source_code_uri"] = "https://github.com/arnaudlevy/curation"
16
+ spec.metadata["source_code_uri"] = "https://github.com/noesya/curation"
17
17
 
18
18
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
19
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.bindir = "exe"
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
- spec.add_dependency "metainspector", '~> 5.12'
24
+ spec.add_dependency "metainspector"
25
25
  spec.add_dependency "nokogiri"
26
26
  spec.add_dependency "htmlentities"
27
27
  end
@@ -0,0 +1,51 @@
1
# Finds the main illustration of a page.
#
# Mixin for a page object that provides +json_ld+, +metainspector+,
# +nokogiri+, +url+ and +log+.
module Image

  # Returns the image URL found for the page, with every 'http://' rewritten
  # to 'https://'. The result is memoized; it is '' when nothing was found.
  def image
    @image ||= find_image.to_s.gsub('http://', 'https://')
  end

  protected

  # Tries JSON-LD first, then the metadata fallbacks.
  # Returns the first non-empty candidate, or '' when there is none.
  def find_image
    log "Curation::Page find_image #{url}"
    find_image_with_json_ld || find_image_with_metadata || ''
  end

  # Scans the JSON-LD entries for an 'image' key.
  # Entries that are not hashes, or whose image is empty, are skipped so that
  # a later entry (or the metadata fallbacks) can still provide a value.
  def find_image_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      next unless ld.is_a?(Hash) && ld.key?('image')
      found = image_from_json_ld(ld['image'])
      return found unless found.to_s.empty?
    end
    nil
  end

  # Extracts a URL from a JSON-LD 'image' value, which may be a String,
  # a Hash with a 'url' key, or an Array whose first item is one of those.
  # Returns nil for any other shape.
  def image_from_json_ld(image_data)
    case image_data
    when String
      log "Curation::Page find_image json_ld string"
      image_data
    when Array
      first = image_data.first
      if first.is_a?(String)
        log "Curation::Page find_image json_ld array"
        first
      elsif first.is_a?(Hash)
        log "Curation::Page find_image json_ld array url"
        first['url']
      end
    when Hash
      log "Curation::Page find_image json_ld url"
      image_data['url']
    end
  end

  # Falls back on metainspector's best image, then on the og:image tag.
  # Candidates are evaluated lazily and rescued one by one, so a failure of
  # one strategy (for instance a page without og:image) cannot discard the
  # result of another.
  def find_image_with_metadata
    candidates = [
      -> { metainspector.images.best },
      -> { nokogiri.css('[property="og:image"]').first&.[]('content') }
    ]
    candidates.each do |candidate|
      begin
        possibility = candidate.call
        return possibility unless possibility.to_s.empty?
      rescue StandardError
        log 'Curation::Page find_image error'
      end
    end
    nil
  end

end
@@ -0,0 +1,43 @@
1
# Finds the publication date of a page.
#
# Mixin for a page object that provides +json_ld+, +metatags+, +nokogiri+
# and +html+.
module PublicationDate

  # JSON-LD '@type' values whose 'datePublished' is trusted.
  JSON_LD_DATED_TYPES = ['NewsArticle', 'ReportageNewsArticle'].freeze
  # Keys looked up in +metatags+, in priority order.
  DATE_META_NAMES = ['date', 'pubdate'].freeze
  # <meta property="..."> values looked up in the document, in priority order.
  DATE_META_PROPERTIES = [
    'article:published',
    'article:published_time',
    'og:article:published_time'
  ].freeze

  # Returns the publication date as a Date, or nil when none could be found.
  # A found date is memoized.
  def date
    @date ||= find_date
  end

  protected

  # Runs every strategy in priority order and returns the first Date found.
  # Each strategy is best-effort: a failure yields nil and the next one runs.
  def find_date
    find_date_with_json_ld ||
      find_date_with_metatags ||
      find_date_with_meta_properties ||
      find_date_with_display_date ||
      find_date_with_css('.postDate') { |text| text.gsub(' — ', '') } ||
      find_date_with_css('.gta_post_date')
  end

  # Uses 'datePublished' of the first article-like JSON-LD entry that parses.
  # Non-hash entries and unparsable values are skipped instead of raising.
  def find_date_with_json_ld
    json_ld.each do |ld|
      next unless ld.is_a?(Hash) && JSON_LD_DATED_TYPES.include?(ld['@type'])
      next unless ld.key?('datePublished')
      parsed = attempt_date { ld['datePublished'] }
      return parsed if parsed
    end
    nil
  end

  # Looks for a date in the named meta tags.
  def find_date_with_metatags
    DATE_META_NAMES.each do |name|
      parsed = attempt_date { metatags[name] }
      return parsed if parsed
    end
    nil
  end

  # Looks for a date in the 'content' of <meta property="..."> elements.
  def find_date_with_meta_properties
    DATE_META_PROPERTIES.each do |property|
      parsed = attempt_date { nokogiri.css(%(meta[property="#{property}"])).first['content'] }
      return parsed if parsed
    end
    nil
  end

  # Looks in the raw HTML for the text following the first 'DisplayDate'
  # marker, up to the next comma, with double quotes and a leading colon removed.
  def find_date_with_display_date
    attempt_date do
      chunks = html.split('DisplayDate')
      next nil unless chunks.count > 1
      value = chunks[1].split(',').first.gsub('"', '')
      value = value[1..-1] if value[0] == ':'
      value
    end
  end

  # Parses the inner text of the first node matching +selector+.
  # An optional block can clean the text up before it is parsed.
  def find_date_with_css(selector, &cleanup)
    attempt_date do
      text = nokogiri.css(selector).first.inner_text
      cleanup ? cleanup.call(text) : text
    end
  end

  # Yields to obtain a candidate and parses it with Date.parse.
  # Returns nil when the candidate is not a String, cannot be parsed, or when
  # obtaining it raised (missing node, nil document, ...).
  def attempt_date
    value = yield
    value.is_a?(String) ? Date.parse(value) : nil
  rescue StandardError
    nil
  end
end
@@ -0,0 +1,78 @@
1
# Finds the main text of a page.
#
# Mixin for a page object that provides +json_ld+ and +nokogiri+.
module Text

  # CSS selectors of the nodes removed before paragraphs are collected.
  BLACKLIST = [
    'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
    '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
    '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
    '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
    '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
    '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
    '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
    '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
  ]

  # Substrings that appear when UTF-8 text has been decoded as Latin-1.
  DOUBLE_ENCODING_MARKERS = [
    "\u00C3\u00A9", # é
    "\u00C3\u00A8", # è
    "\u00C3\u00AE", # î
    "\u00C3\u00AA"  # ê
  ].freeze

  # Returns the text of the page, memoized.
  def text
    @text ||= find_text
  end

  protected

  # Takes the text from JSON-LD when available, otherwise from the document,
  # collapses each '<br><br>' into '<br>' and repairs double encoding.
  def find_text
    raw = find_text_with_json_ld || find_text_with_nokogiri
    # gsub (not gsub! on a copy) so that the replacement is actually kept.
    clean_encoding raw.to_s.gsub('<br><br>', '<br>')
  end

  # Returns 'text' (or else 'articleBody') of the first article-like JSON-LD
  # entry, or nil. Entries that are not hashes are skipped.
  def find_text_with_json_ld
    json_ld.each do |ld|
      next unless ld.is_a?(Hash)
      next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
      return ld['text'] if ld.key? 'text'
      return ld['articleBody'] if ld.key? 'articleBody'
    end
    nil
  end

  # Returns the HTML of the <p> nodes left once the blacklisted nodes are
  # removed from a copy of the document.
  def find_text_with_nokogiri
    h = nokogiri.dup
    h.xpath('//style').remove
    BLACKLIST.each do |tag|
      h.css(tag).remove
    end
    nodes = h.css('p')
    return nodes.to_html if nodes.any?
    # Cleanup was too hard, let's try softer: plain text of the untouched document.
    nokogiri.dup.text
  end

  # r&Atilde;&copy;forme -> réforme
  #
  # Decodes HTML entities and, when the result shows the marks of UTF-8 read
  # as Latin-1, re-encodes it. Otherwise the given text is returned untouched
  # (entities are not decoded in that case).
  def clean_encoding(text)
    clean_text = HTMLEntities.new.decode text
    double_encoding = DOUBLE_ENCODING_MARKERS.any? { |marker| clean_text.include? marker }
    return text unless double_encoding
    clean_text.encode('iso-8859-1', undef: :replace)
              .force_encoding('utf-8')
  end

end
@@ -0,0 +1,57 @@
1
# Finds the title of a page.
#
# Mixin for a page object that provides +json_ld+, +metainspector+,
# +nokogiri+ and +log+.
module Title

  # Returns the title with surrounding whitespace stripped and every run of
  # whitespace squeezed into one space. Memoized; '' when nothing was found.
  def title
    @title ||= find_title.to_s.strip.gsub(/\s+/, ' ')
  end

  protected

  # Tries JSON-LD, then metainspector, then the document itself.
  def find_title
    find_title_with_json_ld ||
      find_title_with_metainspector ||
      find_title_with_nokogiri ||
      ''
  end

  # Returns the first non-blank JSON-LD 'headline', or nil.
  # Entries that are not hashes are skipped.
  def find_title_with_json_ld
    json_ld.each do |ld|
      ld = ld.first if ld.is_a?(Array)
      next unless ld.is_a?(Hash)
      headline = ld['headline']
      return headline unless blank_title?(headline)
    end
    nil
  end

  # Returns metainspector's best title when it is confirmed by the <title>,
  # otherwise the <title> itself, or nil when there is no usable <title>.
  #
  # A tag such as <meta property="title" content="Run 0" /> makes metainspector
  # take it for the page title. Since the <title> usually contains the best
  # title (often with extra information about the site), the best title is
  # only trusted when it is included in the <title>.
  def find_title_with_metainspector
    inspector = metainspector
    return nil if inspector.nil?
    best_title = inspector.best_title
    page_title = inspector.title
    return nil if blank_title?(page_title)
    return best_title if !blank_title?(best_title) && page_title.include?(best_title)
    page_title
  end

  # Returns the text of the first [itemprop="headline"] node, else of the
  # first <title> node, or nil when both are missing or empty.
  def find_title_with_nokogiri
    ['[itemprop="headline"]', 'title'].each do |selector|
      possibility = nokogiri.css(selector)&.first&.inner_text
      return possibility unless possibility.to_s.empty?
    end
    nil
  rescue StandardError
    log 'Curation::Page find_title_with_nokogiri error'
    nil
  end

  # True when the value is nil, empty or made of whitespace only.
  # Plain-Ruby replacement for ActiveSupport's present?/blank?.
  def blank_title?(value)
    value.to_s.strip.empty?
  end

end
@@ -0,0 +1,26 @@
1
# Extracts the JSON-LD data embedded in a page.
#
# Mixin for a page object that provides +nokogiri+ and +log+.
module Jsonld

  # Returns the flattened list of JSON-LD payloads found in the nodes matching
  # [type="application/ld+json"]. Computed once; a failure while collecting is
  # logged and whatever was gathered so far is returned.
  def json_ld
    return @json_ld if defined?(@json_ld)
    @json_ld = []
    begin
      nokogiri.css('[type="application/ld+json"]').each do |script|
        @json_ld << json_ld_from_object(script)
      end
      # Some sites nest arrays inside arrays
      @json_ld.flatten!
    rescue
      log 'Curation::Page json_ld error'
    end
    @json_ld
  end

  # Parses the inner text of a node as JSON; returns {} when that fails.
  def json_ld_from_object(object)
    begin
      JSON.parse(object.inner_text)
    rescue
      {}
    end
  end
end
@@ -0,0 +1,18 @@
1
# Gives access to the MetaInspector view of a page.
#
# Mixin for a page object that provides +url+, +html+ and +log+.
module Metainspector

  # Returns the memoized MetaInspector instance, built from the url alone when
  # +html+ is nil and from the url plus that document otherwise.
  # On error, logs and returns the result of +log+.
  def metainspector
    @metainspector ||= if html.nil?
                         MetaInspector.new(url)
                       else
                         MetaInspector.new(url, document: html)
                       end
  rescue
    log 'Curation::Page metainspector error'
  end

  # Returns the memoized 'name' entry of metainspector's meta_tag.
  # On error, logs and returns the result of +log+.
  def metatags
    return @metatags if @metatags
    @metatags = metainspector.meta_tag['name']
  rescue
    log 'Curation::Page metatags error'
  end
end
@@ -0,0 +1,17 @@
1
# Gives access to the parsed document of a page.
#
# Mixin for a page object that provides +file+, +metainspector+ and +log+.
module Nokogiri

  # Returns the memoized parsed document: metainspector's parsed document when
  # +file+ is nil, otherwise the result of Nokogiri::HTML on the file, which
  # is rewound before and after being read.
  # On error, logs and returns the result of +log+.
  def nokogiri
    return @nokogiri if @nokogiri
    io = file
    if io.nil?
      @nokogiri = metainspector.parsed
    else
      io.rewind
      @nokogiri = Nokogiri::HTML(io)
      io.rewind
    end
    @nokogiri
  rescue
    log 'Curation::Page nokogiri error'
  end
end
@@ -0,0 +1,19 @@
1
# Gives access to the raw content of a page.
#
# Mixin for a page object that provides +url+ and +log+, and that may already
# hold the document in @html.
module Raw

  # Returns the memoized handle obtained by opening the url with a
  # "Mozilla/5.0" User-Agent. On error, logs and returns the result of +log+.
  def file
    return @file if @file
    @file = URI.open(url, 'User-Agent' => "Mozilla/5.0")
  rescue
    log "Curation::Page file error with url #{url}"
  end

  # Returns the document as a string: the one already held in @html, else the
  # content read from +file+ (rewound before and after reading).
  # On error, logs and returns the result of +log+.
  def html
    return @html if @html
    stream = file
    stream.rewind
    @html = stream.read
    stream.rewind
    @html
  rescue
    log "Curation::Page html error"
  end
end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.10"
2
+ VERSION = "2.0"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,4 +1,12 @@
1
1
  require "curation/version"
2
+ require "curation/tools/raw"
3
+ require "curation/tools/nokogiri"
4
+ require "curation/tools/jsonld"
5
+ require "curation/tools/metainspector"
6
+ require "curation/finders/image"
7
+ require "curation/finders/publication_date"
8
+ require "curation/finders/text"
9
+ require "curation/finders/title"
2
10
  require "metainspector"
3
11
  require "open-uri"
4
12
  require "htmlentities"
@@ -7,257 +15,31 @@ module Curation
7
15
  class Error < StandardError; end
8
16
 
9
17
  class Page
10
- attr_reader :url
18
+ # Tools
19
+ include Raw
20
+ include Jsonld
21
+ include Metainspector
22
+ include Nokogiri
23
+
24
+ # Finders
25
+ include Title
26
+ include Image
27
+ include PublicationDate
28
+ include Text
11
29
 
12
- BLACKLIST = [
13
- 'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
14
- '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
15
- '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
16
- '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
17
- '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
18
- '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
19
- '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
20
- '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
21
- ]
30
+ attr_reader :url
31
+ attr_accessor :verbose
22
32
 
23
33
  def initialize(url, html = nil)
24
34
  @url = url.to_s.gsub('http://', 'https://')
25
35
  @html = html
26
- end
27
-
28
- def title
29
- @title ||= find_title
30
- end
31
-
32
- def image
33
- unless @image
34
- @image = find_image
35
- @image = @image.to_s.gsub('http://', 'https://')
36
- end
37
- @image
38
- end
39
-
40
- def text
41
- # require 'byebug'; byebug
42
- @text ||= find_text
43
- end
44
-
45
- def date
46
- @date ||= find_date
36
+ @verbose = false
47
37
  end
48
38
 
49
39
  protected
50
40
 
51
- def find_title
52
- if json_ld.any?
53
- json_ld.each do |ld|
54
- # require 'byebug'; byebug
55
- ld = ld.first if ld.is_a?(Array)
56
- return ld['headline'] if ld.has_key? 'headline'
57
- end
58
- end
59
- begin
60
- [
61
- metainspector.best_title,
62
- metainspector.title,
63
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
64
- nokogiri.css('title')&.first&.inner_text
65
- ].each do |possibility|
66
- return possibility unless possibility.to_s.empty?
67
- end
68
- rescue
69
- puts 'Curation::Page find_title error'
70
- end
71
- return ''
72
- end
73
-
74
- def find_image
75
- if json_ld.any?
76
- json_ld.each do |ld|
77
- ld = ld.first if ld.is_a?(Array)
78
- if ld.has_key? 'image'
79
- image_data = ld['image']
80
- return image_data if image_data.is_a? String
81
- if image_data.is_a? Array
82
- first = image_data.first
83
- return first if first.is_a? String
84
- return first['url'] if first.is_a? Hash
85
- end
86
- return image_data['url'] if image_data.is_a? Hash
87
- end
88
- end
89
- end
90
- begin
91
- [
92
- metainspector.images.best,
93
- nokogiri.css('[property="og:image"]').first&.attributes['content'].value
94
- ].each do |possibility|
95
- return possibility unless possibility.to_s.empty?
96
- end
97
- rescue
98
- puts 'Curation::Page find_image error'
99
- end
100
- return ''
101
- end
102
-
103
- def find_text
104
- text = find_text_with_json_ld || find_text_with_nokogiri
105
- text.to_s.gsub!('<br><br>', '<br>')
106
- # require 'byebug'; byebug
107
- text = clean_encoding text
108
- text
109
- end
110
-
111
- def find_text_with_json_ld
112
- if json_ld.any?
113
- json_ld.each do |ld|
114
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
115
- return ld['text'] if ld.has_key? 'text'
116
- return ld['articleBody'] if ld.has_key? 'articleBody'
117
- end
118
- end
119
- nil
120
- end
121
-
122
- def find_text_with_nokogiri
123
- h = nokogiri.dup
124
- BLACKLIST.each do |tag|
125
- h.css(tag).remove
126
- end
127
- nodes = h.css('p')
128
- nodes.xpath('//style').remove
129
- text = nodes.to_html
130
- text
131
- end
132
-
133
- def find_date
134
- if json_ld.any?
135
- json_ld.each do |ld|
136
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
137
- return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
138
- end
139
- end
140
- return Date.parse metatags['date'] rescue nil
141
- return Date.parse metatags['pubdate'] rescue nil
142
- return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
143
- return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
- return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
145
- chunks = html.split('DisplayDate')
146
- if chunks.count > 1
147
- value = chunks[1]
148
- value = value.split(',').first
149
- value = value.gsub('"', '')
150
- value = value[1..-1] if value[0] == ':'
151
- return Date.parse value rescue nil
152
- end
153
- begin
154
- value = nokogiri.css('.postDate').first
155
- value = value.inner_text
156
- value = value.gsub(' — ', '')
157
- return Date.parse value
158
- rescue
159
- end
160
- begin
161
- value = nokogiri.css('.gta_post_date').first
162
- value = value.inner_text
163
- return Date.parse value
164
- rescue
165
- end
166
- end
167
-
168
- private
169
-
170
- def json_ld
171
- unless defined?(@json_ld)
172
- @json_ld = []
173
- begin
174
- options = nokogiri.css('[type="application/ld+json"]')
175
- options.each do |option|
176
- @json_ld << json_ld_from_object(option)
177
- end
178
- # Some sites have tables in tables
179
- @json_ld.flatten!
180
- # require 'byebug'; byebug
181
- rescue
182
- puts 'Curation::Page json_ld error'
183
- end
184
- end
185
- @json_ld
186
- end
187
-
188
- def json_ld_from_object(object)
189
- JSON.parse object.inner_text
190
- rescue
191
- {}
192
- end
193
-
194
- def file
195
- @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
196
- rescue
197
- puts "Curation::Page file error with url #{url}"
198
- end
199
-
200
- def html
201
- unless @html
202
- file.rewind
203
- @html = file.read
204
- file.rewind
205
- end
206
- @html
207
- rescue
208
- puts "Curation::Page html error"
209
- end
210
-
211
- def nokogiri
212
- unless @nokogiri
213
- if file.nil?
214
- @nokogiri = metainspector.parsed
215
- else
216
- file.rewind
217
- @nokogiri = Nokogiri::HTML file
218
- file.rewind
219
- end
220
- end
221
- @nokogiri
222
- rescue
223
- puts 'Curation::Page nokogiri error'
224
- end
225
-
226
- def metainspector
227
- unless @metainspector
228
- @metainspector = html.nil? ? MetaInspector.new(url)
229
- : MetaInspector.new(url, document: html)
230
- end
231
- @metainspector
232
- rescue
233
- puts 'Curation::Page metainspector error'
234
- end
235
-
236
- def metatags
237
- @metatags ||= metainspector.meta_tag['name']
238
- rescue
239
- puts 'Curation::Page metatags error'
240
- end
241
-
242
- # r&Atilde;&copy;forme -> réforme
243
- def clean_encoding(text)
244
- clean_text = HTMLEntities.new.decode text
245
- double_encoding = false
246
- [
247
- 'é', # é
248
- 'è', # è
249
- 'î', # î
250
- 'ê', # ê
251
- ].each do |string|
252
- # require 'byebug'; byebug
253
- double_encoding = true if clean_text.include? string
254
- end
255
- if double_encoding
256
- clean_text.encode('iso-8859-1', undef: :replace)
257
- .force_encoding('utf-8')
258
- else
259
- text
260
- end
41
+ def log(message)
42
+ puts message if verbose
261
43
  end
262
44
  end
263
45
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.10'
4
+ version: '2.0'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-08 00:00:00.000000000 Z
11
+ date: 2023-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '5.12'
19
+ version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '5.12'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -72,13 +72,21 @@ files:
72
72
  - bin/setup
73
73
  - curation.gemspec
74
74
  - lib/curation.rb
75
+ - lib/curation/finders/image.rb
76
+ - lib/curation/finders/publication_date.rb
77
+ - lib/curation/finders/text.rb
78
+ - lib/curation/finders/title.rb
79
+ - lib/curation/tools/jsonld.rb
80
+ - lib/curation/tools/metainspector.rb
81
+ - lib/curation/tools/nokogiri.rb
82
+ - lib/curation/tools/raw.rb
75
83
  - lib/curation/version.rb
76
- homepage: https://github.com/arnaudlevy/curation
84
+ homepage: https://github.com/noesya/curation
77
85
  licenses:
78
86
  - MIT
79
87
  metadata:
80
- homepage_uri: https://github.com/arnaudlevy/curation
81
- source_code_uri: https://github.com/arnaudlevy/curation
88
+ homepage_uri: https://github.com/noesya/curation
89
+ source_code_uri: https://github.com/noesya/curation
82
90
  post_install_message:
83
91
  rdoc_options: []
84
92
  require_paths:
@@ -94,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
102
  - !ruby/object:Gem::Version
95
103
  version: '0'
96
104
  requirements: []
97
- rubygems_version: 3.1.6
105
+ rubygems_version: 3.4.10
98
106
  signing_key:
99
107
  specification_version: 4
100
108
  summary: Curation of content