curation 1.10 → 2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a23e967e5d017ce61f9719647f45c20e1336aff067bc2323f64641dfac695f75
4
- data.tar.gz: 49427d9325a27034c1969d71875dcd4eacff4f5c3ac9625ccc6cdb554c4c2df4
3
+ metadata.gz: 8bfaa5cfe719f5a5f5c9f153a358e74f11eb4217f4bbddca880b8f45829fc913
4
+ data.tar.gz: 4abb219144b696da0f33ab12b251036bd67c4ca7250a4576febdcaaa8e409dc0
5
5
  SHA512:
6
- metadata.gz: b01f4209b09d6ec09917096159b98be71b31fd0952524a12c7310ba29f88ee8c888bc2927015ef2b1fb529cfe46964c08c743c818807518dee6bfa3cc32f6767
7
- data.tar.gz: 564d14e3afaa17f00ac7b034917c7b28612b54b6f71126a36de4d6d43def7b8c19814475ee5b13651bbea7e7eed752b873f918af10baf169bd153d66b8d05c7d
6
+ metadata.gz: 6d19e2a775ea069a3e6b0dc50b66f30d52f0c4e1e101cea2c9f0dde5cedde611043cf4267a4764994be8105e422919bf708f850935d531d9bb8f907025393a05
7
+ data.tar.gz: 4f0dd67b073cc2003cf798603f7221ba32977645180b671bcde0b1464fc53085cf9a076c2f30f4ce3775a8524246386d576e1bd0d1021c8388db58d13c16068f
data/Gemfile.lock CHANGED
@@ -1,88 +1,79 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (1.10)
4
+ curation (2.0)
5
5
  htmlentities
6
- metainspector (~> 5.12)
6
+ metainspector
7
7
  nokogiri
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
11
11
  specs:
12
- addressable (2.8.0)
13
- public_suffix (>= 2.0.2, < 5.0)
12
+ addressable (2.8.5)
13
+ public_suffix (>= 2.0.2, < 6.0)
14
14
  ansi (1.5.0)
15
+ base64 (0.2.0)
15
16
  builder (3.2.4)
16
17
  byebug (11.1.3)
17
18
  domain_name (0.5.20190701)
18
19
  unf (>= 0.0.5, < 1.0.0)
19
- faraday (1.10.0)
20
- faraday-em_http (~> 1.0)
21
- faraday-em_synchrony (~> 1.0)
22
- faraday-excon (~> 1.1)
23
- faraday-httpclient (~> 1.0)
24
- faraday-multipart (~> 1.0)
25
- faraday-net_http (~> 1.0)
26
- faraday-net_http_persistent (~> 1.0)
27
- faraday-patron (~> 1.0)
28
- faraday-rack (~> 1.0)
29
- faraday-retry (~> 1.0)
20
+ faraday (2.7.11)
21
+ base64
22
+ faraday-net_http (>= 2.0, < 3.1)
30
23
  ruby2_keywords (>= 0.0.4)
31
24
  faraday-cookie_jar (0.0.7)
32
25
  faraday (>= 0.8.0)
33
26
  http-cookie (~> 1.0.0)
34
- faraday-em_http (1.0.0)
35
- faraday-em_synchrony (1.0.0)
36
27
  faraday-encoding (0.0.5)
37
28
  faraday
38
- faraday-excon (1.1.0)
39
- faraday-http-cache (2.4.0)
29
+ faraday-follow_redirects (0.3.0)
30
+ faraday (>= 1, < 3)
31
+ faraday-gzip (1.0.0)
32
+ faraday (>= 1.0)
33
+ zlib (~> 2.1)
34
+ faraday-http-cache (2.5.0)
40
35
  faraday (>= 0.8)
41
- faraday-httpclient (1.0.1)
42
- faraday-multipart (1.0.4)
43
- multipart-post (~> 2)
44
- faraday-net_http (1.0.1)
45
- faraday-net_http_persistent (1.2.0)
46
- faraday-patron (1.0.0)
47
- faraday-rack (1.0.0)
48
- faraday-retry (1.0.3)
49
- faraday_middleware (1.2.0)
50
- faraday (~> 1.0)
51
- fastimage (2.2.6)
36
+ faraday-net_http (3.0.2)
37
+ faraday-retry (2.2.0)
38
+ faraday (~> 2.0)
39
+ fastimage (2.2.7)
52
40
  htmlentities (4.3.4)
53
41
  http-cookie (1.0.5)
54
42
  domain_name (~> 0.5)
55
- metainspector (5.12.1)
56
- addressable (~> 2.7)
57
- faraday (>= 1.4, < 3.0)
43
+ metainspector (5.15.0)
44
+ addressable (~> 2.8.4)
45
+ faraday (~> 2.5)
58
46
  faraday-cookie_jar (~> 0.0)
59
47
  faraday-encoding (~> 0.0)
60
- faraday-http-cache (~> 2.2)
61
- faraday_middleware (~> 1.0)
48
+ faraday-follow_redirects (~> 0.3)
49
+ faraday-gzip (>= 0.1, < 2.0)
50
+ faraday-http-cache (~> 2.5)
51
+ faraday-retry (~> 2.0)
62
52
  fastimage (~> 2.2)
63
53
  nesty (~> 1.0)
64
- nokogiri (~> 1.11)
65
- minitest (5.15.0)
66
- minitest-reporters (1.5.0)
54
+ nokogiri (~> 1.13)
55
+ minitest (5.20.0)
56
+ minitest-reporters (1.6.1)
67
57
  ansi
68
58
  builder
69
59
  minitest (>= 5.0)
70
60
  ruby-progressbar
71
- multipart-post (2.2.0)
72
61
  nesty (1.0.2)
73
- nokogiri (1.13.6-x86_64-darwin)
62
+ nokogiri (1.15.4-x86_64-darwin)
74
63
  racc (~> 1.4)
75
- public_suffix (4.0.7)
76
- racc (1.6.0)
64
+ public_suffix (5.0.3)
65
+ racc (1.7.3)
77
66
  rake (12.3.3)
78
- ruby-progressbar (1.11.0)
67
+ ruby-progressbar (1.13.0)
79
68
  ruby2_keywords (0.0.5)
80
69
  unf (0.1.4)
81
70
  unf_ext
82
- unf_ext (0.0.8.2)
71
+ unf_ext (0.0.9)
72
+ zlib (2.1.1)
83
73
 
84
74
  PLATFORMS
85
75
  x86_64-darwin-21
76
+ x86_64-darwin-22
86
77
 
87
78
  DEPENDENCIES
88
79
  byebug
@@ -92,4 +83,4 @@ DEPENDENCIES
92
83
  rake (~> 12.0)
93
84
 
94
85
  BUNDLED WITH
95
- 2.3.12
86
+ 2.4.6
data/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright (c) 2020 Arnaud Levy
3
+ Copyright (c) 2020 noesya
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
data/README.md CHANGED
@@ -37,7 +37,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
37
37
 
38
38
  ## Contributing
39
39
 
40
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/curation/blob/master/CODE_OF_CONDUCT.md).
40
+ Bug reports and pull requests are welcome on GitHub at https://github.com/noesya/curation. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/noesya/curation/blob/master/CODE_OF_CONDUCT.md).
41
41
 
42
42
 
43
43
  ## License
data/curation.gemspec CHANGED
@@ -8,12 +8,12 @@ Gem::Specification.new do |spec|
8
8
 
9
9
  spec.summary = 'Curation of content'
10
10
  spec.description = %q{When you build content curation tools, you need to extract the content of pages (title, text, image...). This requires different strategies and some fine tuning to work efficiently.}
11
- spec.homepage = "https://github.com/arnaudlevy/curation"
11
+ spec.homepage = "https://github.com/noesya/curation"
12
12
  spec.license = "MIT"
13
13
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
14
 
15
15
  spec.metadata["homepage_uri"] = spec.homepage
16
- spec.metadata["source_code_uri"] = "https://github.com/arnaudlevy/curation"
16
+ spec.metadata["source_code_uri"] = "https://github.com/noesya/curation"
17
17
 
18
18
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
19
19
  `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.bindir = "exe"
22
22
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
23
23
  spec.require_paths = ["lib"]
24
- spec.add_dependency "metainspector", '~> 5.12'
24
+ spec.add_dependency "metainspector"
25
25
  spec.add_dependency "nokogiri"
26
26
  spec.add_dependency "htmlentities"
27
27
  end
@@ -0,0 +1,51 @@
1
+ module Image
2
+
3
+ def image
4
+ @image ||= find_image.to_s.gsub('http://', 'https://')
5
+ end
6
+
7
+ protected
8
+
9
+ def find_image
10
+ log "Curation::Page find_image #{url}"
11
+ if json_ld.any?
12
+ json_ld.each do |ld|
13
+ ld = ld.first if ld.is_a?(Array)
14
+ if ld.has_key? 'image'
15
+ image_data = ld['image']
16
+ if image_data.is_a? String
17
+ log "Curation::Page find_image json_ld string"
18
+ return image_data
19
+ end
20
+ if image_data.is_a? Array
21
+ first = image_data.first
22
+ if first.is_a? String
23
+ log "Curation::Page find_image json_ld array"
24
+ return first
25
+ end
26
+ if first.is_a? Hash
27
+ log "Curation::Page find_image json_ld array url"
28
+ return first['url']
29
+ end
30
+ end
31
+ if image_data.is_a? Hash
32
+ log "Curation::Page find_image json_ld url"
33
+ return image_data['url']
34
+ end
35
+ end
36
+ end
37
+ end
38
+ begin
39
+ [
40
+ metainspector.images.best,
41
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
42
+ ].each do |possibility|
43
+ return possibility unless possibility.to_s.empty?
44
+ end
45
+ rescue
46
+ puts 'Curation::Page find_image error'
47
+ end
48
+ return ''
49
+ end
50
+
51
+ end
@@ -0,0 +1,43 @@
1
+ module PublicationDate
2
+
3
+ def date
4
+ @date ||= find_date
5
+ end
6
+
7
+ protected
8
+
9
+ def find_date
10
+ if json_ld.any?
11
+ json_ld.each do |ld|
12
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
13
+ return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
14
+ end
15
+ end
16
+ return Date.parse metatags['date'] rescue nil
17
+ return Date.parse metatags['pubdate'] rescue nil
18
+ return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
19
+ return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
20
+ return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
21
+ chunks = html.split('DisplayDate')
22
+ if chunks.count > 1
23
+ value = chunks[1]
24
+ value = value.split(',').first
25
+ value = value.gsub('"', '')
26
+ value = value[1..-1] if value[0] == ':'
27
+ return Date.parse value rescue nil
28
+ end
29
+ begin
30
+ value = nokogiri.css('.postDate').first
31
+ value = value.inner_text
32
+ value = value.gsub(' — ', '')
33
+ return Date.parse value
34
+ rescue
35
+ end
36
+ begin
37
+ value = nokogiri.css('.gta_post_date').first
38
+ value = value.inner_text
39
+ return Date.parse value
40
+ rescue
41
+ end
42
+ end
43
+ end
@@ -0,0 +1,78 @@
1
+ module Text
2
+
3
+ BLACKLIST = [
4
+ 'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
5
+ '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
6
+ '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
7
+ '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
8
+ '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
9
+ '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
10
+ '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
11
+ '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
12
+ ]
13
+
14
+ def text
15
+ # require 'byebug'; byebug
16
+ @text ||= find_text
17
+ end
18
+
19
+ protected
20
+
21
+ def find_text
22
+ text = find_text_with_json_ld || find_text_with_nokogiri
23
+ text.to_s.dup.gsub!('<br><br>', '<br>')
24
+ # require 'byebug'; byebug
25
+ text = clean_encoding text
26
+ text
27
+ end
28
+
29
+ def find_text_with_json_ld
30
+ if json_ld.any?
31
+ json_ld.each do |ld|
32
+ next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
33
+ return ld['text'] if ld.has_key? 'text'
34
+ return ld['articleBody'] if ld.has_key? 'articleBody'
35
+ end
36
+ end
37
+ nil
38
+ end
39
+
40
+ def find_text_with_nokogiri
41
+ h = nokogiri.dup
42
+ h.xpath('//style').remove
43
+ BLACKLIST.each do |tag|
44
+ h.css(tag).remove
45
+ end
46
+ nodes = h.css('p')
47
+ if nodes.any?
48
+ text = nodes.to_html
49
+ text
50
+ else
51
+ # Cleanup was too hard, let's try softer
52
+ h = nokogiri.dup
53
+ h.text
54
+ end
55
+ end
56
+
57
+ # r&Atilde;&copy;forme -> réforme
58
+ def clean_encoding(text)
59
+ clean_text = HTMLEntities.new.decode text
60
+ double_encoding = false
61
+ [
62
+ 'é', # é
63
+ 'è', # è
64
+ 'î', # î
65
+ 'ê', # ê
66
+ ].each do |string|
67
+ # require 'byebug'; byebug
68
+ double_encoding = true if clean_text.include? string
69
+ end
70
+ if double_encoding
71
+ clean_text.encode('iso-8859-1', undef: :replace)
72
+ .force_encoding('utf-8')
73
+ else
74
+ text
75
+ end
76
+ end
77
+
78
+ end
@@ -0,0 +1,57 @@
1
+ module Title
2
+
3
+ def title
4
+ @title ||= find_title.strip.gsub(/\s+/, ' ')
5
+ end
6
+
7
+ protected
8
+
9
+ def find_title
10
+ find_title_with_json_ld ||
11
+ find_title_with_metainspector ||
12
+ find_title_with_nokogiri ||
13
+ ''
14
+ end
15
+
16
+ def find_title_with_json_ld
17
+ if json_ld.any?
18
+ json_ld.each do |ld|
19
+ # require 'byebug'; byebug
20
+ ld = ld.first if ld.is_a?(Array)
21
+ return ld['headline'] if ld.has_key? 'headline'
22
+ end
23
+ end
24
+ nil
25
+ end
26
+
27
+ def find_title_with_metainspector
28
+ metainspector_best_title = metainspector.best_title
29
+ metainspector_title = metainspector.title
30
+ # Problème avec une balise <meta property="title" content="Run 0" />,
31
+ # metainspector croit que c'est le titre de la page.
32
+ # Comme le title contient le best title, avec souvent des infos en plus sur le site,
33
+ # on vérifie si le best title est bien contenu dans le title
34
+ if metainspector_title.present? &&
35
+ metainspector_title.present? &&
36
+ metainspector_best_title.present? &&
37
+ metainspector_title.include?(metainspector_best_title)
38
+ return metainspector_best_title
39
+ elsif metainspector_title.present?
40
+ return metainspector_title
41
+ end
42
+ end
43
+
44
+ def find_title_with_nokogiri
45
+ begin
46
+ [
47
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
48
+ nokogiri.css('title')&.first&.inner_text
49
+ ].each do |possibility|
50
+ return possibility unless possibility.to_s.empty?
51
+ end
52
+ rescue
53
+ log 'Curation::Page find_title_with_nokogiri error'
54
+ end
55
+ end
56
+
57
+ end
@@ -0,0 +1,26 @@
1
+ module Jsonld
2
+
3
+ def json_ld
4
+ unless defined?(@json_ld)
5
+ @json_ld = []
6
+ begin
7
+ options = nokogiri.css('[type="application/ld+json"]')
8
+ options.each do |option|
9
+ @json_ld << json_ld_from_object(option)
10
+ end
11
+ # Some sites have tables in tables
12
+ @json_ld.flatten!
13
+ # require 'byebug'; byebug
14
+ rescue
15
+ log 'Curation::Page json_ld error'
16
+ end
17
+ end
18
+ @json_ld
19
+ end
20
+
21
+ def json_ld_from_object(object)
22
+ JSON.parse object.inner_text
23
+ rescue
24
+ {}
25
+ end
26
+ end
@@ -0,0 +1,18 @@
1
+ module Metainspector
2
+
3
+ def metainspector
4
+ unless @metainspector
5
+ @metainspector = html.nil? ? MetaInspector.new(url)
6
+ : MetaInspector.new(url, document: html)
7
+ end
8
+ @metainspector
9
+ rescue
10
+ log 'Curation::Page metainspector error'
11
+ end
12
+
13
+ def metatags
14
+ @metatags ||= metainspector.meta_tag['name']
15
+ rescue
16
+ log 'Curation::Page metatags error'
17
+ end
18
+ end
@@ -0,0 +1,17 @@
1
+ module Nokogiri
2
+
3
+ def nokogiri
4
+ unless @nokogiri
5
+ if file.nil?
6
+ @nokogiri = metainspector.parsed
7
+ else
8
+ file.rewind
9
+ @nokogiri = Nokogiri::HTML file
10
+ file.rewind
11
+ end
12
+ end
13
+ @nokogiri
14
+ rescue
15
+ log 'Curation::Page nokogiri error'
16
+ end
17
+ end
@@ -0,0 +1,19 @@
1
+ module Raw
2
+
3
+ def file
4
+ @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
5
+ rescue
6
+ log "Curation::Page file error with url #{url}"
7
+ end
8
+
9
+ def html
10
+ unless @html
11
+ file.rewind
12
+ @html = file.read
13
+ file.rewind
14
+ end
15
+ @html
16
+ rescue
17
+ log "Curation::Page html error"
18
+ end
19
+ end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.10"
2
+ VERSION = "2.0"
3
3
  end
data/lib/curation.rb CHANGED
@@ -1,4 +1,12 @@
1
1
  require "curation/version"
2
+ require "curation/tools/raw"
3
+ require "curation/tools/nokogiri"
4
+ require "curation/tools/jsonld"
5
+ require "curation/tools/metainspector"
6
+ require "curation/finders/image"
7
+ require "curation/finders/publication_date"
8
+ require "curation/finders/text"
9
+ require "curation/finders/title"
2
10
  require "metainspector"
3
11
  require "open-uri"
4
12
  require "htmlentities"
@@ -7,257 +15,31 @@ module Curation
7
15
  class Error < StandardError; end
8
16
 
9
17
  class Page
10
- attr_reader :url
18
+ # Tools
19
+ include Raw
20
+ include Jsonld
21
+ include Metainspector
22
+ include Nokogiri
23
+
24
+ # Finders
25
+ include Title
26
+ include Image
27
+ include PublicationDate
28
+ include Text
11
29
 
12
- BLACKLIST = [
13
- 'head', 'script', 'style', 'iframe', 'nav', 'noscript', 'header', 'footer', 'aside',
14
- '.navigation', '.top-menu-container', '.navbar', '.navbar-header', '.breadcrumb',
15
- '#breadcrumbs', '[typeof="v:Breadcrumb"]', '.skip-link', '.search', '.search-form',
16
- '.categories', '.post-categories', '.datas', '.post-datas', '.twitter-media',
17
- '.instagram-media', '.widget', '.related-post-tags', '.social-list', '.top-scroll',
18
- '.comments', '.signature', '.publicite', '.footer', '.Footer', '.footer-copyright',
19
- '[itemprop*="author"]', '[style*="display:none;"]', '[style*="display:none"]',
20
- '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
21
- ]
30
+ attr_reader :url
31
+ attr_accessor :verbose
22
32
 
23
33
  def initialize(url, html = nil)
24
34
  @url = url.to_s.gsub('http://', 'https://')
25
35
  @html = html
26
- end
27
-
28
- def title
29
- @title ||= find_title
30
- end
31
-
32
- def image
33
- unless @image
34
- @image = find_image
35
- @image = @image.to_s.gsub('http://', 'https://')
36
- end
37
- @image
38
- end
39
-
40
- def text
41
- # require 'byebug'; byebug
42
- @text ||= find_text
43
- end
44
-
45
- def date
46
- @date ||= find_date
36
+ @verbose = false
47
37
  end
48
38
 
49
39
  protected
50
40
 
51
- def find_title
52
- if json_ld.any?
53
- json_ld.each do |ld|
54
- # require 'byebug'; byebug
55
- ld = ld.first if ld.is_a?(Array)
56
- return ld['headline'] if ld.has_key? 'headline'
57
- end
58
- end
59
- begin
60
- [
61
- metainspector.best_title,
62
- metainspector.title,
63
- nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
64
- nokogiri.css('title')&.first&.inner_text
65
- ].each do |possibility|
66
- return possibility unless possibility.to_s.empty?
67
- end
68
- rescue
69
- puts 'Curation::Page find_title error'
70
- end
71
- return ''
72
- end
73
-
74
- def find_image
75
- if json_ld.any?
76
- json_ld.each do |ld|
77
- ld = ld.first if ld.is_a?(Array)
78
- if ld.has_key? 'image'
79
- image_data = ld['image']
80
- return image_data if image_data.is_a? String
81
- if image_data.is_a? Array
82
- first = image_data.first
83
- return first if first.is_a? String
84
- return first['url'] if first.is_a? Hash
85
- end
86
- return image_data['url'] if image_data.is_a? Hash
87
- end
88
- end
89
- end
90
- begin
91
- [
92
- metainspector.images.best,
93
- nokogiri.css('[property="og:image"]').first&.attributes['content'].value
94
- ].each do |possibility|
95
- return possibility unless possibility.to_s.empty?
96
- end
97
- rescue
98
- puts 'Curation::Page find_image error'
99
- end
100
- return ''
101
- end
102
-
103
- def find_text
104
- text = find_text_with_json_ld || find_text_with_nokogiri
105
- text.to_s.gsub!('<br><br>', '<br>')
106
- # require 'byebug'; byebug
107
- text = clean_encoding text
108
- text
109
- end
110
-
111
- def find_text_with_json_ld
112
- if json_ld.any?
113
- json_ld.each do |ld|
114
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
115
- return ld['text'] if ld.has_key? 'text'
116
- return ld['articleBody'] if ld.has_key? 'articleBody'
117
- end
118
- end
119
- nil
120
- end
121
-
122
- def find_text_with_nokogiri
123
- h = nokogiri.dup
124
- BLACKLIST.each do |tag|
125
- h.css(tag).remove
126
- end
127
- nodes = h.css('p')
128
- nodes.xpath('//style').remove
129
- text = nodes.to_html
130
- text
131
- end
132
-
133
- def find_date
134
- if json_ld.any?
135
- json_ld.each do |ld|
136
- next unless ['NewsArticle', 'ReportageNewsArticle'].include? ld['@type']
137
- return Date.parse ld['datePublished'] if ld.has_key? 'datePublished'
138
- end
139
- end
140
- return Date.parse metatags['date'] rescue nil
141
- return Date.parse metatags['pubdate'] rescue nil
142
- return Date.parse nokogiri.css('meta[property="article:published"]').first['content'] rescue nil
143
- return Date.parse nokogiri.css('meta[property="article:published_time"]').first['content'] rescue nil
144
- return Date.parse nokogiri.css('meta[property="og:article:published_time"]').first['content'] rescue nil
145
- chunks = html.split('DisplayDate')
146
- if chunks.count > 1
147
- value = chunks[1]
148
- value = value.split(',').first
149
- value = value.gsub('"', '')
150
- value = value[1..-1] if value[0] == ':'
151
- return Date.parse value rescue nil
152
- end
153
- begin
154
- value = nokogiri.css('.postDate').first
155
- value = value.inner_text
156
- value = value.gsub(' — ', '')
157
- return Date.parse value
158
- rescue
159
- end
160
- begin
161
- value = nokogiri.css('.gta_post_date').first
162
- value = value.inner_text
163
- return Date.parse value
164
- rescue
165
- end
166
- end
167
-
168
- private
169
-
170
- def json_ld
171
- unless defined?(@json_ld)
172
- @json_ld = []
173
- begin
174
- options = nokogiri.css('[type="application/ld+json"]')
175
- options.each do |option|
176
- @json_ld << json_ld_from_object(option)
177
- end
178
- # Some sites have tables in tables
179
- @json_ld.flatten!
180
- # require 'byebug'; byebug
181
- rescue
182
- puts 'Curation::Page json_ld error'
183
- end
184
- end
185
- @json_ld
186
- end
187
-
188
- def json_ld_from_object(object)
189
- JSON.parse object.inner_text
190
- rescue
191
- {}
192
- end
193
-
194
- def file
195
- @file ||= URI.open url, 'User-Agent' => "Mozilla/5.0"
196
- rescue
197
- puts "Curation::Page file error with url #{url}"
198
- end
199
-
200
- def html
201
- unless @html
202
- file.rewind
203
- @html = file.read
204
- file.rewind
205
- end
206
- @html
207
- rescue
208
- puts "Curation::Page html error"
209
- end
210
-
211
- def nokogiri
212
- unless @nokogiri
213
- if file.nil?
214
- @nokogiri = metainspector.parsed
215
- else
216
- file.rewind
217
- @nokogiri = Nokogiri::HTML file
218
- file.rewind
219
- end
220
- end
221
- @nokogiri
222
- rescue
223
- puts 'Curation::Page nokogiri error'
224
- end
225
-
226
- def metainspector
227
- unless @metainspector
228
- @metainspector = html.nil? ? MetaInspector.new(url)
229
- : MetaInspector.new(url, document: html)
230
- end
231
- @metainspector
232
- rescue
233
- puts 'Curation::Page metainspector error'
234
- end
235
-
236
- def metatags
237
- @metatags ||= metainspector.meta_tag['name']
238
- rescue
239
- puts 'Curation::Page metatags error'
240
- end
241
-
242
- # r&Atilde;&copy;forme -> réforme
243
- def clean_encoding(text)
244
- clean_text = HTMLEntities.new.decode text
245
- double_encoding = false
246
- [
247
- 'é', # é
248
- 'è', # è
249
- 'î', # î
250
- 'ê', # ê
251
- ].each do |string|
252
- # require 'byebug'; byebug
253
- double_encoding = true if clean_text.include? string
254
- end
255
- if double_encoding
256
- clean_text.encode('iso-8859-1', undef: :replace)
257
- .force_encoding('utf-8')
258
- else
259
- text
260
- end
41
+ def log(message)
42
+ puts message if verbose
261
43
  end
262
44
  end
263
45
  end
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.10'
4
+ version: '2.0'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-06-08 00:00:00.000000000 Z
11
+ date: 2023-11-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '5.12'
19
+ version: '0'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - "~>"
24
+ - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '5.12'
26
+ version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -72,13 +72,21 @@ files:
72
72
  - bin/setup
73
73
  - curation.gemspec
74
74
  - lib/curation.rb
75
+ - lib/curation/finders/image.rb
76
+ - lib/curation/finders/publication_date.rb
77
+ - lib/curation/finders/text.rb
78
+ - lib/curation/finders/title.rb
79
+ - lib/curation/tools/jsonld.rb
80
+ - lib/curation/tools/metainspector.rb
81
+ - lib/curation/tools/nokogiri.rb
82
+ - lib/curation/tools/raw.rb
75
83
  - lib/curation/version.rb
76
- homepage: https://github.com/arnaudlevy/curation
84
+ homepage: https://github.com/noesya/curation
77
85
  licenses:
78
86
  - MIT
79
87
  metadata:
80
- homepage_uri: https://github.com/arnaudlevy/curation
81
- source_code_uri: https://github.com/arnaudlevy/curation
88
+ homepage_uri: https://github.com/noesya/curation
89
+ source_code_uri: https://github.com/noesya/curation
82
90
  post_install_message:
83
91
  rdoc_options: []
84
92
  require_paths:
@@ -94,7 +102,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
94
102
  - !ruby/object:Gem::Version
95
103
  version: '0'
96
104
  requirements: []
97
- rubygems_version: 3.1.6
105
+ rubygems_version: 3.4.10
98
106
  signing_key:
99
107
  specification_version: 4
100
108
  summary: Curation of content