curation 1.0 → 1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f56e1c572c081875ce6e71608496d64a3d34ef84837c60e15c06e358ab0af44
4
- data.tar.gz: e7b3899aee83e21bad48c59b68d15d792cebdf9c84b95c24012d4329a3de08cf
3
+ metadata.gz: 15ed9203a4164d0cfdfb7e973523008ddfe518ce21016731ae8a1dcddc74b0ab
4
+ data.tar.gz: e2269fbae2d6a6355f90709409de0fa1dd3e555dfd533b10d00d1073af0ac9ec
5
5
  SHA512:
6
- metadata.gz: a71487cae498b7f04e028f7e6f71793e08047bddefc28e8e3700167c5bf1734cccdd5972a997cd30b07d9c69a2ebb86c4e744e9c32f63a3f0b7c75ba23bf92ef
7
- data.tar.gz: b33659760a649ff74aff8fcb821b28d073d4324ce671f1cb737107d4515043de5916d7315a385dfa0cb2517de177aa39f745f3aaad95149339cb4cd116b8cb52
6
+ metadata.gz: 72df82702be64fcee4e1e7725ecb3c1f831c036040c2ef445388feb493a43cd78b979ed63e99dd0168399c2bd014a0a00e2f7211f561b0ba88f13d0098daaa68
7
+ data.tar.gz: dd439269683719541f49204484e7d7fd42cbc0f0320e488d2f12a5a36bc472cd343343c4697d0a84a3f8746a308dad03b5e84e33b03adedd3984de73082e5bbb
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.5)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -1,4 +1,6 @@
1
1
  require "curation/version"
2
+ require "metainspector"
3
+ require "open-uri"
2
4
 
3
5
  module Curation
4
6
  class Error < StandardError; end
@@ -17,22 +19,18 @@ module Curation
17
19
  '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
18
20
  ]
19
21
 
20
- def initialize(url)
22
+ def initialize(url, html = nil)
21
23
  @url = url
24
+ @html = html
22
25
  end
23
26
 
24
27
  def title
25
- if json_ld.any?
26
- json_ld.each do |ld|
27
- return ld['headline'] if ld.has_key? 'headline'
28
- end
29
- end
30
- metainspector.best_title
28
+ @title = find_title
31
29
  end
32
30
 
33
31
  def image
34
32
  @image = find_image
35
- @image = @image.gsub('http://', 'https://')
33
+ @image = @image.to_s.gsub('http://', 'https://')
36
34
  @image
37
35
  end
38
36
 
@@ -44,8 +42,7 @@ module Curation
44
42
  return ld['articleBody'] if ld.has_key? 'articleBody'
45
43
  end
46
44
  end
47
- text = ''
48
- h = html.dup
45
+ h = nokogiri.dup
49
46
  BLACKLIST.each do |tag|
50
47
  h.css(tag).remove
51
48
  end
@@ -58,54 +55,88 @@ module Curation
58
55
 
59
56
  protected
60
57
 
58
+ def find_title
59
+ if json_ld.any?
60
+ json_ld.each do |ld|
61
+ return ld['headline'] if ld.has_key? 'headline'
62
+ end
63
+ end
64
+ begin
65
+ [
66
+ metainspector.best_title,
67
+ metainspector.title,
68
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
69
+ nokogiri.css('title')&.first&.inner_text
70
+ ].each do |possibility|
71
+ return possibility unless possibility.blank?
72
+ end
73
+ rescue
74
+ puts 'Curation::Page find_title error'
75
+ end
76
+ return ''
77
+ end
78
+
61
79
  def find_image
62
80
  if json_ld.any?
63
81
  json_ld.each do |ld|
64
82
  if ld.has_key? 'image'
65
83
  image_data = ld['image']
66
84
  return image_data if image_data.is_a? String
67
- return image_data.first if image_data.is_a? Array
85
+ if image_data.is_a? Array
86
+ first = image_data.first
87
+ return first if first.is_a? String
88
+ return first['url'] if first.is_a? Hash
89
+ end
68
90
  return image_data['url'] if image_data.is_a? Hash
69
91
  end
70
92
  end
71
93
  end
72
- metainspector.images.best
73
- end
74
-
75
- def html
76
- @html ||= Nokogiri::HTML data
77
- rescue
78
- puts "Nokogiri error"
94
+ begin
95
+ [
96
+ metainspector.images.best,
97
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
98
+ ].each do |possibility|
99
+ return possibility unless possibility.blank?
100
+ end
101
+ rescue
102
+ puts 'Curation::Page find_image error'
103
+ end
104
+ return ''
79
105
  end
80
106
 
81
107
  def json_ld
82
108
  unless @json_ld
83
109
  @json_ld = []
84
110
  begin
85
- options = html.css('[type="application/ld+json"]')
111
+ options = nokogiri.css('[type="application/ld+json"]')
86
112
  options.each do |option|
87
113
  string = option.inner_text
88
114
  hash = JSON.parse(string)
89
115
  @json_ld << hash
90
116
  end
91
117
  rescue
92
- puts "JSON LD error"
118
+ puts 'Curation::Page json_ld error'
93
119
  end
94
120
  end
95
121
  @json_ld
96
122
  end
97
123
 
98
- def data
99
- require 'open-uri'
100
- URI.open url
124
+ def html
125
+ @html ||= URI.open url
101
126
  rescue
102
127
  puts "Impossible to open #{url}"
103
128
  end
104
129
 
130
+ def nokogiri
131
+ @nokogiri ||= Nokogiri::HTML html
132
+ rescue
133
+ puts 'Curation::Page nokogiri error'
134
+ end
135
+
105
136
  def metainspector
106
- @metainspector ||= MetaInspector.new url
137
+ @metainspector ||= MetaInspector.new url, document: html
107
138
  rescue
108
- puts "MetaInspector error"
139
+ puts 'Curation::Page metainspector error'
109
140
  end
110
141
  end
111
142
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.0"
2
+ VERSION = "1.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.0'
4
+ version: '1.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector