curation 1.0 → 1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f56e1c572c081875ce6e71608496d64a3d34ef84837c60e15c06e358ab0af44
4
- data.tar.gz: e7b3899aee83e21bad48c59b68d15d792cebdf9c84b95c24012d4329a3de08cf
3
+ metadata.gz: 15ed9203a4164d0cfdfb7e973523008ddfe518ce21016731ae8a1dcddc74b0ab
4
+ data.tar.gz: e2269fbae2d6a6355f90709409de0fa1dd3e555dfd533b10d00d1073af0ac9ec
5
5
  SHA512:
6
- metadata.gz: a71487cae498b7f04e028f7e6f71793e08047bddefc28e8e3700167c5bf1734cccdd5972a997cd30b07d9c69a2ebb86c4e744e9c32f63a3f0b7c75ba23bf92ef
7
- data.tar.gz: b33659760a649ff74aff8fcb821b28d073d4324ce671f1cb737107d4515043de5916d7315a385dfa0cb2517de177aa39f745f3aaad95149339cb4cd116b8cb52
6
+ metadata.gz: 72df82702be64fcee4e1e7725ecb3c1f831c036040c2ef445388feb493a43cd78b979ed63e99dd0168399c2bd014a0a00e2f7211f561b0ba88f13d0098daaa68
7
+ data.tar.gz: dd439269683719541f49204484e7d7fd42cbc0f0320e488d2f12a5a36bc472cd343343c4697d0a84a3f8746a308dad03b5e84e33b03adedd3984de73082e5bbb
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.5)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -1,4 +1,6 @@
1
1
  require "curation/version"
2
+ require "metainspector"
3
+ require "open-uri"
2
4
 
3
5
  module Curation
4
6
  class Error < StandardError; end
@@ -17,22 +19,18 @@ module Curation
17
19
  '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
18
20
  ]
19
21
 
20
- def initialize(url)
22
+ def initialize(url, html = nil)
21
23
  @url = url
24
+ @html = html
22
25
  end
23
26
 
24
27
  def title
25
- if json_ld.any?
26
- json_ld.each do |ld|
27
- return ld['headline'] if ld.has_key? 'headline'
28
- end
29
- end
30
- metainspector.best_title
28
+ @title = find_title
31
29
  end
32
30
 
33
31
  def image
34
32
  @image = find_image
35
- @image = @image.gsub('http://', 'https://')
33
+ @image = @image.to_s.gsub('http://', 'https://')
36
34
  @image
37
35
  end
38
36
 
@@ -44,8 +42,7 @@ module Curation
44
42
  return ld['articleBody'] if ld.has_key? 'articleBody'
45
43
  end
46
44
  end
47
- text = ''
48
- h = html.dup
45
+ h = nokogiri.dup
49
46
  BLACKLIST.each do |tag|
50
47
  h.css(tag).remove
51
48
  end
@@ -58,54 +55,88 @@ module Curation
58
55
 
59
56
  protected
60
57
 
58
+ def find_title
59
+ if json_ld.any?
60
+ json_ld.each do |ld|
61
+ return ld['headline'] if ld.has_key? 'headline'
62
+ end
63
+ end
64
+ begin
65
+ [
66
+ metainspector.best_title,
67
+ metainspector.title,
68
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
69
+ nokogiri.css('title')&.first&.inner_text
70
+ ].each do |possibility|
71
+ return possibility unless possibility.blank?
72
+ end
73
+ rescue
74
+ puts 'Curation::Page find_title error'
75
+ end
76
+ return ''
77
+ end
78
+
61
79
  def find_image
62
80
  if json_ld.any?
63
81
  json_ld.each do |ld|
64
82
  if ld.has_key? 'image'
65
83
  image_data = ld['image']
66
84
  return image_data if image_data.is_a? String
67
- return image_data.first if image_data.is_a? Array
85
+ if image_data.is_a? Array
86
+ first = image_data.first
87
+ return first if first.is_a? String
88
+ return first['url'] if first.is_a? Hash
89
+ end
68
90
  return image_data['url'] if image_data.is_a? Hash
69
91
  end
70
92
  end
71
93
  end
72
- metainspector.images.best
73
- end
74
-
75
- def html
76
- @html ||= Nokogiri::HTML data
77
- rescue
78
- puts "Nokogiri error"
94
+ begin
95
+ [
96
+ metainspector.images.best,
97
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
98
+ ].each do |possibility|
99
+ return possibility unless possibility.blank?
100
+ end
101
+ rescue
102
+ puts 'Curation::Page find_image error'
103
+ end
104
+ return ''
79
105
  end
80
106
 
81
107
  def json_ld
82
108
  unless @json_ld
83
109
  @json_ld = []
84
110
  begin
85
- options = html.css('[type="application/ld+json"]')
111
+ options = nokogiri.css('[type="application/ld+json"]')
86
112
  options.each do |option|
87
113
  string = option.inner_text
88
114
  hash = JSON.parse(string)
89
115
  @json_ld << hash
90
116
  end
91
117
  rescue
92
- puts "JSON LD error"
118
+ puts 'Curation::Page json_ld error'
93
119
  end
94
120
  end
95
121
  @json_ld
96
122
  end
97
123
 
98
- def data
99
- require 'open-uri'
100
- URI.open url
124
+ def html
125
+ @html ||= URI.open url
101
126
  rescue
102
127
  puts "Impossible to open #{url}"
103
128
  end
104
129
 
130
+ def nokogiri
131
+ @nokogiri ||= Nokogiri::HTML html
132
+ rescue
133
+ puts 'Curation::Page nokogiri error'
134
+ end
135
+
105
136
  def metainspector
106
- @metainspector ||= MetaInspector.new url
137
+ @metainspector ||= MetaInspector.new url, document: html
107
138
  rescue
108
- puts "MetaInspector error"
139
+ puts 'Curation::Page metainspector error'
109
140
  end
110
141
  end
111
142
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.0"
2
+ VERSION = "1.5"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.0'
4
+ version: '1.5'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector