curation 1.1 → 1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14637da5bf7b047f9c34b40ff4bf108110dce7408d3c12af9e76222141bdcf65
4
- data.tar.gz: 144487e448354476958895db783fdc681d0116ec3eca5e7ede1cd3cf1bd1d13a
3
+ metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
4
+ data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
5
5
  SHA512:
6
- metadata.gz: 303babe96daee5e792672e1ece05b49d77c115e393af00ded6902b10b9d4adb51210923c7d6c0e68a359246078618a8d09606ab57dd57fa0882db7e3a474e3e4
7
- data.tar.gz: 451a1ad8866adc91850ff9b150847564bb9de43b15432de2dbf06fb2936a3788ed2c0bf4d9e3c9c462e13bdb5c6236bf5dc0c97a6697a8c3ea9c307bf2f35410
6
+ metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
7
+ data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.6)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -19,17 +19,13 @@ module Curation
19
19
  '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
20
20
  ]
21
21
 
22
- def initialize(url)
22
+ def initialize(url, html = nil)
23
23
  @url = url
24
+ @html = html
24
25
  end
25
26
 
26
27
  def title
27
- if json_ld.any?
28
- json_ld.each do |ld|
29
- return ld['headline'] if ld.has_key? 'headline'
30
- end
31
- end
32
- metainspector.best_title
28
+ @title = find_title
33
29
  end
34
30
 
35
31
  def image
@@ -46,7 +42,7 @@ module Curation
46
42
  return ld['articleBody'] if ld.has_key? 'articleBody'
47
43
  end
48
44
  end
49
- h = html.dup
45
+ h = nokogiri.dup
50
46
  BLACKLIST.each do |tag|
51
47
  h.css(tag).remove
52
48
  end
@@ -59,53 +55,90 @@ module Curation
59
55
 
60
56
  protected
61
57
 
58
+ def find_title
59
+ if json_ld.any?
60
+ # Some sites nest JSON-LD arrays inside arrays, so flatten them first
61
+ json_ld.flatten!
62
+ json_ld.each do |ld|
63
+ return ld['headline'] if ld.has_key? 'headline'
64
+ end
65
+ end
66
+ begin
67
+ [
68
+ metainspector.best_title,
69
+ metainspector.title,
70
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
71
+ nokogiri.css('title')&.first&.inner_text
72
+ ].each do |possibility|
73
+ return possibility unless possibility.blank?
74
+ end
75
+ rescue
76
+ puts 'Curation::Page find_title error'
77
+ end
78
+ return ''
79
+ end
80
+
62
81
  def find_image
63
82
  if json_ld.any?
64
83
  json_ld.each do |ld|
65
84
  if ld.has_key? 'image'
66
85
  image_data = ld['image']
67
86
  return image_data if image_data.is_a? String
68
- return image_data.first if image_data.is_a? Array
87
+ if image_data.is_a? Array
88
+ first = image_data.first
89
+ return first if first.is_a? String
90
+ return first['url'] if first.is_a? Hash
91
+ end
69
92
  return image_data['url'] if image_data.is_a? Hash
70
93
  end
71
94
  end
72
95
  end
73
- metainspector.images.best
74
- end
75
-
76
- def html
77
- @html ||= Nokogiri::HTML data
78
- rescue
79
- puts "Nokogiri error"
96
+ begin
97
+ [
98
+ metainspector.images.best,
99
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
100
+ ].each do |possibility|
101
+ return possibility unless possibility.blank?
102
+ end
103
+ rescue
104
+ puts 'Curation::Page find_image error'
105
+ end
106
+ return ''
80
107
  end
81
108
 
82
109
  def json_ld
83
110
  unless @json_ld
84
111
  @json_ld = []
85
112
  begin
86
- options = html.css('[type="application/ld+json"]')
113
+ options = nokogiri.css('[type="application/ld+json"]')
87
114
  options.each do |option|
88
115
  string = option.inner_text
89
116
  hash = JSON.parse(string)
90
117
  @json_ld << hash
91
118
  end
92
119
  rescue
93
- puts "JSON LD error"
120
+ puts 'Curation::Page json_ld error'
94
121
  end
95
122
  end
96
123
  @json_ld
97
124
  end
98
125
 
99
- def data
100
- URI.open url
126
+ def html
127
+ @html ||= URI.open url
101
128
  rescue
102
129
  puts "Impossible to open #{url}"
103
130
  end
104
131
 
132
+ def nokogiri
133
+ @nokogiri ||= Nokogiri::HTML html
134
+ rescue
135
+ puts 'Curation::Page nokogiri error'
136
+ end
137
+
105
138
  def metainspector
106
- @metainspector ||= MetaInspector.new url
139
+ @metainspector ||= MetaInspector.new url, document: html
107
140
  rescue
108
- puts "MetaInspector error"
141
+ puts 'Curation::Page metainspector error'
109
142
  end
110
143
  end
111
144
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.1"
2
+ VERSION = "1.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.1'
4
+ version: '1.6'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector