curation 1.1 → 1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 14637da5bf7b047f9c34b40ff4bf108110dce7408d3c12af9e76222141bdcf65
4
- data.tar.gz: 144487e448354476958895db783fdc681d0116ec3eca5e7ede1cd3cf1bd1d13a
3
+ metadata.gz: f059bacdf3a2deedf5363721c8186af8a62d5b69709022b3d076f725433be26f
4
+ data.tar.gz: ac36789448e1b3a58161f98362dbed1d889537612939f86d4a2db31b935811c3
5
5
  SHA512:
6
- metadata.gz: 303babe96daee5e792672e1ece05b49d77c115e393af00ded6902b10b9d4adb51210923c7d6c0e68a359246078618a8d09606ab57dd57fa0882db7e3a474e3e4
7
- data.tar.gz: 451a1ad8866adc91850ff9b150847564bb9de43b15432de2dbf06fb2936a3788ed2c0bf4d9e3c9c462e13bdb5c6236bf5dc0c97a6697a8c3ea9c307bf2f35410
6
+ metadata.gz: 311113b9b172cfb54f917b694ca5a8f2bb17184c38fc99c8930592f73de9219c9ecda9217562180ef0e17def15682f56255b5f7e671555071d32b9149479cc2b
7
+ data.tar.gz: bbbe41e2fce674677dac3f3d7c5b724c6bbde6e8941f8be79f8d695ce722578046cb061d7c25fcac771ecb1bb57223d4c88748f36c85c0d6d08cac354ecb6a40
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- curation (0.1.0)
4
+ curation (1.6)
5
5
  metainspector
6
6
  nokogiri
7
7
 
@@ -19,17 +19,13 @@ module Curation
19
19
  '[style*="display: none;"]', '[style*="display: none"]', '[aria-hidden="true"]'
20
20
  ]
21
21
 
22
- def initialize(url)
22
+ def initialize(url, html = nil)
23
23
  @url = url
24
+ @html = html
24
25
  end
25
26
 
26
27
  def title
27
- if json_ld.any?
28
- json_ld.each do |ld|
29
- return ld['headline'] if ld.has_key? 'headline'
30
- end
31
- end
32
- metainspector.best_title
28
+ @title = find_title
33
29
  end
34
30
 
35
31
  def image
@@ -46,7 +42,7 @@ module Curation
46
42
  return ld['articleBody'] if ld.has_key? 'articleBody'
47
43
  end
48
44
  end
49
- h = html.dup
45
+ h = nokogiri.dup
50
46
  BLACKLIST.each do |tag|
51
47
  h.css(tag).remove
52
48
  end
@@ -59,53 +55,90 @@ module Curation
59
55
 
60
56
  protected
61
57
 
58
+ def find_title
59
+ if json_ld.any?
60
+ # Some sites have tables in tables
61
+ json_ld.flatten!
62
+ json_ld.each do |ld|
63
+ return ld['headline'] if ld.has_key? 'headline'
64
+ end
65
+ end
66
+ begin
67
+ [
68
+ metainspector.best_title,
69
+ metainspector.title,
70
+ nokogiri.css('[itemprop="headline"]')&.first&.inner_text,
71
+ nokogiri.css('title')&.first&.inner_text
72
+ ].each do |possibility|
73
+ return possibility unless possibility.blank?
74
+ end
75
+ rescue
76
+ puts 'Curation::Page find_title error'
77
+ end
78
+ return ''
79
+ end
80
+
62
81
  def find_image
63
82
  if json_ld.any?
64
83
  json_ld.each do |ld|
65
84
  if ld.has_key? 'image'
66
85
  image_data = ld['image']
67
86
  return image_data if image_data.is_a? String
68
- return image_data.first if image_data.is_a? Array
87
+ if image_data.is_a? Array
88
+ first = image_data.first
89
+ return first if first.is_a? String
90
+ return first['url'] if first.is_a? Hash
91
+ end
69
92
  return image_data['url'] if image_data.is_a? Hash
70
93
  end
71
94
  end
72
95
  end
73
- metainspector.images.best
74
- end
75
-
76
- def html
77
- @html ||= Nokogiri::HTML data
78
- rescue
79
- puts "Nokogiri error"
96
+ begin
97
+ [
98
+ metainspector.images.best,
99
+ nokogiri.css('[property="og:image"]').first&.attributes['content'].value
100
+ ].each do |possibility|
101
+ return possibility unless possibility.blank?
102
+ end
103
+ rescue
104
+ puts 'Curation::Page find_image error'
105
+ end
106
+ return ''
80
107
  end
81
108
 
82
109
  def json_ld
83
110
  unless @json_ld
84
111
  @json_ld = []
85
112
  begin
86
- options = html.css('[type="application/ld+json"]')
113
+ options = nokogiri.css('[type="application/ld+json"]')
87
114
  options.each do |option|
88
115
  string = option.inner_text
89
116
  hash = JSON.parse(string)
90
117
  @json_ld << hash
91
118
  end
92
119
  rescue
93
- puts "JSON LD error"
120
+ puts 'Curation::Page json_ld error'
94
121
  end
95
122
  end
96
123
  @json_ld
97
124
  end
98
125
 
99
- def data
100
- URI.open url
126
+ def html
127
+ @html ||= URI.open url
101
128
  rescue
102
129
  puts "Impossible to open #{url}"
103
130
  end
104
131
 
132
+ def nokogiri
133
+ @nokogiri ||= Nokogiri::HTML html
134
+ rescue
135
+ puts 'Curation::Page nokogiri error'
136
+ end
137
+
105
138
  def metainspector
106
- @metainspector ||= MetaInspector.new url
139
+ @metainspector ||= MetaInspector.new url, document: html
107
140
  rescue
108
- puts "MetaInspector error"
141
+ puts 'Curation::Page metainspector error'
109
142
  end
110
143
  end
111
144
  end
@@ -1,3 +1,3 @@
1
1
  module Curation
2
- VERSION = "1.1"
2
+ VERSION = "1.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: curation
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.1'
4
+ version: '1.6'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arnaud Levy
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-07-20 00:00:00.000000000 Z
11
+ date: 2020-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: metainspector