onebox 2.2.5 → 2.2.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +1 -1
- data/lib/onebox.rb +1 -2
- data/lib/onebox/engine/allowlisted_generic_onebox.rb +2 -0
- data/lib/onebox/engine/amazon_onebox.rb +31 -11
- data/lib/onebox/engine/gfycat_onebox.rb +1 -1
- data/lib/onebox/engine/google_docs_onebox.rb +1 -1
- data/lib/onebox/engine/html.rb +5 -1
- data/lib/onebox/engine/pastebin_onebox.rb +1 -1
- data/lib/onebox/engine/twitter_status_onebox.rb +1 -1
- data/lib/onebox/engine/youtube_onebox.rb +32 -4
- data/lib/onebox/helpers.rb +27 -12
- data/lib/onebox/open_graph.rb +2 -1
- data/lib/onebox/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00cdd8f25e8df4b1c992886e522e6dc6671973de81194796974f527c4f18aa7c
|
4
|
+
data.tar.gz: 96f9e2de815bef82f86170e0abf97234072c824d97a3322daef992e1dbc41a91
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57ff4ab92abbd8f2a5cbbedb7a88f3d7a517e4c1c779810f1259b58cc60c3591c43e9d4782984ab56a1389b00ee3acbaebc6298208813b694a5f73255bc3c0a7
|
7
|
+
data.tar.gz: 8baea1e77f32c09724a03452ba6018721681e9832d03b23bbcf1bc61609a56cabd8098d114dd49c66c468919b1d1c4cee4cc97212a6ac97fac229bc8f4eb886b
|
data/.github/workflows/ci.yml
CHANGED
data/lib/onebox.rb
CHANGED
@@ -227,8 +227,10 @@ module Onebox
|
|
227
227
|
d[:image] = d[:image_secure_url] || d[:image_url] || d[:thumbnail_url] || d[:image]
|
228
228
|
d[:image] = Onebox::Helpers::get_absolute_image_url(d[:image], @url)
|
229
229
|
d[:image] = Onebox::Helpers::normalize_url_for_output(html_entities.decode(d[:image]))
|
230
|
+
d[:image] = nil if Onebox::Helpers.blank?(d[:image])
|
230
231
|
|
231
232
|
d[:video] = d[:video_secure_url] || d[:video_url] || d[:video]
|
233
|
+
d[:video] = nil if Onebox::Helpers.blank?(d[:video])
|
232
234
|
|
233
235
|
d[:published_time] = d[:article_published_time] unless Onebox::Helpers.blank?(d[:article_published_time])
|
234
236
|
if !Onebox::Helpers.blank?(d[:published_time])
|
@@ -11,11 +11,25 @@ module Onebox
|
|
11
11
|
include HTML
|
12
12
|
|
13
13
|
always_https
|
14
|
-
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx)\//)
|
14
|
+
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)
|
15
15
|
|
16
16
|
def url
|
17
|
+
# If possible, fetch the cached HTML body immediately so we can
|
18
|
+
# try to grab the canonical URL from that document,
|
19
|
+
# rather than guess at the best URL structure to use
|
20
|
+
if body_cacher&.respond_to?('cache_response_body?')
|
21
|
+
if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
|
22
|
+
@raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
if @raw
|
27
|
+
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
|
28
|
+
return canonical_link.to_s if canonical_link
|
29
|
+
end
|
30
|
+
|
17
31
|
if match && match[:id]
|
18
|
-
return "https://www.amazon.#{tld}/
|
32
|
+
return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
|
19
33
|
end
|
20
34
|
|
21
35
|
@url
|
@@ -26,16 +40,15 @@ module Onebox
|
|
26
40
|
end
|
27
41
|
|
28
42
|
def http_params
|
29
|
-
|
30
|
-
'User-Agent' =>
|
31
|
-
|
32
|
-
}
|
43
|
+
if @options && @options[:user_agent]
|
44
|
+
{ 'User-Agent' => @options[:user_agent] }
|
45
|
+
end
|
33
46
|
end
|
34
47
|
|
35
48
|
private
|
36
49
|
|
37
50
|
def match
|
38
|
-
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[
|
51
|
+
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
|
39
52
|
end
|
40
53
|
|
41
54
|
def image
|
@@ -50,6 +63,10 @@ module Onebox
|
|
50
63
|
end
|
51
64
|
|
52
65
|
if (landing_image = raw.css("#landingImage")) && landing_image.any?
|
66
|
+
attributes = landing_image.first.attributes
|
67
|
+
|
68
|
+
return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
|
69
|
+
|
53
70
|
landing_image.first["src"].to_s
|
54
71
|
end
|
55
72
|
|
@@ -100,7 +117,7 @@ module Onebox
|
|
100
117
|
end
|
101
118
|
|
102
119
|
result = {
|
103
|
-
link:
|
120
|
+
link: url,
|
104
121
|
title: title,
|
105
122
|
by_info: authors,
|
106
123
|
image: og.image || image,
|
@@ -131,7 +148,7 @@ module Onebox
|
|
131
148
|
end
|
132
149
|
|
133
150
|
result = {
|
134
|
-
link:
|
151
|
+
link: url,
|
135
152
|
title: title,
|
136
153
|
by_info: authors,
|
137
154
|
image: og.image || image,
|
@@ -147,7 +164,7 @@ module Onebox
|
|
147
164
|
else
|
148
165
|
title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
|
149
166
|
result = {
|
150
|
-
link:
|
167
|
+
link: url,
|
151
168
|
title: title,
|
152
169
|
image: og.image || image,
|
153
170
|
price: price
|
@@ -157,7 +174,10 @@ module Onebox
|
|
157
174
|
result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]
|
158
175
|
|
159
176
|
summary = raw.at("#productDescription")
|
160
|
-
|
177
|
+
|
178
|
+
description = og.description || summary&.inner_text
|
179
|
+
description ||= raw.css("meta[name=description]").first&.[]("content")
|
180
|
+
result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
|
161
181
|
end
|
162
182
|
|
163
183
|
result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
|
@@ -47,7 +47,7 @@ module Onebox
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def get_og_data
|
50
|
-
response = Onebox::Helpers.fetch_response(url, 10) rescue nil
|
50
|
+
response = Onebox::Helpers.fetch_response(url, redirect_limit: 10) rescue nil
|
51
51
|
html = Nokogiri::HTML(response)
|
52
52
|
og_data = {}
|
53
53
|
html.css('meta').each do |m|
|
data/lib/onebox/engine/html.rb
CHANGED
@@ -11,7 +11,11 @@ module Onebox
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def raw
|
14
|
-
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params)
|
14
|
+
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher)
|
15
|
+
end
|
16
|
+
|
17
|
+
def body_cacher
|
18
|
+
self.options&.[](:body_cacher)
|
15
19
|
end
|
16
20
|
|
17
21
|
def html?
|
@@ -31,7 +31,7 @@ module Onebox
|
|
31
31
|
|
32
32
|
def lines
|
33
33
|
return @lines if @lines
|
34
|
-
response = Onebox::Helpers.fetch_response("http://pastebin.com/raw/#{paste_key}", 1) rescue ""
|
34
|
+
response = Onebox::Helpers.fetch_response("http://pastebin.com/raw/#{paste_key}", redirect_limit: 1) rescue ""
|
35
35
|
@lines = response.split("\n")
|
36
36
|
end
|
37
37
|
|
@@ -17,7 +17,7 @@ module Onebox
|
|
17
17
|
private
|
18
18
|
|
19
19
|
def get_twitter_data
|
20
|
-
response = Onebox::Helpers.fetch_response(url,
|
20
|
+
response = Onebox::Helpers.fetch_response(url, headers: http_params) rescue nil
|
21
21
|
html = Nokogiri::HTML(response)
|
22
22
|
twitter_data = {}
|
23
23
|
html.css('meta').each do |m|
|
@@ -13,11 +13,36 @@ module Onebox
|
|
13
13
|
WIDTH ||= 480
|
14
14
|
HEIGHT ||= 360
|
15
15
|
|
16
|
-
def
|
17
|
-
|
16
|
+
def parse_embed_response
|
17
|
+
return unless video_id
|
18
|
+
return @parse_embed_response if defined?(@parse_embed_response)
|
19
|
+
|
20
|
+
embed_url = "https://www.youtube.com/embed/#{video_id}"
|
21
|
+
@embed_doc ||= Onebox::Helpers.fetch_html_doc(embed_url)
|
22
|
+
|
23
|
+
begin
|
24
|
+
script_tag = @embed_doc.xpath('//script').find { |tag| tag.to_s.include?('ytcfg.set') }.to_s
|
25
|
+
match = script_tag.to_s.match(/ytcfg\.set\((?<json>.*)\)/)
|
26
|
+
|
27
|
+
yt_json = ::JSON.parse(match[:json])
|
28
|
+
renderer = ::JSON.parse(yt_json['PLAYER_VARS']['embedded_player_response'])['embedPreview']['thumbnailPreviewRenderer']
|
29
|
+
|
30
|
+
title = renderer['title']['runs'].first['text']
|
31
|
+
|
32
|
+
image = "https://img.youtube.com/vi/#{video_id}/hqdefault.jpg"
|
33
|
+
rescue
|
34
|
+
return
|
35
|
+
end
|
36
|
+
|
37
|
+
@parse_embed_response = { image: image, title: title }
|
38
|
+
end
|
18
39
|
|
40
|
+
def placeholder_html
|
19
41
|
if video_id || list_id
|
20
|
-
|
42
|
+
result = parse_embed_response
|
43
|
+
result ||= get_opengraph.data
|
44
|
+
|
45
|
+
"<img src='#{result[:image]}' width='#{WIDTH}' height='#{HEIGHT}' title='#{result[:title]}'>"
|
21
46
|
else
|
22
47
|
to_html
|
23
48
|
end
|
@@ -52,7 +77,10 @@ module Onebox
|
|
52
77
|
end
|
53
78
|
|
54
79
|
def video_title
|
55
|
-
@video_title ||=
|
80
|
+
@video_title ||= begin
|
81
|
+
result = parse_embed_response || get_opengraph.data
|
82
|
+
result[:title]
|
83
|
+
end
|
56
84
|
end
|
57
85
|
|
58
86
|
private
|
data/lib/onebox/helpers.rb
CHANGED
@@ -7,7 +7,7 @@ module Onebox
|
|
7
7
|
|
8
8
|
class DownloadTooLarge < StandardError; end
|
9
9
|
|
10
|
-
IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com']
|
10
|
+
IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com', 'youtube.com']
|
11
11
|
|
12
12
|
def self.symbolize_keys(hash)
|
13
13
|
return {} if hash.nil?
|
@@ -24,8 +24,8 @@ module Onebox
|
|
24
24
|
html.gsub(/<[^>]+>/, ' ').gsub(/\n/, '')
|
25
25
|
end
|
26
26
|
|
27
|
-
def self.fetch_html_doc(url, headers = nil)
|
28
|
-
response = (fetch_response(url,
|
27
|
+
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
|
28
|
+
response = (fetch_response(url, headers: headers, body_cacher: body_cacher) rescue nil)
|
29
29
|
doc = Nokogiri::HTML(response)
|
30
30
|
uri = Addressable::URI.parse(url)
|
31
31
|
|
@@ -37,7 +37,7 @@ module Onebox
|
|
37
37
|
canonical_link = doc.at('//link[@rel="canonical"]/@href')
|
38
38
|
canonical_uri = Addressable::URI.parse(canonical_link)
|
39
39
|
if canonical_link && "#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}"
|
40
|
-
response = (fetch_response(canonical_uri.to_s,
|
40
|
+
response = (fetch_response(canonical_uri.to_s, headers: headers, body_cacher: body_cacher) rescue nil)
|
41
41
|
doc = Nokogiri::HTML(response) if response
|
42
42
|
end
|
43
43
|
end
|
@@ -45,16 +45,23 @@ module Onebox
|
|
45
45
|
doc
|
46
46
|
end
|
47
47
|
|
48
|
-
def self.fetch_response(location,
|
48
|
+
def self.fetch_response(location, redirect_limit: 5, domain: nil, headers: nil, body_cacher: nil)
|
49
|
+
redirect_limit = Onebox.options.redirect_limit if redirect_limit > Onebox.options.redirect_limit
|
49
50
|
|
50
|
-
|
51
|
-
limit = Onebox.options.redirect_limit if limit > Onebox.options.redirect_limit
|
52
|
-
|
53
|
-
raise Net::HTTPError.new('HTTP redirect too deep', location) if limit == 0
|
51
|
+
raise Net::HTTPError.new('HTTP redirect too deep', location) if redirect_limit == 0
|
54
52
|
|
55
53
|
uri = Addressable::URI.parse(location)
|
56
54
|
uri = Addressable::URI.join(domain, uri) if !uri.host
|
57
55
|
|
56
|
+
use_body_cacher = body_cacher && body_cacher.respond_to?('fetch_cached_response_body')
|
57
|
+
if use_body_cacher
|
58
|
+
response_body = body_cacher.fetch_cached_response_body(uri.to_s)
|
59
|
+
|
60
|
+
if response_body.present?
|
61
|
+
return response_body
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
58
65
|
result = StringIO.new
|
59
66
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.normalized_scheme == 'https') do |http|
|
60
67
|
http.open_timeout = Onebox.options.connect_timeout
|
@@ -86,9 +93,9 @@ module Onebox
|
|
86
93
|
response.error! unless [301, 302].include?(code)
|
87
94
|
return fetch_response(
|
88
95
|
response['location'],
|
89
|
-
|
90
|
-
"#{uri.scheme}://#{uri.host}",
|
91
|
-
redir_header
|
96
|
+
redirect_limit: redirect_limit - 1,
|
97
|
+
domain: "#{uri.scheme}://#{uri.host}",
|
98
|
+
headers: redir_header
|
92
99
|
)
|
93
100
|
end
|
94
101
|
|
@@ -98,6 +105,10 @@ module Onebox
|
|
98
105
|
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
|
99
106
|
end
|
100
107
|
|
108
|
+
if use_body_cacher && body_cacher.cache_response_body?(uri)
|
109
|
+
body_cacher.cache_response_body(uri.to_s, result.string)
|
110
|
+
end
|
111
|
+
|
101
112
|
return result.string
|
102
113
|
end
|
103
114
|
end
|
@@ -178,6 +189,10 @@ module Onebox
|
|
178
189
|
url.gsub!("'", "&apos;")
|
179
190
|
url.gsub!('"', "&quot;")
|
180
191
|
url.gsub!(/[^\w\-`.~:\/?#\[\]@!$&'\(\)*+,;=%\p{M}’]/, "")
|
192
|
+
|
193
|
+
parsed = Addressable::URI.parse(url)
|
194
|
+
return "" unless parsed.host
|
195
|
+
|
181
196
|
url
|
182
197
|
end
|
183
198
|
|
data/lib/onebox/open_graph.rb
CHANGED
@@ -32,7 +32,8 @@ module Onebox
|
|
32
32
|
if method_name.end_with?(*integer_suffixes)
|
33
33
|
value.to_i
|
34
34
|
elsif method_name.end_with?(*url_suffixes)
|
35
|
-
|
35
|
+
result = Onebox::Helpers.normalize_url_for_output(value)
|
36
|
+
result unless Onebox::Helpers::blank?(result)
|
36
37
|
else
|
37
38
|
value
|
38
39
|
end
|
data/lib/onebox/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: onebox
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.5
|
4
|
+
version: 2.2.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joanna Zeta
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-
|
13
|
+
date: 2021-04-07 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: addressable
|