onebox 2.2.5 → 2.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +1 -1
- data/lib/onebox.rb +1 -2
- data/lib/onebox/engine/allowlisted_generic_onebox.rb +2 -0
- data/lib/onebox/engine/amazon_onebox.rb +31 -11
- data/lib/onebox/engine/gfycat_onebox.rb +1 -1
- data/lib/onebox/engine/google_docs_onebox.rb +1 -1
- data/lib/onebox/engine/html.rb +5 -1
- data/lib/onebox/engine/pastebin_onebox.rb +1 -1
- data/lib/onebox/engine/twitter_status_onebox.rb +1 -1
- data/lib/onebox/engine/youtube_onebox.rb +32 -4
- data/lib/onebox/helpers.rb +27 -12
- data/lib/onebox/open_graph.rb +2 -1
- data/lib/onebox/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 00cdd8f25e8df4b1c992886e522e6dc6671973de81194796974f527c4f18aa7c
|
4
|
+
data.tar.gz: 96f9e2de815bef82f86170e0abf97234072c824d97a3322daef992e1dbc41a91
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 57ff4ab92abbd8f2a5cbbedb7a88f3d7a517e4c1c779810f1259b58cc60c3591c43e9d4782984ab56a1389b00ee3acbaebc6298208813b694a5f73255bc3c0a7
|
7
|
+
data.tar.gz: 8baea1e77f32c09724a03452ba6018721681e9832d03b23bbcf1bc61609a56cabd8098d114dd49c66c468919b1d1c4cee4cc97212a6ac97fac229bc8f4eb886b
|
data/.github/workflows/ci.yml
CHANGED
data/lib/onebox.rb
CHANGED
@@ -227,8 +227,10 @@ module Onebox
|
|
227
227
|
d[:image] = d[:image_secure_url] || d[:image_url] || d[:thumbnail_url] || d[:image]
|
228
228
|
d[:image] = Onebox::Helpers::get_absolute_image_url(d[:image], @url)
|
229
229
|
d[:image] = Onebox::Helpers::normalize_url_for_output(html_entities.decode(d[:image]))
|
230
|
+
d[:image] = nil if Onebox::Helpers.blank?(d[:image])
|
230
231
|
|
231
232
|
d[:video] = d[:video_secure_url] || d[:video_url] || d[:video]
|
233
|
+
d[:video] = nil if Onebox::Helpers.blank?(d[:video])
|
232
234
|
|
233
235
|
d[:published_time] = d[:article_published_time] unless Onebox::Helpers.blank?(d[:article_published_time])
|
234
236
|
if !Onebox::Helpers.blank?(d[:published_time])
|
@@ -11,11 +11,25 @@ module Onebox
|
|
11
11
|
include HTML
|
12
12
|
|
13
13
|
always_https
|
14
|
-
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx)\//)
|
14
|
+
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)
|
15
15
|
|
16
16
|
def url
|
17
|
+
# If possible, fetch the cached HTML body immediately so we can
|
18
|
+
# try to grab the canonical URL from that document,
|
19
|
+
# rather than guess at the best URL structure to use
|
20
|
+
if body_cacher&.respond_to?('cache_response_body?')
|
21
|
+
if body_cacher.cache_response_body?(uri.to_s) && body_cacher.cached_response_body_exists?(uri.to_s)
|
22
|
+
@raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, body_cacher)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
if @raw
|
27
|
+
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
|
28
|
+
return canonical_link.to_s if canonical_link
|
29
|
+
end
|
30
|
+
|
17
31
|
if match && match[:id]
|
18
|
-
return "https://www.amazon.#{tld}/
|
32
|
+
return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
|
19
33
|
end
|
20
34
|
|
21
35
|
@url
|
@@ -26,16 +40,15 @@ module Onebox
|
|
26
40
|
end
|
27
41
|
|
28
42
|
def http_params
|
29
|
-
|
30
|
-
'User-Agent' =>
|
31
|
-
|
32
|
-
}
|
43
|
+
if @options && @options[:user_agent]
|
44
|
+
{ 'User-Agent' => @options[:user_agent] }
|
45
|
+
end
|
33
46
|
end
|
34
47
|
|
35
48
|
private
|
36
49
|
|
37
50
|
def match
|
38
|
-
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[
|
51
|
+
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
|
39
52
|
end
|
40
53
|
|
41
54
|
def image
|
@@ -50,6 +63,10 @@ module Onebox
|
|
50
63
|
end
|
51
64
|
|
52
65
|
if (landing_image = raw.css("#landingImage")) && landing_image.any?
|
66
|
+
attributes = landing_image.first.attributes
|
67
|
+
|
68
|
+
return attributes["data-old-hires"].to_s if attributes["data-old-hires"]
|
69
|
+
|
53
70
|
landing_image.first["src"].to_s
|
54
71
|
end
|
55
72
|
|
@@ -100,7 +117,7 @@ module Onebox
|
|
100
117
|
end
|
101
118
|
|
102
119
|
result = {
|
103
|
-
link:
|
120
|
+
link: url,
|
104
121
|
title: title,
|
105
122
|
by_info: authors,
|
106
123
|
image: og.image || image,
|
@@ -131,7 +148,7 @@ module Onebox
|
|
131
148
|
end
|
132
149
|
|
133
150
|
result = {
|
134
|
-
link:
|
151
|
+
link: url,
|
135
152
|
title: title,
|
136
153
|
by_info: authors,
|
137
154
|
image: og.image || image,
|
@@ -147,7 +164,7 @@ module Onebox
|
|
147
164
|
else
|
148
165
|
title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
|
149
166
|
result = {
|
150
|
-
link:
|
167
|
+
link: url,
|
151
168
|
title: title,
|
152
169
|
image: og.image || image,
|
153
170
|
price: price
|
@@ -157,7 +174,10 @@ module Onebox
|
|
157
174
|
result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]
|
158
175
|
|
159
176
|
summary = raw.at("#productDescription")
|
160
|
-
|
177
|
+
|
178
|
+
description = og.description || summary&.inner_text
|
179
|
+
description ||= raw.css("meta[name=description]").first&.[]("content")
|
180
|
+
result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
|
161
181
|
end
|
162
182
|
|
163
183
|
result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
|
@@ -47,7 +47,7 @@ module Onebox
|
|
47
47
|
end
|
48
48
|
|
49
49
|
def get_og_data
|
50
|
-
response = Onebox::Helpers.fetch_response(url, 10) rescue nil
|
50
|
+
response = Onebox::Helpers.fetch_response(url, redirect_limit: 10) rescue nil
|
51
51
|
html = Nokogiri::HTML(response)
|
52
52
|
og_data = {}
|
53
53
|
html.css('meta').each do |m|
|
data/lib/onebox/engine/html.rb
CHANGED
@@ -11,7 +11,11 @@ module Onebox
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def raw
|
14
|
-
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params)
|
14
|
+
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher)
|
15
|
+
end
|
16
|
+
|
17
|
+
def body_cacher
|
18
|
+
self.options&.[](:body_cacher)
|
15
19
|
end
|
16
20
|
|
17
21
|
def html?
|
@@ -31,7 +31,7 @@ module Onebox
|
|
31
31
|
|
32
32
|
def lines
|
33
33
|
return @lines if @lines
|
34
|
-
response = Onebox::Helpers.fetch_response("http://pastebin.com/raw/#{paste_key}", 1) rescue ""
|
34
|
+
response = Onebox::Helpers.fetch_response("http://pastebin.com/raw/#{paste_key}", redirect_limit: 1) rescue ""
|
35
35
|
@lines = response.split("\n")
|
36
36
|
end
|
37
37
|
|
@@ -17,7 +17,7 @@ module Onebox
|
|
17
17
|
private
|
18
18
|
|
19
19
|
def get_twitter_data
|
20
|
-
response = Onebox::Helpers.fetch_response(url,
|
20
|
+
response = Onebox::Helpers.fetch_response(url, headers: http_params) rescue nil
|
21
21
|
html = Nokogiri::HTML(response)
|
22
22
|
twitter_data = {}
|
23
23
|
html.css('meta').each do |m|
|
@@ -13,11 +13,36 @@ module Onebox
|
|
13
13
|
WIDTH ||= 480
|
14
14
|
HEIGHT ||= 360
|
15
15
|
|
16
|
-
def
|
17
|
-
|
16
|
+
def parse_embed_response
|
17
|
+
return unless video_id
|
18
|
+
return @parse_embed_response if defined?(@parse_embed_response)
|
19
|
+
|
20
|
+
embed_url = "https://www.youtube.com/embed/#{video_id}"
|
21
|
+
@embed_doc ||= Onebox::Helpers.fetch_html_doc(embed_url)
|
22
|
+
|
23
|
+
begin
|
24
|
+
script_tag = @embed_doc.xpath('//script').find { |tag| tag.to_s.include?('ytcfg.set') }.to_s
|
25
|
+
match = script_tag.to_s.match(/ytcfg\.set\((?<json>.*)\)/)
|
26
|
+
|
27
|
+
yt_json = ::JSON.parse(match[:json])
|
28
|
+
renderer = ::JSON.parse(yt_json['PLAYER_VARS']['embedded_player_response'])['embedPreview']['thumbnailPreviewRenderer']
|
29
|
+
|
30
|
+
title = renderer['title']['runs'].first['text']
|
31
|
+
|
32
|
+
image = "https://img.youtube.com/vi/#{video_id}/hqdefault.jpg"
|
33
|
+
rescue
|
34
|
+
return
|
35
|
+
end
|
36
|
+
|
37
|
+
@parse_embed_response = { image: image, title: title }
|
38
|
+
end
|
18
39
|
|
40
|
+
def placeholder_html
|
19
41
|
if video_id || list_id
|
20
|
-
|
42
|
+
result = parse_embed_response
|
43
|
+
result ||= get_opengraph.data
|
44
|
+
|
45
|
+
"<img src='#{result[:image]}' width='#{WIDTH}' height='#{HEIGHT}' title='#{result[:title]}'>"
|
21
46
|
else
|
22
47
|
to_html
|
23
48
|
end
|
@@ -52,7 +77,10 @@ module Onebox
|
|
52
77
|
end
|
53
78
|
|
54
79
|
def video_title
|
55
|
-
@video_title ||=
|
80
|
+
@video_title ||= begin
|
81
|
+
result = parse_embed_response || get_opengraph.data
|
82
|
+
result[:title]
|
83
|
+
end
|
56
84
|
end
|
57
85
|
|
58
86
|
private
|
data/lib/onebox/helpers.rb
CHANGED
@@ -7,7 +7,7 @@ module Onebox
|
|
7
7
|
|
8
8
|
class DownloadTooLarge < StandardError; end
|
9
9
|
|
10
|
-
IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com']
|
10
|
+
IGNORE_CANONICAL_DOMAINS ||= ['www.instagram.com', 'youtube.com']
|
11
11
|
|
12
12
|
def self.symbolize_keys(hash)
|
13
13
|
return {} if hash.nil?
|
@@ -24,8 +24,8 @@ module Onebox
|
|
24
24
|
html.gsub(/<[^>]+>/, ' ').gsub(/\n/, '')
|
25
25
|
end
|
26
26
|
|
27
|
-
def self.fetch_html_doc(url, headers = nil)
|
28
|
-
response = (fetch_response(url,
|
27
|
+
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
|
28
|
+
response = (fetch_response(url, headers: headers, body_cacher: body_cacher) rescue nil)
|
29
29
|
doc = Nokogiri::HTML(response)
|
30
30
|
uri = Addressable::URI.parse(url)
|
31
31
|
|
@@ -37,7 +37,7 @@ module Onebox
|
|
37
37
|
canonical_link = doc.at('//link[@rel="canonical"]/@href')
|
38
38
|
canonical_uri = Addressable::URI.parse(canonical_link)
|
39
39
|
if canonical_link && "#{canonical_uri.host}#{canonical_uri.path}" != "#{uri.host}#{uri.path}"
|
40
|
-
response = (fetch_response(canonical_uri.to_s,
|
40
|
+
response = (fetch_response(canonical_uri.to_s, headers: headers, body_cacher: body_cacher) rescue nil)
|
41
41
|
doc = Nokogiri::HTML(response) if response
|
42
42
|
end
|
43
43
|
end
|
@@ -45,16 +45,23 @@ module Onebox
|
|
45
45
|
doc
|
46
46
|
end
|
47
47
|
|
48
|
-
def self.fetch_response(location,
|
48
|
+
def self.fetch_response(location, redirect_limit: 5, domain: nil, headers: nil, body_cacher: nil)
|
49
|
+
redirect_limit = Onebox.options.redirect_limit if redirect_limit > Onebox.options.redirect_limit
|
49
50
|
|
50
|
-
|
51
|
-
limit = Onebox.options.redirect_limit if limit > Onebox.options.redirect_limit
|
52
|
-
|
53
|
-
raise Net::HTTPError.new('HTTP redirect too deep', location) if limit == 0
|
51
|
+
raise Net::HTTPError.new('HTTP redirect too deep', location) if redirect_limit == 0
|
54
52
|
|
55
53
|
uri = Addressable::URI.parse(location)
|
56
54
|
uri = Addressable::URI.join(domain, uri) if !uri.host
|
57
55
|
|
56
|
+
use_body_cacher = body_cacher && body_cacher.respond_to?('fetch_cached_response_body')
|
57
|
+
if use_body_cacher
|
58
|
+
response_body = body_cacher.fetch_cached_response_body(uri.to_s)
|
59
|
+
|
60
|
+
if response_body.present?
|
61
|
+
return response_body
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
58
65
|
result = StringIO.new
|
59
66
|
Net::HTTP.start(uri.host, uri.port, use_ssl: uri.normalized_scheme == 'https') do |http|
|
60
67
|
http.open_timeout = Onebox.options.connect_timeout
|
@@ -86,9 +93,9 @@ module Onebox
|
|
86
93
|
response.error! unless [301, 302].include?(code)
|
87
94
|
return fetch_response(
|
88
95
|
response['location'],
|
89
|
-
|
90
|
-
"#{uri.scheme}://#{uri.host}",
|
91
|
-
redir_header
|
96
|
+
redirect_limit: redirect_limit - 1,
|
97
|
+
domain: "#{uri.scheme}://#{uri.host}",
|
98
|
+
headers: redir_header
|
92
99
|
)
|
93
100
|
end
|
94
101
|
|
@@ -98,6 +105,10 @@ module Onebox
|
|
98
105
|
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
|
99
106
|
end
|
100
107
|
|
108
|
+
if use_body_cacher && body_cacher.cache_response_body?(uri)
|
109
|
+
body_cacher.cache_response_body(uri.to_s, result.string)
|
110
|
+
end
|
111
|
+
|
101
112
|
return result.string
|
102
113
|
end
|
103
114
|
end
|
@@ -178,6 +189,10 @@ module Onebox
|
|
178
189
|
url.gsub!("'", "&apos;")
|
179
190
|
url.gsub!('"', "&quot;")
|
180
191
|
url.gsub!(/[^\w\-`.~:\/?#\[\]@!$&'\(\)*+,;=%\p{M}’]/, "")
|
192
|
+
|
193
|
+
parsed = Addressable::URI.parse(url)
|
194
|
+
return "" unless parsed.host
|
195
|
+
|
181
196
|
url
|
182
197
|
end
|
183
198
|
|
data/lib/onebox/open_graph.rb
CHANGED
@@ -32,7 +32,8 @@ module Onebox
|
|
32
32
|
if method_name.end_with?(*integer_suffixes)
|
33
33
|
value.to_i
|
34
34
|
elsif method_name.end_with?(*url_suffixes)
|
35
|
-
|
35
|
+
result = Onebox::Helpers.normalize_url_for_output(value)
|
36
|
+
result unless Onebox::Helpers::blank?(result)
|
36
37
|
else
|
37
38
|
value
|
38
39
|
end
|
data/lib/onebox/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: onebox
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.2.
|
4
|
+
version: 2.2.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joanna Zeta
|
@@ -10,7 +10,7 @@ authors:
|
|
10
10
|
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date: 2021-
|
13
|
+
date: 2021-04-07 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: addressable
|