link_preview 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +109 -0
- data/Rakefile +15 -0
- data/lib/faraday/follow_redirects.rb +155 -0
- data/lib/link_preview/configuration.rb +86 -0
- data/lib/link_preview/content.rb +350 -0
- data/lib/link_preview/http_client.rb +91 -0
- data/lib/link_preview/http_crawler.rb +114 -0
- data/lib/link_preview/null_crawler.rb +46 -0
- data/lib/link_preview/parser.rb +172 -0
- data/lib/link_preview/spec_helper.rb +1 -0
- data/lib/link_preview/uri.rb +149 -0
- data/lib/link_preview/version.rb +23 -0
- data/lib/link_preview.rb +51 -0
- data/spec/files/requests/bad_utf8.yml +186 -0
- data/spec/files/requests/elasticsearch.yml +2258 -0
- data/spec/files/requests/ggp_png.yml +256 -0
- data/spec/files/requests/kaltura.yml +3612 -0
- data/spec/files/requests/kaltura_opengraph.yml +1266 -0
- data/spec/files/requests/ogp_me.yml +880 -0
- data/spec/files/requests/sliderocket.yml +387 -0
- data/spec/files/requests/support_apple_com.yml +833 -0
- data/spec/files/requests/youtube.yml +3513 -0
- data/spec/files/requests/youtube_404.yml +1055 -0
- data/spec/link_preview/http_crawler_spec.rb +50 -0
- data/spec/link_preview/uri_spec.rb +99 -0
- data/spec/link_preview_spec.rb +383 -0
- data/spec/spec_helper.rb +39 -0
- data/spec/support/link_preview/link_preview_stubs.rb +26 -0
- metadata +241 -0
@@ -0,0 +1,350 @@
|
|
1
|
+
# Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights to
|
6
|
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
# of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
# so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in all
|
11
|
+
# copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
# SOFTWARE.
|
20
|
+
|
21
|
+
require 'link_preview/uri'
|
22
|
+
require 'link_preview/parser'
|
23
|
+
require 'link_preview/http_crawler'
|
24
|
+
require 'link_preview/null_crawler'
|
25
|
+
|
26
|
+
require 'active_support/core_ext/object'
|
27
|
+
|
28
|
+
module LinkPreview
|
29
|
+
class Content
  # Preview attributes; a public reader method is generated for each one below.
  PROPERTIES = [
    :title,
    :description,
    :site_name,
    :site_url,
    :image_url,
    :image_data,
    :image_content_type,
    :image_file_name,
    :content_url,
    :content_type,
    :content_width,
    :content_height
  ]

  # Known metadata sources, in the order they are consulted when reading a
  # property (see #get_property).
  SOURCES = [:initial, :image, :oembed, :opengraph, :html]

  # Per-source mapping from a property name to the key (or candidate keys, in
  # order of preference) under which that source stores it.
  SOURCE_PROPERTIES_TABLE = {
    :oembed => {
      :site_name => :provider_name,
      :site_url  => :provider_url,
      :image_url => :thumbnail_url
    },
    :opengraph => {
      :image_url      => [:image_secure_url, :image, :image_url],
      :content_url    => [:video_secure_url, :video, :video_url],
      :content_type   => :video_type,
      :content_width  => :video_width,
      :content_height => :video_height
    }
  }

  # Reverse of SOURCE_PROPERTIES_TABLE: source => { source key => property }.
  # Kernel#Array is enough here because every table value is a Symbol or an
  # Array of Symbols.
  PROPERTIES_SOURCE_TABLE =
    Hash.new { |h, k| h[k] = {} }.tap do |reverse_property_table|
      SOURCE_PROPERTIES_TABLE.each do |source, table|
        table.each_pair do |property, source_keys|
          Array(source_keys).each do |source_key|
            reverse_property_table[source][source_key] = property
          end
        end
      end
    end

  # @param config      configuration object handed to the crawler and parser
  # @param content_uri URI of the content to preview; enqueued for crawling
  # @param options [Hash] per-request options (:allow_requests, :width, :height
  #   are read in this class; the hash is also passed to crawler/parser/URI)
  # @param sources [Hash] pre-populated properties keyed by source name;
  #   it is read but not modified
  def initialize(config, content_uri, options = {}, sources = {})
    @config = config
    @content_uri = content_uri
    @options = options
    @sources = Hash.new { |h, k| h[k] = {} }
    crawler.enqueue!(@content_uri)

    add_source_properties!(sources)
  end

  # @return [String] permalink URL of resource
  def url
    extract(:url) || @content_uri
  end

  PROPERTIES.each do |property|
    define_method(property) do
      extract(property)
    end
  end

  # @return [Boolean] true if the crawler reports that at least one related
  #   content URI has been fetched successfully
  def found?
    extract_all
    crawler.success?
  end

  # @return [Boolean] true if no source holds a non-blank property value
  def empty?
    extract_all
    # Check the stored values; iterating the Hash itself yields [key, value]
    # pairs, which are always "present" regardless of the value.
    SOURCES.none? do |source|
      @sources[source].values.any?(&:present?)
    end
  end

  # @return [Hash] properties collected so far, keyed by source
  attr_reader :sources

  # @return [Hash] oEmbed-style representation; values already supplied by the
  #   oembed source win over the ones synthesized here
  def as_oembed
    if content_type == 'application/x-shockwave-flash'
      @sources[:oembed].reverse_merge(as_oembed_video)
    else
      @sources[:oembed].reverse_merge(as_oembed_link)
    end
  end

  protected

  # Crawler used to fetch content; a NullCrawler when requests are disabled
  # through options[:allow_requests].
  def crawler
    @crawler ||= @options.fetch(:allow_requests, true) ?
      LinkPreview::HTTPCrawler.new(@config, @options) :
      LinkPreview::NullCrawler.new(@config, @options)
  end

  def parser
    @parser ||= LinkPreview::Parser.new(@config, @options)
  end

  def parsed_url
    LinkPreview::URI.parse(url, @options)
  end

  # @return value of the matching default_<property> method, or nil if this
  #   class defines none for the property
  def default_property(property)
    if respond_to?("default_#{property}", true)
      send("default_#{property}")
    end
  end

  # called via default_property
  def default_title
    parsed_url.for_display.to_s
  end

  # called via default_property
  def default_site_name
    parsed_url.host
  end

  # called via default_property
  def default_site_url
    if parsed_url.scheme && parsed_url.host
      "#{parsed_url.scheme}://#{parsed_url.host}"
    end
  end

  # Dispatches to normalize_<property> when defined, otherwise to
  # #normalize_generic.
  def normalize_property(property, value)
    if respond_to?("normalize_#{property}", true)
      send("normalize_#{property}", value)
    else
      normalize_generic(property, value)
    end
  end

  # Strings are trimmed and stripped of markup, arrays are normalized
  # element-wise (nil elements dropped), anything else is returned untouched.
  def normalize_generic(property, value)
    case value
    when String
      strip_html(value.strip)
    when Array
      value.compact.map { |elem| normalize_property(property, elem) }
    else
      value
    end
  end

  # called via normalize_property
  # Resolves the image URL against the content URI and enqueues it for crawling.
  def normalize_image_url(partial_image_url)
    return unless partial_image_url
    parsed_partial_image_url = LinkPreview::URI.parse(partial_image_url, @options)
    parsed_absolute_image_url = parsed_partial_image_url.to_absolute(@content_uri)
    parsed_absolute_image_url.to_s.tap do |absolute_image_url|
      crawler.enqueue!(absolute_image_url, :image)
    end
  end

  # called via normalize_property
  # Resolves the URL against the content URI and enqueues it for crawling.
  def normalize_url(partial_url)
    return unless partial_url
    partial_unencoded_url = LinkPreview::URI.unescape(partial_url)
    parsed_partial_url = LinkPreview::URI.parse(partial_unencoded_url, @options)
    parsed_absolute_url = parsed_partial_url.to_absolute(@content_uri)
    crawler.enqueue!(parsed_absolute_url, :html)
    parsed_absolute_url.for_display.to_s
  end

  # called via normalize_property
  def normalize_content_url(content_url)
    return unless content_url
    LinkPreview::URI.safe_escape(content_url).to_s
  end

  # called via normalize_property
  def normalize_title(title)
    CGI.unescapeHTML(title)
  end

  # called via normalize_property
  def normalize_html(html)
    html
  end

  # @return first non-nil value for the property across SOURCES, falling back
  #   to the property's default (which may be nil)
  def get_property(property)
    source_values(property).compact.first || default_property(property)
  end

  # @return [Boolean] true if any source holds a non-blank value for the property
  def has_property?(property)
    source_values(property).any?(&:present?)
  end

  # Value stored for the property in each source, in SOURCES order (nil where
  # the source has none).
  def source_values(property)
    SOURCES.map do |source|
      @sources[source][property_alias(source, property)]
    end
  end

  # @return first alias of the property that the given source actually holds,
  #   or nil
  def property_alias(source, property)
    property_aliases(source, property).detect { |candidate| @sources[source].key?(candidate) }
  end

  def property_aliases(source, property)
    Array(SOURCE_PROPERTIES_TABLE.fetch(source, {}).fetch(property, property))
  end

  def property_unalias(source, property)
    PROPERTIES_SOURCE_TABLE.fetch(source, {}).fetch(property, property)
  end

  # Queue priorities handed to the crawler when looking for the property.
  def property_source_priority(property)
    case property
    when :description
      [:html, :oembed, :default]
    when :image_data, :image_content_type, :image_file_name
      [:image, :oembed, :default]
    else
      [:oembed, :html, :image, :default]
    end
  end

  # Merges properties into @sources: unknown sources, empty property sets and
  # blank values are skipped, and values already stored are kept. Works on
  # symbolized copies so the hashes passed in are left untouched. Afterwards
  # every URI the parser has discovered is enqueued.
  def add_source_properties!(sources)
    sources.symbolize_keys.each do |source, properties|
      next unless SOURCES.include?(source)
      next if properties.empty?
      properties.symbolize_keys.each do |property, value|
        next if value.blank?
        next if @sources[source][property]
        @sources[source][property] = normalize_property(property_unalias(source, property), value)
      end
    end
    parser.discovered_uris.each do |uri|
      crawler.enqueue!(uri)
    end
  end

  # Crawls until the property shows up or the crawler has nothing left, then
  # returns the property (or its default).
  def extract(property)
    until crawler.finished?
      break if has_property?(property)
      data = crawler.dequeue!(property_source_priority(property))
      properties = parser.parse(data)
      add_source_properties!(properties)
    end
    get_property(property)
  end

  def extract_all
    PROPERTIES.each do |property|
      send(property)
    end
  end

  # FIXME this is expensive
  def strip_html(value)
    Nokogiri::HTML(value).xpath('//text()').remove.to_s
  end

  def as_oembed_link
    {
      :version       => '1.0',
      :provider_name => site_name,
      :provider_url  => site_url,
      :title         => title,
      :description   => description,
      :type          => 'link',
      :thumbnail_url => image_url
    }.reject { |_, v| v.nil? }
  end

  def as_oembed_video
    as_oembed_link.merge({
      :type   => 'video',
      :html   => content_html,
      :width  => content_width_scaled.to_i,
      :height => content_height_scaled.to_i
    })
  end

  # @return [String, nil] single-line <object>/<embed> markup for the content,
  #   or nil when there is no content URL
  def content_html
    return nil unless content_url.present?

    # The URL and MIME type originate from crawled pages, so escape them before
    # placing them inside attribute values.
    movie_url = CGI.escapeHTML(content_url.to_s)
    mime_type = CGI.escapeHTML(content_type.to_s)
    width     = content_width_scaled
    height    = content_height_scaled

    <<-EOF.strip.gsub(/\s+/, ' ').gsub(/>\s+</, '><')
      <object width="#{width}" height="#{height}">
        <param name="movie" value="#{movie_url}"></param>
        <param name="allowScriptAccess" value="always"></param>
        <param name="allowFullScreen" value="true"></param>
        <embed src="#{movie_url}"
               type="#{mime_type}"
               allowscriptaccess="always"
               allowfullscreen="true"
               width="#{width}" height="#{height}"></embed>
      </object>
    EOF
  end

  # @return [Integer] width to render at; always an Integer, even when the
  #   requested size arrives as a String
  def content_width_scaled
    requested_width  = @options[:width].to_i
    requested_height = @options[:height].to_i

    # Width takes precedence over height
    if requested_width > 0
      requested_width
    elsif requested_height > 0 && content_height.to_i > 0
      # Compute scaled width using the ratio of requested height to actual height, round up to prevent truncation
      ((requested_height.to_f / content_height.to_i) * content_width.to_i).ceil
    else
      content_width.to_i
    end
  end

  # @return [Integer] height to render at; always an Integer, even when the
  #   requested size arrives as a String
  def content_height_scaled
    requested_width  = @options[:width].to_i
    requested_height = @options[:height].to_i

    # Width takes precedence over height
    if requested_width > 0 && content_width.to_i > 0
      # Compute scaled height using the ratio of requested width to actual width, round up to prevent truncation
      ((requested_width.to_f / content_width.to_i) * content_height.to_i).ceil
    elsif requested_height > 0
      requested_height
    else
      content_height.to_i
    end
  end
end
|
350
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights to
|
6
|
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
# of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
# so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in all
|
11
|
+
# copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
# SOFTWARE.
|
20
|
+
|
21
|
+
require 'faraday'
|
22
|
+
require 'faraday/follow_redirects'
|
23
|
+
|
24
|
+
module LinkPreview
|
25
|
+
class ExtraEnv < Faraday::Middleware
  # Class-level slot for the data that #call publishes to the rest of the
  # middleware stack as env[:link_preview].
  def self.extra
    @extra
  end

  def self.extra=(value)
    @extra = value
  end

  # Exposes the class-level extra data (or an empty hash) on the env for the
  # duration of the downstream call, and clears it again afterwards even when
  # the downstream call raises.
  def call(env)
    env[:link_preview] = self.class.extra || {}
    @app.call(env)
  ensure
    env[:link_preview] = nil
  end
end
|
37
|
+
|
38
|
+
class NormalizeURI < Faraday::Middleware
  # Replaces the request URL with its normalized form before handing the
  # request to the next middleware.
  def call(env)
    normalized_url = env[:url].normalize
    env[:url] = normalized_url
    @app.call(env)
  end
end
|
44
|
+
|
45
|
+
class ForceUTF8Body < Faraday::Middleware
  # Makes sure a textual response body is a valid UTF-8 string, in place.
  #
  # Nothing is done when there is no String body, when the body already is
  # valid UTF-8, or when the Content-Type header does not mention "text".
  # Otherwise:
  # * if the bytes are valid UTF-8 that is merely tagged with another encoding
  #   (for example binary), the string is retagged and no character is lost;
  # * if not, the body is transcoded from binary with every byte that has no
  #   UTF-8 mapping dropped. That leaves ASCII only, which is always valid,
  #   so no further clean-up pass is required.
  #
  # @param env the Faraday response env
  def force_utf8_body!(env)
    body = env[:body]
    return unless body.is_a?(String)
    return if body.encoding == Encoding::UTF_8 && body.valid_encoding?

    headers = env[:response_headers]
    return unless headers && headers[:content_type] =~ /text/

    original_encoding = body.encoding
    body.force_encoding(Encoding::UTF_8)
    return if body.valid_encoding?

    # Not UTF-8 after all: restore the tag and fall back to the lossy clean-up.
    body.force_encoding(original_encoding)
    body.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  end

  def call(env)
    @app.call(env).on_complete do |response_env|
      force_utf8_body!(response_env)
    end
  end
end
|
64
|
+
|
65
|
+
class HTTPClient
  extend Forwardable

  # @param config configuration object supplying timeouts, redirect settings,
  #   extra middleware and the HTTP adapter
  def initialize(config)
    @config = config
  end

  # Performs a GET through the lazily built Faraday connection; arguments and
  # block are passed straight through.
  def get(*args, &block)
    faraday_connection.get(*args, &block)
  end

  private

  # Builds the Faraday connection on first use and reuses it afterwards.
  # Stack order: ExtraEnv, optional redirect following, URI normalization,
  # UTF-8 enforcement, configured middleware, and finally the adapter.
  def faraday_connection
    return @faraday_connection if @faraday_connection

    @faraday_connection = Faraday.new do |stack|
      stack.options[:timeout]      = @config.timeout
      stack.options[:open_timeout] = @config.open_timeout

      stack.use ExtraEnv
      if @config.follow_redirects
        stack.use Faraday::FollowRedirects, limit: @config.max_redirects
      end
      stack.use NormalizeURI
      stack.use ForceUTF8Body
      @config.middleware.each { |extra_middleware| stack.use extra_middleware }

      stack.use @config.http_adapter
    end
  end
end
|
91
|
+
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights to
|
6
|
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
# of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
# so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in all
|
11
|
+
# copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
# SOFTWARE.
|
20
|
+
|
21
|
+
require 'link_preview'
|
22
|
+
require 'link_preview/uri'
|
23
|
+
|
24
|
+
module LinkPreview
|
25
|
+
# Mixin that lets a response object carry the URL it was fetched from.
module ResponseWithURL
  attr_reader :url
  attr_writer :url
end
|
28
|
+
|
29
|
+
class HTTPCrawler
  # @param config  configuration object; max_requests, http_client and
  #   error_handler are read here
  # @param options [Hash] per-request options, passed to URI parsing and
  #   exposed to the HTTP middleware stack during requests
  def initialize(config, options = {})
    @config = config
    @options = options
    @status = {}                            # uri => HTTP status of fetched URIs
    @queue = Hash.new { |h, k| h[k] = [] }  # priority => pending URIs
  end

  # Queues the oEmbed and content variants of the URI, unless the crawler is
  # full or the URI is nil. URIs already fetched or queued are not added again.
  #
  # @param uri [String] URI of content to crawl
  # @param priority [Symbol] queue the content URI is added to
  def enqueue!(uri, priority = :default)
    return if full?
    return unless uri
    parsed_uri = LinkPreview::URI.parse(uri, @options)

    oembed_uri = parsed_uri.as_oembed_uri
    enqueue_uri(oembed_uri, :oembed) if oembed_uri

    content_uri = parsed_uri.as_content_uri
    enqueue_uri(content_uri, priority) if content_uri
  end

  # Fetches the next queued URI, preferring the given priorities in order.
  # Failures are passed to the configured error handler and recorded as
  # status 500; an empty Faraday::Response is returned in that case.
  #
  # @return the response, tagged with the URL that was requested (on both the
  #   success and the failure path), or nil when nothing is queued
  def dequeue!(priority_order = [])
    return if finished?
    uri = dequeue_by_priority(priority_order)
    with_extra_env do
      @config.http_client.get(uri).tap do |response|
        response.extend ResponseWithURL
        response.url = uri
        @status[uri] = response.status.to_i
      end
    end
  rescue => e
    # uri is still nil when the failure happened before a URI was chosen
    @status[uri] ||= 500 if uri
    @config.error_handler.call(e)
    Faraday::Response.new.tap do |failed_response|
      failed_response.extend ResponseWithURL
      failed_response.url = uri
    end
  end

  # @return [Boolean] true if any content discovered thus far has been successfully fetched
  def success?
    @status.any? { |_, status| status == 200 }
  end

  # @return [Boolean] true if all known discovered content has been crawled
  def finished?
    @queue.values.all?(&:empty?)
  end

  # @return [Boolean] true if the crawler is at capacity
  def full?
    # NOTE(review): only pending URIs are counted and the comparison is
    # strict, so more than max_requests requests can be made over the life of
    # the crawler. Confirm whether that is the intended meaning of max_requests.
    @queue.values.flatten.size > @config.max_requests
  end

  private

  # Takes the next URI from the first non-empty queue in priority_order,
  # falling back to any non-empty queue.
  def dequeue_by_priority(priority_order)
    chosen = priority_order.detect { |candidate| @queue[candidate].any? }
    chosen ||= @queue.keys.detect { |candidate| @queue[candidate].any? }
    @queue[chosen].shift
  end

  def enqueue_uri(parsed_uri, priority = :default)
    uri = parsed_uri.to_s
    unless processed?(uri) || enqueued?(uri)
      @queue[priority] << uri
    end
  end

  def processed?(uri)
    @status.key?(uri)
  end

  def enqueued?(uri)
    @queue.values.any? { |pending| pending.include?(uri) }
  end

  # Makes @options visible to the HTTP middleware stack (through ExtraEnv)
  # while the block runs, and clears it afterwards.
  def with_extra_env
    LinkPreview::ExtraEnv.extra = @options
    yield
  ensure
    LinkPreview::ExtraEnv.extra = nil
  end
end
|
114
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights to
|
6
|
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
# of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
# so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in all
|
11
|
+
# copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
# SOFTWARE.
|
20
|
+
|
21
|
+
require 'link_preview'
|
22
|
+
|
23
|
+
module LinkPreview
|
24
|
+
# Crawler stand-in that never performs a request: enqueueing and dequeueing
# do nothing, and it always reports itself as successful, finished and not full.
class NullCrawler
  # Accepts the same arguments as the real crawler and ignores them.
  def initialize(config, options = {}); end

  # No-op; returns nil.
  def enqueue!(uri, priority = :default); end

  # No-op; returns nil.
  def dequeue!(priority_order = []); end

  def success?; true; end

  def finished?; true; end

  def full?; false; end
end
|
46
|
+
end
|