link_preview 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,350 @@
1
+ # Copyright (c) 2014, VMware, Inc. All Rights Reserved.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ # of the Software, and to permit persons to whom the Software is furnished to do
8
+ # so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'link_preview/uri'
22
+ require 'link_preview/parser'
23
+ require 'link_preview/http_crawler'
24
+ require 'link_preview/null_crawler'
25
+
26
+ require 'active_support/core_ext/object'
27
+
28
+ module LinkPreview
29
+ class Content
30
# Canonical content properties; each one is exposed as a reader method.
PROPERTIES = [
  :title,
  :description,
  :site_name,
  :site_url,
  :image_url,
  :image_data,
  :image_content_type,
  :image_file_name,
  :content_url,
  :content_type,
  :content_width,
  :content_height
]

# Known property sources. When a property is read, the first source in this
# order that holds a value wins.
SOURCES = [:initial, :image, :oembed, :opengraph, :html]

# Per source: canonical property name => the name (or ordered list of names)
# that source uses for it. Properties not listed keep their canonical name.
SOURCE_PROPERTIES_TABLE = {
  :oembed => {
    :site_name => :provider_name,
    :site_url  => :provider_url,
    :image_url => :thumbnail_url
  },
  :opengraph => {
    :image_url      => [:image_secure_url, :image, :image_url],
    :content_url    => [:video_secure_url, :video, :video_url],
    :content_type   => :video_type,
    :content_width  => :video_width,
    :content_height => :video_height
  }
}

# Reverse of SOURCE_PROPERTIES_TABLE: per source, source-specific name =>
# canonical property name.
PROPERTIES_SOURCE_TABLE =
  SOURCE_PROPERTIES_TABLE.each_with_object(Hash.new { |h, k| h[k] = {} }) do |(source, names_by_property), reverse_table|
    names_by_property.each_pair do |property, source_names|
      Array(source_names).each do |source_name|
        reverse_table[source][source_name] = property
      end
    end
  end
74
+
75
# Sets up a preview for +content_uri+ and queues that URI for crawling.
#
# @param config configuration object handed to the crawler and parser
# @param content_uri URI of the content to preview
# @param options [Hash] per-request options (e.g. :allow_requests, :width, :height)
# @param sources [Hash] already-known properties, keyed by source name
def initialize(config, content_uri, options = {}, sources = {})
  @config, @content_uri, @options = config, content_uri, options
  # Every source lazily gets its own property hash on first access.
  @sources = Hash.new { |store, source| store[source] = {} }
  crawler.enqueue!(@content_uri)

  add_source_properties!(sources)
end
84
+
85
# @return [String] permalink URL of the resource; falls back to the URI this
#   content was created with when no :url property can be extracted
def url
  extracted = extract(:url)
  extracted || @content_uri
end
89
+
90
# Define one public reader per property; each reader extracts its value on demand.
PROPERTIES.each do |name|
  define_method(name) { extract(name) }
end
95
+
96
# Extracts every property, then reports the crawler's overall success.
#
# @return [Boolean] true if at least one related content URI has been successfully fetched
def found?
  extract_all
  fetched = crawler.success?
  fetched
end
101
+
102
# Extracts every property, then checks whether anything usable was found.
#
# @return [Boolean] true if no source holds a present (non-blank) property value
def empty?
  extract_all
  SOURCES.none? do |source|
    # Inspect the stored values: iterating the Hash itself yields [key, value]
    # pairs, which are always present and would hide blank values.
    @sources[source].values.any?(&:present?)
  end
end
109
+
110
# @return [Hash] the properties collected so far, keyed by source name
#   (the live internal hash, not a copy)
def sources
  @sources
end
113
+
114
# Renders the content as an oEmbed-style hash. Properties stored under the
# :oembed source take precedence; generated video or link fields fill the gaps.
#
# @return [Hash]
def as_oembed
  flash = content_type == 'application/x-shockwave-flash'
  generated = flash ? as_oembed_video : as_oembed_link
  @sources[:oembed].reverse_merge(generated)
end
121
+
122
+ protected
123
+
124
# Lazily builds the crawler: an HTTP crawler unless the :allow_requests
# option is false, in which case a null crawler is used.
def crawler
  return @crawler if @crawler
  crawler_class =
    if @options.fetch(:allow_requests, true)
      LinkPreview::HTTPCrawler
    else
      LinkPreview::NullCrawler
    end
  @crawler = crawler_class.new(@config, @options)
end
129
+
130
# Lazily builds the parser shared by all extractions of this content.
def parser
  return @parser if @parser
  @parser = LinkPreview::Parser.new(@config, @options)
end
133
+
134
# Parses the current permalink (see #url) with the request options.
# Not memoized: every call extracts and parses again.
def parsed_url
  current_url = url
  LinkPreview::URI.parse(current_url, @options)
end
137
+
138
# Looks up a fallback for +property+ by calling default_<property> when such
# a method (public or not) exists.
#
# @return the fallback value, or nil when no default_<property> method exists
def default_property(property)
  fallback_method = "default_#{property}"
  send(fallback_method) if respond_to?(fallback_method, true)
end
143
+
144
# Fallback title: the display form of the content URL.
# Called via default_property.
def default_title
  display_uri = parsed_url.for_display
  display_uri.to_s
end
148
+
149
# Fallback site name: the host of the content URL.
# Called via default_property.
def default_site_name
  uri = parsed_url
  uri.host
end
153
+
154
# Fallback site URL: "<scheme>://<host>" of the content URL.
# Called via default_property.
#
# @return [String, nil] nil when the URL has no scheme or no host
def default_site_url
  # Parse once: each parsed_url call extracts and parses the URL again.
  uri = parsed_url
  scheme = uri.scheme
  host = uri.host
  "#{scheme}://#{host}" if scheme && host
end
160
+
161
# Normalizes +value+ with normalize_<property> when that method exists
# (public or not), otherwise with normalize_generic.
def normalize_property(property, value)
  specialized = "normalize_#{property}"
  return send(specialized, value) if respond_to?(specialized, true)
  normalize_generic(property, value)
end
168
+
169
# Default normalization: strings are stripped of surrounding whitespace and
# HTML, arrays are normalized element by element (nil elements dropped), and
# any other value is returned unchanged.
def normalize_generic(property, value)
  if value.is_a?(String)
    strip_html(value.strip)
  elsif value.is_a?(Array)
    value.compact.map { |item| normalize_property(property, item) }
  else
    value
  end
end
179
+
180
# Resolves a possibly relative image URL against the content URI and queues
# the absolute URL for crawling with :image priority.
# Called via normalize_property.
#
# @return [String, nil] the absolute image URL, or nil when none was given
def normalize_image_url(partial_image_url)
  return unless partial_image_url
  parsed = LinkPreview::URI.parse(partial_image_url, @options)
  absolute_image_url = parsed.to_absolute(@content_uri).to_s
  crawler.enqueue!(absolute_image_url, :image)
  absolute_image_url
end
189
+
190
# Unescapes a possibly relative URL, resolves it against the content URI,
# queues the result for crawling with :html priority, and returns its
# display form. Called via normalize_property.
#
# @return [String, nil] nil when no URL was given
def normalize_url(partial_url)
  return unless partial_url
  unescaped = LinkPreview::URI.unescape(partial_url)
  absolute = LinkPreview::URI.parse(unescaped, @options).to_absolute(@content_uri)
  crawler.enqueue!(absolute, :html)
  absolute.for_display.to_s
end
199
+
200
# Escapes the content URL with LinkPreview::URI.safe_escape and returns it
# as a String. Called via normalize_property.
#
# @return [String, nil] nil when no URL was given
def normalize_content_url(content_url)
  LinkPreview::URI.safe_escape(content_url).to_s if content_url
end
205
+
206
# Decodes HTML entities in the title. Called via normalize_property, so
# titles do not go through normalize_generic.
def normalize_title(title)
  decoded = CGI.unescapeHTML(title)
  decoded
end
210
+
211
# Returns the :html property untouched, so embed markup does not go through
# normalize_generic. Called via normalize_property.
def normalize_html(html)
  html
end
215
+
216
# Reads +property+ from the collected sources, consulted in SOURCES order.
#
# @return the first non-nil stored value, or the property's default when
#   there is none (or when that value is false)
def get_property(property)
  candidates = SOURCES.map do |source|
    @sources[source][property_alias(source, property)]
  end
  stored = candidates.compact.first
  return stored if stored
  default_property(property)
end
221
+
222
# @return [Boolean] true if any source holds a present (non-blank) value
#   for +property+
def has_property?(property)
  candidates = SOURCES.map do |source|
    @sources[source][property_alias(source, property)]
  end
  candidates.any? { |candidate| candidate.present? }
end
227
+
228
# @return the first source-specific name for +property+ that +source+
#   actually has a key for, or nil when it has none
def property_alias(source, property)
  property_aliases(source, property).find do |candidate|
    @sources[source].key?(candidate)
  end
end
231
+
232
# @return [Array] the names +source+ may use for +property+; just the
#   property itself when SOURCE_PROPERTIES_TABLE has no mapping
def property_aliases(source, property)
  source_table = SOURCE_PROPERTIES_TABLE.fetch(source, {})
  mapped = source_table.fetch(property, property)
  Array.wrap(mapped)
end
235
+
236
# @return the canonical property name for a source-specific name; the name
#   itself when PROPERTIES_SOURCE_TABLE has no mapping
def property_unalias(source, property)
  reverse_table = PROPERTIES_SOURCE_TABLE.fetch(source, {})
  reverse_table.fetch(property, property)
end
239
+
240
# @return [Array<Symbol>] crawl queue priorities to try, in order, when
#   +property+ is still missing
def property_source_priority(property)
  if :description == property
    [:html, :oembed, :default]
  elsif [:image_data, :image_content_type, :image_file_name].include?(property)
    [:image, :oembed, :default]
  else
    [:oembed, :html, :image, :default]
  end
end
250
+
251
# Merges newly discovered properties into the collected sources, then
# queues every URI the parser has discovered.
#
# Unknown sources, empty property sets and blank values are ignored. A
# property already stored for a source is never overwritten. Values are
# normalized under their canonical property name before being stored.
#
# @param sources [Hash] properties keyed by source name (String or Symbol keys)
def add_source_properties!(sources)
  # Work on copies so the caller's hashes are never modified.
  usable_sources = sources.symbolize_keys.select do |source, properties|
    SOURCES.include?(source) && !properties.empty?
  end
  usable_sources.each do |source, properties|
    properties.symbolize_keys.each do |property, value|
      next if value.blank?
      next if @sources[source][property]
      @sources[source][property] = normalize_property(property_unalias(source, property), value)
    end
  end
  parser.discovered_uris.each do |uri|
    crawler.enqueue!(uri)
  end
end
267
+
268
# Crawls and parses queued URIs until +property+ is available or the
# crawler has nothing left, then returns the property (or its default).
def extract(property)
  until crawler.finished? || has_property?(property)
    response = crawler.dequeue!(property_source_priority(property))
    add_source_properties!(parser.parse(response))
  end
  get_property(property)
end
277
+
278
# Forces extraction of every property by calling each property reader once.
def extract_all
  PROPERTIES.each { |name| send(name) }
end
283
+
284
# Keeps only the text nodes of +value+, dropping any markup.
# FIXME: this is expensive -- every call parses +value+ with Nokogiri.
def strip_html(value)
  document = Nokogiri::HTML(value)
  text_nodes = document.xpath('//text()')
  text_nodes.remove.to_s
end
288
+
289
# Builds an oEmbed 'link' hash from the extracted properties, leaving out
# every field whose value is nil.
#
# @return [Hash]
def as_oembed_link
  fields = {
    :version       => '1.0',
    :provider_name => site_name,
    :provider_url  => site_url,
    :title         => title,
    :description   => description,
    :type          => 'link',
    :thumbnail_url => image_url
  }
  fields.reject { |_, field| field.nil? }
end
300
+
301
# Builds an oEmbed 'video' hash: the link fields plus the generated embed
# markup and its integer dimensions.
#
# @return [Hash]
def as_oembed_video
  link_fields = as_oembed_link
  link_fields.merge(
    :type   => 'video',
    :html   => content_html,
    :width  => content_width_scaled.to_i,
    :height => content_height_scaled.to_i
  )
end
308
+
309
# Builds single-line <object>/<embed> markup for the content URL.
#
# @return [String, nil] the embed markup, or nil when there is no content URL
def content_html
  return nil unless content_url.present?

  # Escape every interpolated value: these come from crawled content or
  # caller-supplied options and end up inside HTML attribute values.
  url    = CGI.escapeHTML(content_url.to_s)
  type   = CGI.escapeHTML(content_type.to_s)
  width  = CGI.escapeHTML(content_width_scaled.to_s)
  height = CGI.escapeHTML(content_height_scaled.to_s)

  # Collapse the template to one line and drop whitespace between tags.
  <<-EOF.strip.gsub(/\s+/, ' ').gsub(/>\s+</, '><')
    <object width="#{width}" height="#{height}">
      <param name="movie" value="#{url}"></param>
      <param name="allowScriptAccess" value="always"></param>
      <param name="allowFullScreen" value="true"></param>
      <embed src="#{url}"
        type="#{type}"
        allowscriptaccess="always"
        allowfullscreen="true"
        width="#{width}" height="#{height}"></embed>
    </object>
  EOF
end
325
+
326
# Width to render the content at.
#
# Uses the :width option when positive; otherwise scales the content width by
# the ratio of the :height option to the content height; otherwise falls back
# to the content width.
#
# @return [Integer]
def content_width_scaled
  requested_width = @options[:width].to_i
  # Width takes precedence over height
  if requested_width > 0
    # Return the coerced Integer, never the raw option (which may be a String).
    requested_width
  elsif @options[:height].to_i > 0 && content_height.to_i > 0
    # Compute scaled width using the ratio of requested height to actual height, round up to prevent truncation
    ((@options[:height].to_i.to_f / content_height.to_i) * content_width.to_i).ceil
  else
    content_width.to_i
  end
end
337
+
338
# Height to render the content at.
#
# Scales the content height by the ratio of the :width option to the content
# width when both are positive; otherwise uses the :height option when
# positive; otherwise falls back to the content height.
#
# @return [Integer]
def content_height_scaled
  requested_height = @options[:height].to_i
  # Width takes precedence over height
  if @options[:width].to_i > 0 && content_width.to_i > 0
    # Compute scaled height using the ratio of requested width to actual width, round up to prevent truncation
    ((@options[:width].to_i.to_f / content_width.to_i) * content_height.to_i).ceil
  elsif requested_height > 0
    # Return the coerced Integer, never the raw option (which may be a String).
    requested_height
  else
    content_height.to_i
  end
end
349
+ end
350
+ end
@@ -0,0 +1,91 @@
1
+ # Copyright (c) 2014, VMware, Inc. All Rights Reserved.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ # of the Software, and to permit persons to whom the Software is furnished to do
8
+ # so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'faraday'
22
+ require 'faraday/follow_redirects'
23
+
24
+ module LinkPreview
25
# Faraday middleware that exposes a class-level hash to the rest of the stack
# as env[:link_preview] for the duration of a request.
class ExtraEnv < Faraday::Middleware
  class << self
    # Hash made available to requests; stored on the class, so it is shared
    # by every instance.
    attr_accessor :extra
  end

  # Sets env[:link_preview] (an empty hash when no extra is set), runs the
  # rest of the stack, and always resets the entry to nil afterwards.
  def call(env)
    current_extra = self.class.extra
    env[:link_preview] = current_extra || {}
    @app.call(env)
  ensure
    env[:link_preview] = nil
  end
end
37
+
38
# Faraday middleware that replaces the request URL with its normalized form
# before handing the request to the rest of the stack.
class NormalizeURI < Faraday::Middleware
  def call(env)
    normalized_url = env[:url].normalize
    env[:url] = normalized_url
    @app.call(env)
  end
end
44
+
45
+ class ForceUTF8Body < Faraday::Middleware
46
# Transcodes a text response body to valid UTF-8 in place.
#
# Bodies that are already valid UTF-8, and responses whose Content-Type does
# not mention "text", are left untouched. The body is converted from
# 'binary', so bytes with no UTF-8 mapping are dropped.
#
# @param env Faraday env; reads env[:body] and env[:response_headers]
def force_utf8_body!(env)
  body = env[:body]
  # A response without a body has nothing to transcode.
  return if body.nil?
  return if body.encoding == Encoding::UTF_8 && body.valid_encoding?
  return unless env[:response_headers][:content_type] =~ /text/
  body.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  unless body.valid_encoding?
    # cleanse untrusted invalid bytes with a double transcode as suggested here:
    # http://stackoverflow.com/questions/2982677/ruby-1-9-invalid-byte-sequence-in-utf-8
    body.encode!('UTF-16', 'binary', invalid: :replace, undef: :replace, replace: '')
    body.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  end
end
57
+
58
# Runs the rest of the stack and forces the body to UTF-8 once the response
# has completed.
def call(env)
  response = @app.call(env)
  response.on_complete { |response_env| force_utf8_body!(response_env) }
end
63
+ end
64
+
65
# Thin HTTP client: #get is forwarded to a lazily built Faraday connection
# configured from the given config object.
class HTTPClient
  extend Forwardable

  def_delegator :faraday_connection, :get

  # @param config object providing timeout, open_timeout, follow_redirects,
  #   max_redirects, middleware and http_adapter
  def initialize(config)
    @config = config
  end

  private

  # Builds the connection on first use and reuses it afterwards.
  # Middleware is registered in this order: ExtraEnv, redirect following (only
  # when enabled), NormalizeURI, ForceUTF8Body, the configured middleware, and
  # finally the configured HTTP adapter.
  def faraday_connection
    return @faraday_connection if @faraday_connection
    @faraday_connection = Faraday.new do |connection|
      connection.options[:timeout] = @config.timeout
      connection.options[:open_timeout] = @config.open_timeout

      connection.use ExtraEnv
      if @config.follow_redirects
        connection.use Faraday::FollowRedirects, limit: @config.max_redirects
      end
      connection.use NormalizeURI
      connection.use ForceUTF8Body
      @config.middleware.each { |extra_middleware| connection.use extra_middleware }

      connection.use @config.http_adapter
    end
  end
end
91
+ end
@@ -0,0 +1,114 @@
1
+ # Copyright (c) 2014, VMware, Inc. All Rights Reserved.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ # of the Software, and to permit persons to whom the Software is furnished to do
8
+ # so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'link_preview'
22
+ require 'link_preview/uri'
23
+
24
+ module LinkPreview
25
# Mixed into individual response objects so they can carry the URI they
# were fetched from.
module ResponseWithURL
  attr_reader :url
  attr_writer :url
end
28
+
29
# Queue-driven crawler: URIs are queued under a priority, fetched one at a
# time through the configured HTTP client, and their status codes recorded.
class HTTPCrawler
  # @param config object providing http_client, error_handler and max_requests
  # @param options [Hash] per-request options passed to URI parsing and
  #   exposed to the HTTP middleware while a request runs
  def initialize(config, options = {})
    @config = config
    @options = options
    @status = {}                                             # uri => status code
    @queue = Hash.new { |queues, priority| queues[priority] = [] }
  end

  # Queues the oEmbed URI (with :oembed priority) and the content URI (with
  # the given priority) derived from +uri+. Ignored when the crawler is full
  # or +uri+ is nil/false.
  #
  # @param uri [String] URI of content to crawl
  # @param priority [Symbol] queue to place the content URI in
  def enqueue!(uri, priority = :default)
    return if full? || !uri
    parsed_uri = LinkPreview::URI.parse(uri, @options)

    oembed_uri = parsed_uri.as_oembed_uri
    enqueue_uri(oembed_uri, :oembed) if oembed_uri

    content_uri = parsed_uri.as_content_uri
    enqueue_uri(content_uri, priority) if content_uri
  end

  # Fetches the next queued URI, preferring the given priorities in order.
  # On any StandardError the URI is recorded as status 500 (unless a status
  # was already recorded), the configured error handler is called, and an
  # empty Faraday::Response is returned.
  #
  # @return the response, extended with ResponseWithURL; nil when nothing is queued
  def dequeue!(priority_order = [])
    return if finished?
    uri = dequeue_by_priority(priority_order)
    with_extra_env do
      response = @config.http_client.get(uri)
      response.extend(ResponseWithURL)
      response.url = uri
      @status[uri] = response.status.to_i
      response
    end
  rescue => e
    @status[uri] ||= 500
    @config.error_handler.call(e)
    Faraday::Response.new
  end

  # @return [Boolean] true if any content discovered thus far has been successfully fetched
  def success?
    @status.each_value.any? { |code| code == 200 }
  end

  # @return [Boolean] true if all known discovered content has been crawled
  def finished?
    @queue.each_value.all?(&:empty?)
  end

  # NOTE(review): only URIs still waiting in the queue are counted, and the
  # comparison is strict, so max_requests + 1 URIs can be queued -- confirm
  # this is the intended meaning of max_requests.
  #
  # @return [Boolean] true if the crawler is at capacity
  def full?
    queued = @queue.each_value.inject(0) { |count, uris| count + uris.size }
    queued > @config.max_requests
  end

  private

  # Removes and returns the next URI: from the first non-empty queue named in
  # +priority_order+, else from the first non-empty queue of any priority.
  def dequeue_by_priority(priority_order)
    chosen = priority_order.detect { |candidate| @queue[candidate].any? }
    chosen ||= @queue.keys.detect { |candidate| @queue[candidate].any? }
    @queue[chosen].shift
  end

  # Queues the URI unless it was already fetched or is already waiting.
  def enqueue_uri(parsed_uri, priority = :default)
    uri = parsed_uri.to_s
    return if processed?(uri) || enqueued?(uri)
    @queue[priority] << uri
  end

  def processed?(uri)
    @status.key?(uri)
  end

  def enqueued?(uri)
    @queue.each_value.any? { |uris| uris.include?(uri) }
  end

  # Publishes the request options through LinkPreview::ExtraEnv.extra while
  # the block runs and always clears them afterwards. The value lives on the
  # ExtraEnv class, so it is shared by every crawler in the process.
  def with_extra_env
    LinkPreview::ExtraEnv.extra = @options
    yield
  ensure
    LinkPreview::ExtraEnv.extra = nil
  end
end
114
+ end
@@ -0,0 +1,46 @@
1
+ # Copyright (c) 2014, VMware, Inc. All Rights Reserved.
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights to
6
+ # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ # of the Software, and to permit persons to whom the Software is furnished to do
8
+ # so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'link_preview'
22
+
23
+ module LinkPreview
24
# Crawler stand-in that never performs requests: queueing and dequeueing do
# nothing, and it always reports itself as successful, finished and not full.
class NullCrawler
  # Accepts the same arguments as the HTTP crawler and ignores them.
  def initialize(config, options = {}); end

  # @return [nil] nothing is ever queued
  def enqueue!(uri, priority = :default)
    nil
  end

  # @return [nil] there is never anything to fetch
  def dequeue!(priority_order = [])
    nil
  end

  # @return [true]
  def success?
    true
  end

  # @return [true]
  def finished?
    true
  end

  # @return [false]
  def full?
    false
  end
end
46
+ end