link_preview 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +109 -0
- data/Rakefile +15 -0
- data/lib/faraday/follow_redirects.rb +155 -0
- data/lib/link_preview/configuration.rb +86 -0
- data/lib/link_preview/content.rb +350 -0
- data/lib/link_preview/http_client.rb +91 -0
- data/lib/link_preview/http_crawler.rb +114 -0
- data/lib/link_preview/null_crawler.rb +46 -0
- data/lib/link_preview/parser.rb +172 -0
- data/lib/link_preview/spec_helper.rb +1 -0
- data/lib/link_preview/uri.rb +149 -0
- data/lib/link_preview/version.rb +23 -0
- data/lib/link_preview.rb +51 -0
- data/spec/files/requests/bad_utf8.yml +186 -0
- data/spec/files/requests/elasticsearch.yml +2258 -0
- data/spec/files/requests/ggp_png.yml +256 -0
- data/spec/files/requests/kaltura.yml +3612 -0
- data/spec/files/requests/kaltura_opengraph.yml +1266 -0
- data/spec/files/requests/ogp_me.yml +880 -0
- data/spec/files/requests/sliderocket.yml +387 -0
- data/spec/files/requests/support_apple_com.yml +833 -0
- data/spec/files/requests/youtube.yml +3513 -0
- data/spec/files/requests/youtube_404.yml +1055 -0
- data/spec/link_preview/http_crawler_spec.rb +50 -0
- data/spec/link_preview/uri_spec.rb +99 -0
- data/spec/link_preview_spec.rb +383 -0
- data/spec/spec_helper.rb +39 -0
- data/spec/support/link_preview/link_preview_stubs.rb +26 -0
- metadata +241 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9ecbcafc85c04588af5ed7ef77f971756f97418e
|
4
|
+
data.tar.gz: f77f0214f0de673b6920e22d37b64f94adb41e00
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 711a72d409dafcde725db0de6acefd785fc968cec14914f5098630e79978ebd7f3dc0bf7e419d0b3c8af48ee0e0943d1b8dd95bac864c731c661f66d97023112
|
7
|
+
data.tar.gz: 43db146cbef88b3d00724c740e14979b9317b50d2a85cc4f13fa67f9b6ea94e0a590b26eb3044b94dac2fd61c391fef77b50033830e619010fc81d2139fd25b6
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
[![Build Status](https://secure.travis-ci.org/socialcast/link_preview.png?branch=master)](http://travis-ci.org/socialcast/link_preview)
|
2
|
+
[![Code Climate](https://codeclimate.com/github/socialcast/link_preview.png)](https://codeclimate.com/github/socialcast/link_preview)
|
3
|
+
|
4
|
+
link_preview
|
5
|
+
==============
|
6
|
+
|
7
|
+
Generate an [oEmbed](http://oembed.com/) response for any URL.
|
8
|
+
|
9
|
+
Usage
|
10
|
+
------
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
content = LinkPreview.fetch(url)
|
14
|
+
content.as_oembed
|
15
|
+
```
|
16
|
+
|
17
|
+
Serialize content sources:
|
18
|
+
```ruby
|
19
|
+
content.sources
|
20
|
+
```
|
21
|
+
|
22
|
+
Load previous content via sources:
|
23
|
+
```ruby
|
24
|
+
previous_content = LinkPreview.load_content(url, options, content.sources)
|
25
|
+
```
|
26
|
+
|
27
|
+
Features
|
28
|
+
--------
|
29
|
+
- Designed to make the minimal number of HTTP requests to generate a preview
|
30
|
+
- Configurable via [Faraday](https://github.com/lostisland/faraday) middleware
|
31
|
+
- Battle-tested on a wide variety of URLs and HTML in the wild
|
32
|
+
- Includes test helper for stubbing `LinkPreview::Content`
|
33
|
+
|
34
|
+
Installation
|
35
|
+
-------------
|
36
|
+
```shell
|
37
|
+
gem install link_preview
|
38
|
+
```
|
39
|
+
|
40
|
+
Configuration
|
41
|
+
--------------
|
42
|
+
LinkPreview is configured via [`Faraday`](https://github.com/lostisland/faraday) with some additional middleware:
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
# $RAILS_ROOT/config/initializers/link_preview.rb
|
46
|
+
|
47
|
+
# Cache responses in Rails.cache
|
48
|
+
class HTTPCache < Faraday::Middleware
|
49
|
+
CACHE_PREFIX = name
|
50
|
+
EXPIRES_IN = 10.minutes
|
51
|
+
|
52
|
+
def call(env)
|
53
|
+
url = env[:url].to_s
|
54
|
+
Rails.cache.fetch("#{CACHE_PREFIX}::#{url}", :expires_in => EXPIRES_IN) do
|
55
|
+
@app.call(env)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Report unknown exceptions to Airbrake
|
61
|
+
module ErrorHandler
|
62
|
+
IGNORED_EXCEPTIONS = [
|
63
|
+
IOError,
|
64
|
+
SocketError,
|
65
|
+
Timeout::Error,
|
66
|
+
Errno::ECONNREFUSED,
|
67
|
+
Errno::ECONNRESET,
|
68
|
+
Errno::EHOSTUNREACH,
|
69
|
+
Errno::ENETUNREACH,
|
70
|
+
Errno::ETIMEDOUT,
|
71
|
+
Net::ProtocolError,
|
72
|
+
Net::NetworkTimeoutError,
|
73
|
+
OpenSSL::SSL::SSLError
|
74
|
+
]
|
75
|
+
|
76
|
+
class << self
|
77
|
+
def error_handler(e)
|
78
|
+
case e
|
79
|
+
when *IGNORED_EXCEPTIONS
|
80
|
+
# Ignore
|
81
|
+
else
|
82
|
+
Airbrake.notify_or_ignore(e)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
LinkPreview.configure do |config|
|
89
|
+
config.http_adapter = Faraday::Adapter::NetHttp
|
90
|
+
config.max_requests = 10
|
91
|
+
config.follow_redirects = true
|
92
|
+
config.middleware = HTTPCache
|
93
|
+
config.error_handler = ErrorHandler.method(:error_handler)
|
94
|
+
end
|
95
|
+
```
|
96
|
+
|
97
|
+
Contributing
|
98
|
+
--------------
|
99
|
+
* Fork the project
|
100
|
+
* Fix the issue
|
101
|
+
* Add unit tests
|
102
|
+
* Submit a pull request on GitHub
|
103
|
+
|
104
|
+
See CONTRIBUTORS.txt for list of project contributors
|
105
|
+
|
106
|
+
Copyright
|
107
|
+
---------
|
108
|
+
Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
109
|
+
See LICENSE.txt for further details.
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
|
3
|
+
begin
|
4
|
+
require 'bundler/setup'
|
5
|
+
rescue LoadError
|
6
|
+
puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
|
7
|
+
end
|
8
|
+
|
9
|
+
Bundler::GemHelper.install_tasks
|
10
|
+
|
11
|
+
require 'rspec/core/rake_task'
|
12
|
+
|
13
|
+
RSpec::Core::RakeTask.new(:spec)
|
14
|
+
|
15
|
+
task :default => :spec
|
@@ -0,0 +1,155 @@
|
|
1
|
+
# Copyright (c) 2011 Erik Michaels-Ober, Wynn Netherland, et al.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
# a copy of this software and associated documentation files (the
|
5
|
+
# "Software"), to deal in the Software without restriction, including
|
6
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
# the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be
|
12
|
+
# included in all copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
21
|
+
|
22
|
+
# NOTE faraday-middleware is not compatible with faraday 0.9.0
|
23
|
+
# https://github.com/lostisland/faraday_middleware/pull/59
|
24
|
+
#
|
25
|
+
# Copied from https://github.com/lostisland/faraday_middleware
|
26
|
+
require 'faraday'
|
27
|
+
require 'set'
|
28
|
+
|
29
|
+
module Faraday
|
30
|
+
# Public: Error raised when another redirect would have to be followed but no
# follows remain. Carries the response of the last, unfollowed redirect.
class RedirectLimitReached < Faraday::Error::ClientError
  # The redirect response that was not followed.
  attr_reader :response

  # response - The last redirect response; its 'location' header is quoted in
  #            the error message.
  def initialize(response)
    location = response['location']
    super("too many redirects; last one to: #{location}")
    # Assigned after `super` so the value set here is the one that sticks.
    @response = response
  end
end
|
39
|
+
|
40
|
+
# Public: Follow HTTP 301, 302, 303, and 307 redirects.
#
# For HTTP 301, 302, and 303, the original GET, POST, PUT, DELETE, or PATCH
# request gets converted into a GET. With `:standards_compliant => true`,
# however, the HTTP method after 301/302 remains unchanged. This allows you
# to opt into HTTP/1.1 compliance and act unlike the major web browsers.
#
# This middleware currently only works with synchronous requests; i.e. it
# doesn't support parallelism.
class FollowRedirects < Faraday::Middleware
  # HTTP methods for which 30x redirects can be followed
  ALLOWED_METHODS = Set.new [:head, :options, :get, :post, :put, :patch, :delete]
  # HTTP redirect status codes that this middleware implements
  REDIRECT_CODES = Set.new [301, 302, 303, 307]
  # Keys in env hash which will get cleared between requests
  ENV_TO_CLEAR = Set.new [:status, :response, :response_headers]

  # Default value for max redirects followed
  FOLLOW_LIMIT = 3

  # Public: Initialize the middleware.
  #
  # app     - The next middleware/adapter in the stack.
  # options - An options Hash (default: {}):
  #           :limit               - A Numeric redirect limit (default: 3)
  #           :standards_compliant - A Boolean indicating whether to respect
  #                                  the HTTP spec when following 301/302
  #                                  (default: false)
  #           :cookies             - An Array of Strings (e.g.
  #                                  ['cookie1', 'cookie2']) to choose
  #                                  cookies to be kept, or :all to keep
  #                                  all cookies (default: []).
  def initialize(app, options = {})
    super(app)
    @options = options

    # 303 always turns the follow-up request into a GET; 301/302 only do so
    # when standards compliance was not requested.
    @convert_to_get = Set.new [303]
    @convert_to_get << 301 << 302 unless standards_compliant?
  end

  # Public: Run the request, following redirects up to the configured limit.
  #
  # env - The request env.
  #
  # Returns the response of the last request performed.
  # Raises RedirectLimitReached when a redirect arrives with no follows left.
  def call(env)
    perform_with_redirection(env, follow_limit)
  end

  private

  # True when the redirected request must be re-issued as a GET: the status
  # is in @convert_to_get and the request method is not HEAD or OPTIONS.
  def convert_to_get?(response)
    ![:head, :options].include?(response.env[:method]) &&
      @convert_to_get.include?(response.status)
  end

  # Performs the request and, when the response is a followable redirect,
  # recurses with one fewer follow remaining.
  #
  # env     - The request env.
  # follows - Number of redirects that may still be followed.
  #
  # Returns the final response.
  def perform_with_redirection(env, follows)
    # Captured before the request runs so it can be re-sent on a redirect.
    # NOTE(review): presumably env[:body] no longer holds the request body
    # once the response has been written into env - confirm.
    request_body = env[:body]
    response = @app.call(env)

    response.on_complete do |response_env|
      if follow_redirect?(response_env, response)
        raise RedirectLimitReached, response if follows.zero?
        next_env = update_env(response_env, request_body, response)
        response = perform_with_redirection(next_env, follows - 1)
      end
    end
    response
  end

  # Rewrites env in place so it describes the follow-up request.
  #
  # Returns the same env object.
  def update_env(env, request_body, response)
    # NOTE(review): assumes env[:url] is a URI, so `+` resolves the Location
    # header against the current URL - confirm.
    env[:url] += response['location']

    # Must run before :response_headers is cleared below, since the cookies
    # are read from the response headers.
    if @options[:cookies]
      cookies = keep_cookies(env)
      env[:request_headers][:cookies] = cookies unless cookies.nil?
    end

    if convert_to_get?(response)
      env[:method] = :get
      env[:body] = nil
    else
      env[:body] = request_body
    end

    ENV_TO_CLEAR.each { |key| env.delete key }

    env
  end

  # True when the request method may be redirected and the response status
  # is one of the redirect codes this middleware implements.
  def follow_redirect?(env, response)
    ALLOWED_METHODS.include?(env[:method]) &&
      REDIRECT_CODES.include?(response.status)
  end

  # Maximum number of redirects to follow (option :limit, default FOLLOW_LIMIT).
  def follow_limit
    @options.fetch(:limit, FOLLOW_LIMIT)
  end

  # Returns the cookie string to carry over to the follow-up request: the
  # whole response cookie value for :all, otherwise only the configured ones.
  def keep_cookies(env)
    cookies = @options.fetch(:cookies, [])
    response_cookies = env[:response_headers][:cookies]
    cookies == :all ? response_cookies : selected_request_cookies(response_cookies)
  end

  # Selected cookies without the trailing ';' separator.
  def selected_request_cookies(cookies)
    selected_cookies(cookies)[0...-1]
  end

  # Builds "name=value;" segments for every configured cookie found in the
  # response cookie string.
  #
  # Configured cookies that are absent from the response - including the case
  # where the response carries no cookies at all (nil) - are skipped instead
  # of raising NoMethodError on the failed match.
  def selected_cookies(cookies)
    header = cookies.to_s
    Array(@options[:cookies]).each_with_object(''.dup) do |cookie, cookie_string|
      found = /#{cookie}=?[^;]*/.match(header)
      cookie_string << found[0] << ';' if found
    end
  end

  # Whether 301/302 should keep the original HTTP method (default: false).
  def standards_compliant?
    @options.fetch(:standards_compliant, false)
  end
end
|
155
|
+
end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
# Copyright (c) 2014, VMware, Inc. All Rights Reserved.
|
2
|
+
#
|
3
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
# of this software and associated documentation files (the "Software"), to deal
|
5
|
+
# in the Software without restriction, including without limitation the rights to
|
6
|
+
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
7
|
+
# of the Software, and to permit persons to whom the Software is furnished to do
|
8
|
+
# so, subject to the following conditions:
|
9
|
+
#
|
10
|
+
# The above copyright notice and this permission notice shall be included in all
|
11
|
+
# copies or substantial portions of the Software.
|
12
|
+
#
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19
|
+
# SOFTWARE.
|
20
|
+
|
21
|
+
require 'link_preview/http_client'
|
22
|
+
|
23
|
+
module LinkPreview
|
24
|
+
# Settings container for LinkPreview. Every reader falls back to a built-in
# default when no value has been assigned.
class Configuration
  # Plain writers; the matching readers below supply the defaults.
  attr_writer :http_client, :http_adapter, :follow_redirects, :max_redirects,
              :max_requests, :timeout, :open_timeout, :error_handler

  # Returns the assigned client, or lazily builds and memoizes an HTTPClient
  # constructed with this configuration.
  def http_client
    @http_client ||= HTTPClient.new(self)
  end

  # Returns the assigned adapter (default: Faraday::Adapter::NetHttp).
  def http_adapter
    @http_adapter ||= Faraday::Adapter::NetHttp
  end

  # Returns whether redirects should be followed (default: true).
  #
  # Only an unset (nil) value falls back to the default, so an explicit
  # `false` is honoured. `||=` would overwrite `false` with `true`.
  def follow_redirects
    @follow_redirects.nil? ? true : @follow_redirects
  end

  # Returns the configured redirect limit (default: 3).
  def max_redirects
    @max_redirects || 3
  end

  # Returns the configured request limit (default: 10).
  def max_requests
    @max_requests || 10
  end

  # Returns the timeout in seconds (default: 5).
  def timeout
    @timeout || 5 # seconds
  end

  # Returns the open timeout in seconds (default: 2).
  def open_timeout
    @open_timeout || 2 # seconds
  end

  # Returns the assigned error handler, or a memoized no-op proc that accepts
  # one argument and ignores it.
  def error_handler
    @error_handler ||= Proc.new { |_| }
  end

  # Returns the configured middleware list (default: []).
  def middleware
    @middleware || []
  end

  # Stores the middleware. The splat wraps the assigned value in an Array,
  # so `config.middleware = Foo` is stored as `[Foo]`.
  def middleware=(*middleware)
    @middleware = middleware
  end
end
|
86
|
+
end
|