true_url 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +1 -0
- data/Rakefile +5 -0
- data/lib/true_url/context.rb +39 -0
- data/lib/true_url/strategy/dailymotion.rb +56 -0
- data/lib/true_url/strategy/nicovideo.rb +25 -0
- data/lib/true_url/strategy/twitter.rb +25 -0
- data/lib/true_url/strategy/vimeo.rb +28 -0
- data/lib/true_url/strategy/youtube.rb +39 -0
- data/lib/true_url/strategy.rb +23 -0
- data/lib/true_url/version.rb +3 -0
- data/lib/true_url.rb +140 -0
- data/spec/spec_helper.rb +96 -0
- data/spec/true_url_spec.rb +204 -0
- data/true_url.gemspec +22 -0
- metadata +151 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 496c04ece239c5302c9548b0a4b7d93630125172
|
4
|
+
data.tar.gz: 74c68ce403f401579bbc9b354b283595b5516302
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cfbf6e4099c911e66ff6061ecd98b0e98c60b8411659904d57d76c9bf1b4a2da207f0a19b9f11597ff8bf22a8ae716a7d49706b0a46730b6e824c99e970542cd
|
7
|
+
data.tar.gz: c186b1d04c781de99d61d64b9b3052e2f667ba140cba10c05db5e51ab0b43beddc0ee20d2318a534b84ed7ac8019294949a4fd18bcfe50a4492230431e13db6a
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Jonathan Wong
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# true_url
|
data/Rakefile
ADDED
data/lib/true_url/context.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
class TrueURL
  # State carried through one canonicalization run: the URL as it was supplied,
  # the URL currently being worked on, the caller's options, attributes
  # collected by strategies, and a flag saying whether a strategy has declared
  # the working URL final.
  class Context
    attr_reader :original_url, :options, :attributes, :working_url

    # @param original_url [String, Addressable::URI] the URL to canonicalize
    # @param options [Hash] stored as-is and exposed through #options
    def initialize(original_url, options)
      @original_url = parse(original_url)
      @options = options
      @finalized = false
      @attributes = {}

      set_working_url(original_url)
    end

    # Replaces the working URL.
    #
    # @param url [String, Addressable::URI] the new URL; may be relative when
    #   +base_url+ is given
    # @param base_url [String, Addressable::URI, nil] URL that +url+ is
    #   resolved against
    # @return [Addressable::URI] the new, normalized working URL
    def set_working_url(url, base_url = nil)
      candidate = base_url.nil? ? parse(url) : parse(base_url).join(parse(url))

      # If the URL has no scheme, then we assume HTTP
      if candidate.scheme.nil?
        candidate = url.to_s.start_with?('//') ? parse("http:#{url}") : parse("http://#{url}")
      end

      # URI#normalize returns a normalized copy and leaves its receiver alone,
      # so the copy has to be stored; calling it for effect did nothing.
      # Storing a copy also keeps later in-place edits of the working URL from
      # leaking into a URI object supplied by the caller.
      @working_url = candidate.normalize
    end

    # Marks the working URL as final.
    def finalize
      @finalized = true
    end

    # @return [Boolean] whether #finalize has been called
    def finalized?
      @finalized
    end

    private

    # Returns +url+ unchanged when it already is an Addressable::URI,
    # otherwise parses it into one.
    def parse(url)
      (url.is_a? Addressable::URI) ? url : Addressable::URI.parse(url)
    end
  end
end
|
data/lib/true_url/strategy/dailymotion.rb
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Rewrites DailyMotion video and playlist URLs (including dai.ly short
    # links and /embed/video/ player URLs) to their plain HTTPS form.
    class DailyMotion
      # Inspects the context's working URL. When a video or playlist id can be
      # extracted, the working URL is replaced with the plain URL for that id
      # and the context is finalized; for videos the embed URL is stored under
      # attributes[:embed_url]. The scheme is forced to HTTPS in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path

        if url.host == 'dai.ly'
          video_id = leading_id(path[1..-1])
        elsif path.start_with?('/video/')
          video_id = clean_video_id(path)
        elsif path.start_with?('/embed/video/')
          # Drop the "/embed" prefix so the path looks like a plain video path.
          # Other /embed/... paths are not videos and are left alone.
          video_id = clean_video_id(path[6..-1])
        elsif path.start_with?('/playlist/')
          playlist_id = clean_playlist_id(path)
        end

        # An empty id (e.g. "dai.ly/" or "/video/") is not an id: finalizing
        # on it would produce a URL that points at nothing.
        unless blank?(video_id)
          context.set_working_url("https://www.dailymotion.com/video/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.dailymotion.com/embed/video/#{video_id}"
        end

        unless blank?(playlist_id)
          context.set_working_url("https://www.dailymotion.com/playlist/#{playlist_id}")
          context.finalize
        end

        # DailyMotion supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end

      # Extracts the video id from a "/video/<id>[_<slug>]" path.
      #
      # @param path [String]
      # @return [String] the id, or "" when there is none
      def clean_video_id(path)
        leading_id(path[7..-1])
      end

      # Extracts the playlist id from a "/playlist/<id>[_<slug>][/<page>]" path.
      #
      # @param path [String]
      # @return [String] the id, or "" when there is none
      def clean_playlist_id(path)
        leading_id(path[10..-1])
      end

      private

      # Returns everything before the first "_" or "/", whichever comes first.
      def leading_id(segment)
        segment.to_s[/\A[^_\/]*/]
      end

      def blank?(id)
        id.nil? || id.empty?
      end
    end
  end
end
|
data/lib/true_url/strategy/nicovideo.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Rewrites Nico Nico Douga watch and thumb_watch URLs to the plain
    # www.nicovideo.jp watch URL.
    class NicoVideo
      # When the working URL's path starts with "/watch/" or "/thumb_watch/",
      # the third path segment is taken as the video id, the working URL is
      # replaced, the context is finalized and the embed URL is stored under
      # attributes[:embed_url]. The scheme is set to HTTP in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        path = context.working_url.path
        video_id = path.split('/')[2] if path.start_with?('/watch/', '/thumb_watch/')

        if video_id
          context.set_working_url("http://www.nicovideo.jp/watch/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "http://embed.nicovideo.jp/watch/#{video_id}"
        end

        # Nico Video only supports HTTP
        context.working_url.scheme = 'http'
      end
    end
  end
end
|
data/lib/true_url/strategy/twitter.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Collapses Twitter hashbang URLs and rewrites tweet URLs to
    # https://twitter.com/<lower-cased user>/status/<id>.
    class Twitter
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        hashbang = url.fragment

        # Special handling to collapse Twitter hashbang (#!) URLs: the part of
        # the fragment after "!" becomes the path and the fragment is dropped.
        if hashbang && hashbang.start_with?('!/')
          url.path = hashbang[1..-1]
          url.fragment = nil
        end

        tweet_path = context.working_url.path
        return unless tweet_path =~ /^\/\w+\/status\/\d+/

        # Segments are "", user, "status", id; anything after the id is dropped.
        _, user, _, tweet_id = tweet_path.split('/')
        context.set_working_url("https://twitter.com/#{user.downcase}/status/#{tweet_id}")
        context.finalize
      end
    end
  end
end
|
data/lib/true_url/strategy/vimeo.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Rewrites Vimeo video URLs (direct, channel and player/embed forms) to
    # https://vimeo.com/<id>.
    class Vimeo
      # When a numeric video id can be extracted from the working URL, the
      # working URL is replaced, the context is finalized and the player URL
      # is stored under attributes[:embed_url]. The scheme is forced to HTTPS
      # in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path

        video_id =
          if url.host == 'player.vimeo.com'
            # Only "/video/<digits>" is a player URL for a video. Slicing the
            # path at a fixed offset turned other player paths, or a bare
            # "/video/", into a bogus id.
            path[%r{\A/video/(\d+)}, 1]
          else
            # "/<digits>" or "/channels/<name>/<digits>"
            path[%r{\A/(?:channels/\w+/)?(\d+)\z}, 1]
          end

        if video_id
          context.set_working_url("https://vimeo.com/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://player.vimeo.com/video/#{video_id}"
        end

        # Vimeo supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end
    end
  end
end
|
data/lib/true_url/strategy/youtube.rb
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Rewrites YouTube video and playlist URLs (watch, youtu.be, embed and
    # embed/videoseries forms) to their plain www.youtube.com form.
    class YouTube
      # When a video or playlist id can be extracted from the working URL, the
      # working URL is replaced, the context is finalized and the embed URLs
      # are stored under attributes[:embed_url] and
      # attributes[:embed_url_private].
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path
        # query_values is nil for a URL without a query string; default to an
        # empty hash so "/watch" or "/playlist" without parameters is skipped
        # instead of raising NoMethodError.
        params = url.query_values || {}

        if url.host == 'youtu.be'
          video_id = path[1..-1]
        elsif path == '/watch'
          video_id = params['v']
        elsif path == '/playlist' || path.start_with?('/embed/videoseries')
          playlist_id = params['list']
        elsif path.start_with?('/embed/')
          video_id = path[7..-1]
        end

        # An empty id (e.g. "youtu.be/" or "/watch?v=") is treated as absent.
        unless blank?(video_id)
          context.set_working_url("https://www.youtube.com/watch?v=#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/#{video_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/#{video_id}"
        end

        unless blank?(playlist_id)
          context.set_working_url("https://www.youtube.com/playlist?list=#{playlist_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/videoseries?list=#{playlist_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/videoseries?list=#{playlist_id}"
        end
      end

      private

      def blank?(id)
        id.to_s.empty?
      end
    end
  end
end
|
data/lib/true_url/strategy.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'true_url/strategy/dailymotion'
|
2
|
+
require 'true_url/strategy/nicovideo'
|
3
|
+
require 'true_url/strategy/twitter'
|
4
|
+
require 'true_url/strategy/vimeo'
|
5
|
+
require 'true_url/strategy/youtube'
|
6
|
+
|
7
|
+
class TrueURL
  # Namespace for the site-specific URL rewriting strategies.
  module Strategy
    # Returns the built-in strategies as [host pattern, strategy] pairs.
    #
    # Patterns are matched against the working URL's host. The dots are
    # escaped and each pattern is anchored either at the start of the host or
    # at a label boundary, so "youtube.com" and "www.youtube.com" match while
    # look-alikes such as "notyoutube.com" or "youtubexcom" do not. Matching
    # is case-insensitive because host names are.
    #
    # @return [Array<Array(Regexp, Object)>] a fresh list on every call
    def self.default_list
      [
        [/(\A|\.)youtube\.com\z/i, TrueURL::Strategy::YouTube.new],
        [/(\A|\.)youtube-nocookie\.com\z/i, TrueURL::Strategy::YouTube.new],
        [/\Ayoutu\.be\z/i, TrueURL::Strategy::YouTube.new],
        [/(\A|\.)dailymotion\.com\z/i, TrueURL::Strategy::DailyMotion.new],
        [/\Adai\.ly\z/i, TrueURL::Strategy::DailyMotion.new],
        [/(\A|\.)vimeo\.com\z/i, TrueURL::Strategy::Vimeo.new],
        [/(\A|\.)nicovideo\.jp\z/i, TrueURL::Strategy::NicoVideo.new],
        [/\Anico\.ms\z/i, TrueURL::Strategy::NicoVideo.new],
        [/(\A|\.)twitter\.com\z/i, TrueURL::Strategy::Twitter.new]
      ]
    end
  end
end
|
data/lib/true_url.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
|
3
|
+
# Finds the canonical form of a URL.
#
# Site-specific strategies are tried first. When none of them finalizes the
# URL and fetching is enabled, the URL is fetched (following redirects) and a
# canonical URL is looked up in the response's Link header, then in the HTML
# (<link rel="canonical">, then og:url), falling back to the final response
# URI. The strategies are then run again on the result. Finally the scheme
# override is applied, the fragment is dropped and tracking query parameters
# are removed.
class TrueURL
  autoload :Version, 'true_url/version'
  autoload :Context, 'true_url/context'
  autoload :Strategy, 'true_url/strategy'

  attr_accessor :context, :strategies

  # Default options; values passed to #initialize override these.
  OPTIONS = {
    scheme_override: nil, # Possible choices: "https", "http", nil (preserve scheme)
    fetch: true # Whether to fetch the URL
  }.freeze

  # Query parameters that are stripped from the result.
  QUERY_VALUES_TO_REMOVE = %w(
    utm_source
    utm_medium
    utm_term
    utm_content
    utm_campaign
    sms_ss
    awesm
    xtor
    PHPSESSID
  ).freeze

  # @param url [String, Addressable::URI] the URL to canonicalize
  # @param options [Hash] overrides for OPTIONS
  def initialize(url, options = {})
    @context = TrueURL::Context.new(url, OPTIONS.merge(options))
    @strategies = TrueURL::Strategy.default_list
    @executed = false
  end

  # @return [String] the canonical URL
  def canonical
    execute
    @context.working_url.to_s
  end

  # @return [Hash] attributes collected by the strategies (for example
  #   :embed_url)
  def attributes
    execute
    @context.attributes
  end

  private

  # Runs the whole pipeline once; later calls are no-ops.
  def execute
    return if @executed

    execute_strategies

    # Only go to the network when no strategy could settle the URL offline.
    if !@context.finalized? && attempt_fetch?
      fetch
      execute_strategies
    end

    scheme_override
    remove_fragments
    clean_query_values

    @executed = true
  end

  # Runs every strategy whose host pattern matches, until one of them
  # finalizes the context.
  def execute_strategies
    @strategies.each do |match_criteria, strategy|
      next if @context.finalized?
      next unless strategy_match?(match_criteria)

      strategy.execute(@context)
    end
  end

  # A nil pattern matches every URL; otherwise the pattern is matched against
  # the working URL's host.
  def strategy_match?(match_criteria)
    return true if match_criteria.nil?

    host = @context.working_url.host
    host.nil? ? false : host.match(match_criteria)
  end

  def attempt_fetch?
    return false unless @context.options[:fetch]

    # Must at least have a host, otherwise we can't find the site to crawl
    return false if @context.working_url.host.nil?

    # We only support HTTP or HTTPS
    %w(http https).include?(@context.working_url.scheme)
  end

  # Fetches the working URL and replaces it with the best canonical candidate
  # found in the response.
  def fetch
    # Loaded lazily so the gem is only needed when fetching is actually used.
    require 'http' unless defined? HTTP

    starting_url = @context.working_url

    response = HTTP.follow
                   .get(starting_url)

    canonical_url = find_canonical_header(response.headers) || find_canonical_url(response.to_s) || response.uri
    # The canonical URL may be relative, so resolve it against the start URL.
    @context.set_working_url(canonical_url, starting_url)
  end

  # Returns the target of the first Link header entry whose rel contains
  # "canonical", or nil when there is none.
  #
  # A single header value may carry several comma-separated links and the rel
  # parameter may be followed by other parameters, so every entry is examined
  # and the target returned is the one belonging to the canonical entry.
  #
  # @param headers [#[]] response headers; headers['Link'] may be nil, a
  #   String or a list of Strings
  # @return [String, nil]
  def find_canonical_header(headers)
    raw = headers['Link']
    return if raw.nil?

    links = raw.is_a?(String) ? [raw] : raw
    links.each do |value|
      # <target> followed by its parameters; commas inside quoted parameter
      # values do not end the entry.
      value.to_s.scan(/<([^>]*)>((?:[^,"]|"[^"]*")*)/) do |target, params|
        return target if canonical_rel?(params)
      end
    end
    nil
  end

  # Whether a link's parameter string has a rel value containing "canonical".
  def canonical_rel?(params)
    rel = /;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;"]+))/i.match(params)
    return false if rel.nil?

    (rel[1] || rel[2]).downcase.split.include?('canonical')
  end

  # Looks for a canonical URL in an HTML document: <link rel="canonical">
  # first, then the og:url meta property.
  #
  # @param html [String]
  # @return [String, nil]
  def find_canonical_url(html)
    require 'nokogiri' unless defined? Nokogiri::HTML

    doc = Nokogiri::HTML(html)

    elem = doc.at('link[rel="canonical"]')
    canonical_url = elem['href'] unless elem.nil?

    elem = doc.at('meta[property="og:url"]')
    og_url = elem['content'] unless elem.nil?

    canonical_url || og_url
  end

  def scheme_override
    @context.working_url.scheme = @context.options[:scheme_override] unless @context.options[:scheme_override].nil?
  end

  def remove_fragments
    @context.working_url.fragment = nil
  end

  # Removes the parameters listed in QUERY_VALUES_TO_REMOVE; the query string
  # is dropped entirely when nothing is left.
  def clean_query_values
    query_values = @context.working_url.query_values

    unless query_values.nil?
      QUERY_VALUES_TO_REMOVE.each { |p| query_values.delete(p) }
      @context.working_url.query_values = query_values.empty? ? nil : query_values
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'true_url'
|
2
|
+
|
3
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
4
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
5
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
6
|
+
# this file to always be loaded, without a need to explicitly require it in any
|
7
|
+
# files.
|
8
|
+
#
|
9
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
10
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
11
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
12
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
13
|
+
# a separate helper file that requires the additional dependencies and performs
|
14
|
+
# the additional setup, and require it from the spec files that actually need
|
15
|
+
# it.
|
16
|
+
#
|
17
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
18
|
+
# users commonly want.
|
19
|
+
#
|
20
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
21
|
+
# Suite-wide RSpec configuration. Only two settings are active; everything
# that is commented out below is an optional suggestion from `rspec --init`.
RSpec.configure do |config|
  # rspec-expectations config goes here. You can use an alternate
  # assertion/expectation library such as wrong or the stdlib/minitest
  # assertions if you prefer.
  config.expect_with :rspec do |expectations|
    # This option will default to `true` in RSpec 4. It makes the `description`
    # and `failure_message` of custom matchers include text for helper methods
    # defined using `chain`, e.g.:
    #     be_bigger_than(2).and_smaller_than(4).description
    #     # => "be bigger than 2 and smaller than 4"
    # ...rather than:
    #     # => "be bigger than 2"
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # rspec-mocks config goes here. You can use an alternate test double
  # library (such as bogus or mocha) by changing the `mock_with` option here.
  config.mock_with :rspec do |mocks|
    # Prevents you from mocking or stubbing a method that does not exist on
    # a real object. This is generally recommended, and will default to
    # `true` in RSpec 4.
    mocks.verify_partial_doubles = true
  end

  # The settings below are suggested to provide a good initial experience
  # with RSpec, but feel free to customize to your heart's content.
  #   # These two settings work together to allow you to limit a spec run
  #   # to individual examples or groups you care about by tagging them with
  #   # `:focus` metadata. When nothing is tagged with `:focus`, all examples
  #   # get run.
  #   config.filter_run :focus
  #   config.run_all_when_everything_filtered = true
  #
  #   # Allows RSpec to persist some state between runs in order to support
  #   # the `--only-failures` and `--next-failure` CLI options. We recommend
  #   # you configure your source control system to ignore this file.
  #   config.example_status_persistence_file_path = "spec/examples.txt"
  #
  #   # Limits the available syntax to the non-monkey patched syntax that is
  #   # recommended. For more details, see:
  #   #   - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
  #   #   - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
  #   #   - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
  #   config.disable_monkey_patching!
  #
  #   # This setting enables warnings. It's recommended, but in some cases may
  #   # be too noisy due to issues in dependencies.
  #   config.warnings = true
  #
  #   # Many RSpec users commonly either run the entire suite or an individual
  #   # file, and it's useful to allow more verbose output when running an
  #   # individual spec file.
  #   if config.files_to_run.one?
  #     # Use the documentation formatter for detailed output,
  #     # unless a formatter has already been configured
  #     # (e.g. via a command-line flag).
  #     config.default_formatter = 'doc'
  #   end
  #
  #   # Print the 10 slowest examples and example groups at the
  #   # end of the spec run, to help surface which specs are running
  #   # particularly slow.
  #   config.profile_examples = 10
  #
  #   # Run specs in random order to surface order dependencies. If you find an
  #   # order dependency and want to debug it, you can fix the order by providing
  #   # the seed, which is printed after each run.
  #   #     --seed 1234
  #   config.order = :random
  #
  #   # Seed global randomization in this process using the `--seed` CLI option.
  #   # Setting this allows you to use `--seed` to deterministically reproduce
  #   # test failures related to randomization by passing the same `--seed` value
  #   # as the one that triggered the failure.
  #   Kernel.srand config.seed
end
|
data/spec/true_url_spec.rb
ADDED
@@ -0,0 +1,204 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Spec helper: returns the canonical form of +unclean_url+ as computed by
# TrueURL with the given +options+.
def gc(unclean_url, options = {})
  resolver = TrueURL.new(unclean_url, options)
  resolver.canonical
end
|
6
|
+
|
7
|
+
# End-to-end specs. Examples whose URL is not settled by a strategy are
# resolved with the default options, which fetch the URL, so they need network
# access and depend on the remote sites still responding as they did when the
# expectations were written.
describe TrueURL do
  describe 'YouTube' do
    it 'supports direct video links' do
      t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
      expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI')).to eq t
      expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI&feature=youtu.be&list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
      expect(gc('https://youtu.be/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
    end

    it 'supports embedded video links' do
      t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
      expect(gc('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
      expect(gc('https://www.youtube-nocookie.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&controls=0&showinfo=0')).to eq t
    end

    it 'supports direct playlist links' do
      t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
      expect(gc('https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
    end

    it 'supports embedded playlist links' do
      t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
      expect(gc('https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
      expect(gc('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
    end

    it 'supports direct channel links' do
      t = 'https://www.youtube.com/user/WatchMojo'
      expect(gc('https://www.youtube.com/channel/UCaWd5_7JhbQBe4dknZhsHJg')).to eq t
    end

    it 'supports retrieving embed links as attributes' do
      x = TrueURL.new('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')
      expect(x.attributes[:embed_url]).to eq 'https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'

      x = TrueURL.new('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&controls=0&showinfo=0')
      expect(x.attributes[:embed_url_private]).to eq 'https://www.youtube-nocookie.com/embed/RDocnbkHjhI'
    end
  end

  describe 'DailyMotion' do
    it 'supports direct video links' do
      t = 'https://www.dailymotion.com/video/x2k01a9'
      expect(gc('http://dai.ly/x2k01a9')).to eq t
      expect(gc('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')).to eq t
    end

    it 'supports embedded video links' do
      t = 'https://www.dailymotion.com/video/x2k01a9'
      expect(gc('http://www.dailymotion.com/embed/video/x2k01a9?autoPlay=1&start=40')).to eq t
    end

    it 'supports direct playlist links' do
      t = 'https://www.dailymotion.com/playlist/x1ybux'
      expect(gc('https://www.dailymotion.com/playlist/x1ybux/1#video=xlbw3e')).to eq t
      expect(gc('https://www.dailymotion.com/playlist/x1ybux')).to eq t
      expect(gc('http://www.dailymotion.com/playlist/x1ybux_ODNandfinally_amazing-world-records/1#video=xlbw3e')).to eq t
    end

    it 'supports retrieving embed links as attributes' do
      x = TrueURL.new('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')
      expect(x.attributes[:embed_url]).to eq 'https://www.dailymotion.com/embed/video/x2k01a9'
    end

    it 'supports force HTTPS' do
      t = 'https://www.dailymotion.com/ODNandfinally'
      expect(gc('http://www.dailymotion.com/ODNandfinally')).to eq t
    end
  end

  describe 'Vimeo' do
    it 'supports direct video links' do
      t = 'https://vimeo.com/122258599'
      expect(gc('https://vimeo.com/channels/staffpicks/122258599')).to eq t
      expect(gc('http://vimeo.com/122258599')).to eq t
    end

    it 'supports embedded video links' do
      t = 'https://vimeo.com/122258599'
      expect(gc('https://player.vimeo.com/video/122258599?loop=1&color=c9ff23&title=0')).to eq t
    end

    it "supports Vimeo's relative canonical links" do
      t = 'https://vimeo.com/channels/staffpicks'
      expect(gc('http://vimeo.com/channels/staffpicks?some=silly&params=here')).to eq t
    end

    it 'supports retrieving embed links as attributes' do
      x = TrueURL.new('https://vimeo.com/channels/staffpicks/122258599')
      expect(x.attributes[:embed_url]).to eq 'https://player.vimeo.com/video/122258599'
    end

    it 'supports force HTTPS' do
      t = 'https://vimeo.com/user3190002'
      expect(gc('http://vimeo.com/user3190002')).to eq t
    end
  end

  describe 'Nico Nico Douga' do
    it 'should work with direct and embedded video links' do
      t = 'http://www.nicovideo.jp/watch/sm25956031'
      expect(gc('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')).to eq t
      expect(gc('http://embed.nicovideo.jp/watch/sm25956031/script?w=490&h=307&redirect=1')).to eq t
      expect(gc('http://embed.nicovideo.jp/watch/sm25956031?oldScript=1')).to eq t
    end

    it 'supports retrieving embed links as attributes' do
      x = TrueURL.new('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')
      expect(x.attributes[:embed_url]).to eq 'http://embed.nicovideo.jp/watch/sm25956031'
    end
  end

  describe 'Twitter' do
    it 'supports direct links to tweets' do
      t = 'https://twitter.com/gangsta_project/status/578483098284748801'
      expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/photo/1')).to eq t
      expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/')).to eq t
    end

    it 'should handle hashbangs' do
      t = 'https://twitter.com/gangsta_project/status/578483098284748801'
      expect(gc('https://twitter.com/#!/GANGSTA_Project/status/578483098284748801/')).to eq t
    end
  end

  describe 'URL Shorteners' do
    it 'should work with t.co' do
      t = 'http://www.prdaily.com/Main/Articles/3_essential_skills_for_todays_PR_pro__18404.aspx'
      expect(gc('http://t.co/fvaGuRa5Za')).to eq t
      expect(gc('https://t.co/fvaGuRa5Za')).to eq t
    end

    it 'should work with fb.me' do
      t = 'https://www.facebook.com/aksuperdance/posts/1388968827814771'
      expect(gc('http://fb.me/8qm5kW89k')).to eq t
    end

    it 'should work with ift.tt' do
      t = 'http://tedxtaipei.com/articles/the_best_kindergarten_you_have_ever_seen/'
      expect(gc('http://ift.tt/2iCbPy8')).to eq t
    end

    it 'should work with compounded URL shorteners' do
      t = 'https://www.youtube.com/watch?v=jLhjsPjR-xk'
      expect(gc('https://t.co/g4NYtZE3lW')).to eq t # http://bit.ly/2iCKic3 --> http://youtu.be/jLhjsPjR-xk?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY
    end
  end

  describe 'WordPress' do
    it 'supports missing trailing slashes' do
      t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
      expect(gc('http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled')).to eq t
    end
  end

  describe 'Blogger' do
    it 'supports missing localized Blogger domains' do
      t = 'http://thevikiblog.blogspot.com/2015/12/soompi-ios-android.html'
      expect(gc('http://thevikiblog.blogspot.sg/2015/12/soompi-ios-android.html')).to eq t
    end
  end

  describe 'Other Scenarios' do
    it 'supports missing schemes' do
      t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
      expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/')).to eq t
    end

    it 'supports scheme override' do
      t = 'https://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
      expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/', scheme_override: 'https')).to eq t
    end

    it 'supports CDJapan' do
      t = 'http://www.cdjapan.co.jp/product/MDR-1012'
      expect(gc('http://www.cdjapan.co.jp/aff/click.cgi/e86NDzbdSLQ/4323/A323439/detailview.html?KEY=MDR-1012')).to eq t
    end

    it 'supports MyAnimeList' do
      t = 'https://myanimelist.net/forum/?topicid=1371295'
      expect(gc('https://myanimelist.net/forum?topicid=1371295&goto=newpost')).to eq t
    end

    it 'supports URLs with escapable characters' do
      t = 'http://goboiano.com/news/2568-attack-on-titan%2527s-first-live-action-trailer-finally-launches'
      expect(gc("http://media.goboiano.com/news/2568-attack-on-titan's-first-live-action-trailer-finally-launches")).to eq t

      t = 'http://randomc.net/image/Kekkai%20Sensen/Kekkai%20Sensen%20-%2001%20-%20Large%2001.jpg'
      expect(gc('http://randomc.net/image/Kekkai Sensen/Kekkai Sensen - 01 - Large 01.jpg')).to eq t
    end

    it 'supports canonical HTTP headers' do
      t = 'http://www.seoreviewtools.com/canonical-url-location-checker/'
      expect(gc('http://www.seoreviewtools.com/tests/canonical-header.php')).to eq t
    end
  end
end
|
data/true_url.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require File.expand_path('../lib/true_url/version', __FILE__)
|
2
|
+
|
3
|
+
# Gem specification for true_url.
Gem::Specification.new do |spec|
  spec.name = 'true_url'
  spec.version = TrueURL::VERSION
  spec.authors = ['Jonathan Wong']
  spec.email = ['jonathan@armchairtheorist.com']
  spec.summary = 'A multi-strategy approach to find the absolutely cleanest and most likely canonical URL of any given URL.'
  spec.homepage = 'http://github.com/armchairtheorist/true_url'
  spec.license = 'MIT'

  spec.files = `git ls-files`.split("\n")
  # Select the spec files from the tracked file list. The previous
  # `git ls-files -- {spec}/*` passed a single-element brace group, which the
  # shell does not expand, so the list was always empty.
  spec.test_files = spec.files.grep(%r{\Aspec/})
  spec.require_paths = ['lib']

  # spec/spec_helper.rb uses RSpec 3 configuration options, which '~> 0'
  # (>= 0, < 1) could never satisfy.
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rake', '>= 10.0'
  # NOTE(review): http and nokogiri are required lazily by TrueURL#fetch,
  # which runs by default (fetch: true). They are kept as development
  # dependencies here; confirm whether they should be runtime dependencies.
  spec.add_development_dependency 'http', '~> 2.1', '>= 2.1.0'
  spec.add_development_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'

  spec.add_runtime_dependency 'addressable', '~> 2.4', '>= 2.4.0'
end
|
metadata
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: true_url
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jonathan Wong
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-12-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: http
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.1'
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: 2.1.0
|
51
|
+
type: :development
|
52
|
+
prerelease: false
|
53
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - "~>"
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '2.1'
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.1.0
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: nokogiri
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.6'
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 1.6.8
|
71
|
+
type: :development
|
72
|
+
prerelease: false
|
73
|
+
version_requirements: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '1.6'
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.6.8
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: addressable
|
83
|
+
requirement: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - "~>"
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '2.4'
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 2.4.0
|
91
|
+
type: :runtime
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - "~>"
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '2.4'
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 2.4.0
|
101
|
+
description:
|
102
|
+
email:
|
103
|
+
- jonathan@armchairtheorist.com
|
104
|
+
executables: []
|
105
|
+
extensions: []
|
106
|
+
extra_rdoc_files: []
|
107
|
+
files:
|
108
|
+
- ".gitignore"
|
109
|
+
- ".rspec"
|
110
|
+
- Gemfile
|
111
|
+
- LICENSE.txt
|
112
|
+
- README.md
|
113
|
+
- Rakefile
|
114
|
+
- lib/true_url.rb
|
115
|
+
- lib/true_url/context.rb
|
116
|
+
- lib/true_url/strategy.rb
|
117
|
+
- lib/true_url/strategy/dailymotion.rb
|
118
|
+
- lib/true_url/strategy/nicovideo.rb
|
119
|
+
- lib/true_url/strategy/twitter.rb
|
120
|
+
- lib/true_url/strategy/vimeo.rb
|
121
|
+
- lib/true_url/strategy/youtube.rb
|
122
|
+
- lib/true_url/version.rb
|
123
|
+
- spec/spec_helper.rb
|
124
|
+
- spec/true_url_spec.rb
|
125
|
+
- true_url.gemspec
|
126
|
+
homepage: http://github.com/armchairtheorist/true_url
|
127
|
+
licenses:
|
128
|
+
- MIT
|
129
|
+
metadata: {}
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 2.5.1
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: A multi-strategy approach to find the absolutely cleanest and most likely
|
150
|
+
canonical URL of any given URL.
|
151
|
+
test_files: []
|