true_url 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +1 -0
- data/Rakefile +5 -0
- data/lib/true_url/context.rb +39 -0
- data/lib/true_url/strategy/dailymotion.rb +56 -0
- data/lib/true_url/strategy/nicovideo.rb +25 -0
- data/lib/true_url/strategy/twitter.rb +25 -0
- data/lib/true_url/strategy/vimeo.rb +28 -0
- data/lib/true_url/strategy/youtube.rb +39 -0
- data/lib/true_url/strategy.rb +23 -0
- data/lib/true_url/version.rb +3 -0
- data/lib/true_url.rb +140 -0
- data/spec/spec_helper.rb +96 -0
- data/spec/true_url_spec.rb +204 -0
- data/true_url.gemspec +22 -0
- metadata +151 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 496c04ece239c5302c9548b0a4b7d93630125172
|
4
|
+
data.tar.gz: 74c68ce403f401579bbc9b354b283595b5516302
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cfbf6e4099c911e66ff6061ecd98b0e98c60b8411659904d57d76c9bf1b4a2da207f0a19b9f11597ff8bf22a8ae716a7d49706b0a46730b6e824c99e970542cd
|
7
|
+
data.tar.gz: c186b1d04c781de99d61d64b9b3052e2f667ba140cba10c05db5e51ab0b43beddc0ee20d2318a534b84ed7ac8019294949a4fd18bcfe50a4492230431e13db6a
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Jonathan Wong
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
# true_url
|
data/Rakefile
ADDED
@@ -0,0 +1,39 @@
|
|
1
|
+
class TrueURL
  # Mutable state shared by the strategies while a single URL is resolved:
  # the URL as given, the caller's options, the current working URL, any
  # attributes collected along the way, and a "finalized" flag.
  class Context
    attr_reader :original_url, :options, :attributes, :working_url

    # @param original_url [String, Addressable::URI] the URL to resolve
    # @param options [Hash] stored as-is and exposed through #options
    def initialize(original_url, options)
      @original_url = parse(original_url)
      @options = options
      @finalized = false
      @attributes = {}

      set_working_url(original_url)
    end

    # Replaces the working URL.
    #
    # @param url [String, Addressable::URI] the new URL; may be relative when
    #   +base_url+ is given
    # @param base_url [String, Addressable::URI, nil] base that +url+ is
    #   joined onto
    # @return [Addressable::URI] the normalized working URL
    def set_working_url(url, base_url = nil)
      candidate = base_url.nil? ? parse(url) : parse(base_url).join(parse(url))

      # If the URL has no scheme, then we assume HTTP
      if candidate.scheme.nil?
        candidate = url.to_s.start_with?('//') ? parse("http:#{url}") : parse("http://#{url}")
      end

      # URI#normalize returns a new object instead of changing the receiver,
      # so the result has to be stored; calling it for its side effect alone
      # leaves the working URL un-normalized.
      @working_url = candidate.normalize
    end

    # Marks the working URL as final.
    def finalize
      @finalized = true
    end

    # @return [Boolean] whether #finalize has been called
    def finalized?
      @finalized
    end

    private

    # Passes Addressable::URI instances through and parses anything else.
    def parse(url)
      url.is_a?(Addressable::URI) ? url : Addressable::URI.parse(url)
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Canonicalizes DailyMotion video and playlist URLs, including dai.ly
    # short links and embed URLs.
    class DailyMotion
      # Rewrites the context's working URL to the canonical video or playlist
      # URL and finalizes the context when an ID can be extracted. The scheme
      # of the working URL is forced to HTTPS in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path

        if url.host == 'dai.ly'
          video_id = path[1..-1]
        elsif path.start_with?('/video/')
          video_id = clean_video_id(path)
        elsif path.start_with?('/embed/video/')
          # Only slice when the full '/embed/video/' prefix (13 characters) is
          # really there; other '/embed/...' paths would yield a garbage ID.
          video_id = path[13..-1]
        elsif path.start_with?('/playlist/')
          playlist_id = clean_playlist_id(path)
        end

        # An empty ID must not be finalized into a URL such as ".../video/".
        video_id = presence(video_id)
        playlist_id = presence(playlist_id)

        if video_id
          context.set_working_url("https://www.dailymotion.com/video/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.dailymotion.com/embed/video/#{video_id}"
        elsif playlist_id
          context.set_working_url("https://www.dailymotion.com/playlist/#{playlist_id}")
          context.finalize
        end

        # DailyMotion supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end

      # Extracts the video ID from a '/video/<id>[_<slug>]' path.
      #
      # @param path [String]
      # @return [String] everything after '/video/' up to the first underscore
      def clean_video_id(path)
        path[7..-1].to_s[/\A[^_]*/]
      end

      # Extracts the playlist ID from a '/playlist/<id>[_<slug>][/<page>]' path.
      # The ID ends at whichever of '_' or '/' comes first.
      #
      # @param path [String]
      # @return [String]
      def clean_playlist_id(path)
        path[10..-1].to_s[/\A[^_\/]*/]
      end

      private

      # Returns nil for nil or empty strings, the value itself otherwise.
      def presence(value)
        value.nil? || value.empty? ? nil : value
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Canonicalizes Nico Nico Douga watch and thumbnail-embed URLs.
    class NicoVideo
      # Rewrites the context's working URL to the canonical watch URL and
      # finalizes the context when a video ID can be extracted. The scheme of
      # the working URL is forced to HTTP in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        path = context.working_url.path

        # Both supported layouts carry the ID as the second path segment.
        video_id = path.split('/')[2] if path.start_with?('/watch/', '/thumb_watch/')

        # An empty segment (e.g. "/watch//x") is truthy in Ruby but is not an
        # ID, so it is rejected explicitly.
        if video_id && !video_id.empty?
          context.set_working_url("http://www.nicovideo.jp/watch/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "http://embed.nicovideo.jp/watch/#{video_id}"
        end

        # Nico Video only supports HTTP
        context.working_url.scheme = 'http'
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Canonicalizes links to individual tweets, including legacy hashbang
    # ("#!/user/status/id") URLs.
    class Twitter
      # Matches "/<user>/status/<digits>" at the very start of the path.
      # \A is used rather than ^ because ^ also matches after a newline, which
      # would let a crafted multi-line path match part-way through.
      TWEET_PATH = %r{\A/\w+/status/\d+}

      # Rewrites the context's working URL to
      # "https://twitter.com/<lowercased user>/status/<id>" and finalizes the
      # context when the path points at a tweet. Other paths are left alone,
      # apart from hashbang fragments being folded into the path.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        fragment = url.fragment

        # Special handling to collapse Twitter hashbang (#!) URLs
        if fragment && fragment.start_with?('!/')
          url.path = fragment[1..-1]
          url.fragment = nil
        end

        path = context.working_url.path
        return unless path =~ TWEET_PATH

        # path starts with "/", so the segments are ["", user, "status", id, ...]
        _, user, _, status_id = path.split('/')
        context.set_working_url("https://twitter.com/#{user.downcase}/status/#{status_id}")
        context.finalize
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Canonicalizes Vimeo video URLs (direct, channel and player/embed links).
    class Vimeo
      PLAYER_PATH  = %r{\A/video/(\d+)}
      CHANNEL_PATH = %r{\A/channels/\w+/(\d+)\z}
      DIRECT_PATH  = %r{\A/(\d+)\z}

      # Rewrites the context's working URL to "https://vimeo.com/<id>" and
      # finalizes the context when a numeric video ID can be extracted. The
      # scheme of the working URL is forced to HTTPS in every case.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path

        video_id =
          if url.host == 'player.vimeo.com'
            # Require the "/video/<digits>" layout instead of blindly dropping
            # the first seven characters, which turned unrelated player paths
            # into bogus IDs.
            path[PLAYER_PATH, 1]
          else
            path[CHANNEL_PATH, 1] || path[DIRECT_PATH, 1]
          end

        if video_id
          context.set_working_url("https://vimeo.com/#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://player.vimeo.com/video/#{video_id}"
        end

        # Vimeo supports both HTTP and HTTPS and doesn't redirect between them, so we prefer HTTPS
        context.working_url.scheme = 'https'
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
class TrueURL
  module Strategy
    # Canonicalizes YouTube video and playlist URLs (watch, youtu.be, embed
    # and embedded "videoseries" links).
    class YouTube
      # Rewrites the context's working URL to the canonical watch or playlist
      # URL, finalizes the context and records the matching embed URLs as
      # attributes. Does nothing when no video or playlist ID can be found.
      #
      # @param context [TrueURL::Context]
      def execute(context)
        url = context.working_url
        path = url.path

        if url.host == 'youtu.be'
          video_id = path[1..-1]
        elsif path == '/watch'
          video_id = query_value(url, 'v')
        elsif path == '/playlist'
          playlist_id = query_value(url, 'list')
        elsif path.start_with?('/embed/videoseries')
          playlist_id = query_value(url, 'list')
        elsif path.start_with?('/embed/')
          video_id = path[7..-1]
        end

        # An empty ID must not be finalized into a URL such as "watch?v=".
        video_id = presence(video_id)
        playlist_id = presence(playlist_id)

        if video_id
          context.set_working_url("https://www.youtube.com/watch?v=#{video_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/#{video_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/#{video_id}"
        elsif playlist_id
          context.set_working_url("https://www.youtube.com/playlist?list=#{playlist_id}")
          context.finalize
          context.attributes[:embed_url] = "https://www.youtube.com/embed/videoseries?list=#{playlist_id}"
          context.attributes[:embed_url_private] = "https://www.youtube-nocookie.com/embed/videoseries?list=#{playlist_id}"
        end
      end

      private

      # Reads one query parameter. #query_values is nil when the URL has no
      # query string at all (e.g. a bare "/watch"), so it is guarded here
      # instead of being indexed directly.
      def query_value(url, name)
        values = url.query_values
        values.nil? ? nil : values[name]
      end

      # Returns nil for nil or empty strings, the value itself otherwise.
      def presence(value)
        value.nil? || value.empty? ? nil : value
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'true_url/strategy/dailymotion'
|
2
|
+
require 'true_url/strategy/nicovideo'
|
3
|
+
require 'true_url/strategy/twitter'
|
4
|
+
require 'true_url/strategy/vimeo'
|
5
|
+
require 'true_url/strategy/youtube'
|
6
|
+
|
7
|
+
class TrueURL
  module Strategy
    # Builds the default strategy table.
    #
    # Each entry pairs a host pattern with the strategy instance that handles
    # matching hosts. Dots are escaped and every pattern is anchored, so that a
    # domain pattern matches only the domain itself or one of its subdomains
    # (not look-alikes such as "notyoutube.com" or "youtubexcom"), and the
    # short-link hosts match exactly.
    #
    # @return [Array<Array(Regexp, Object)>] [host pattern, strategy] pairs
    def self.default_list
      [
        [/(?:\A|\.)youtube\.com\z/, TrueURL::Strategy::YouTube.new],
        [/(?:\A|\.)youtube-nocookie\.com\z/, TrueURL::Strategy::YouTube.new],
        [/\Ayoutu\.be\z/, TrueURL::Strategy::YouTube.new],
        [/(?:\A|\.)dailymotion\.com\z/, TrueURL::Strategy::DailyMotion.new],
        [/\Adai\.ly\z/, TrueURL::Strategy::DailyMotion.new],
        [/(?:\A|\.)vimeo\.com\z/, TrueURL::Strategy::Vimeo.new],
        [/(?:\A|\.)nicovideo\.jp\z/, TrueURL::Strategy::NicoVideo.new],
        [/\Anico\.ms\z/, TrueURL::Strategy::NicoVideo.new],
        [/(?:\A|\.)twitter\.com\z/, TrueURL::Strategy::Twitter.new]
      ]
    end
  end
end
|
data/lib/true_url.rb
ADDED
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
|
3
|
+
# Resolves a URL to its most likely canonical form.
#
# Resolution runs the host-specific strategies first. If none of them finalizes
# the URL and fetching is allowed, the page is fetched, its canonical link (HTTP
# Link header, <link rel="canonical"> or og:url) becomes the working URL, and
# the strategies run once more. Finally the scheme override is applied, the
# fragment is dropped and known tracking parameters are removed.
class TrueURL
  autoload :Version, 'true_url/version'
  autoload :Context, 'true_url/context'
  autoload :Strategy, 'true_url/strategy'

  attr_accessor :context, :strategies

  OPTIONS = {
    scheme_override: nil, # Possible choices: "https", "http", nil (preserve scheme)
    fetch: true # Whether to fetch the URL
  }.freeze

  # Query parameters that are stripped from the final URL.
  QUERY_VALUES_TO_REMOVE = %w(
    utm_source
    utm_medium
    utm_term
    utm_content
    utm_campaign
    sms_ss
    awesm
    xtor
    PHPSESSID
  ).freeze

  # @param url [String] the URL to resolve
  # @param options [Hash] overrides for OPTIONS (:scheme_override, :fetch)
  def initialize(url, options = {})
    @context = TrueURL::Context.new(url, OPTIONS.merge(options))
    @strategies = TrueURL::Strategy.default_list
    @executed = false
  end

  # @return [String] the canonical URL (resolved on first use)
  def canonical
    execute
    @context.working_url.to_s
  end

  # @return [Hash] attributes collected by the strategies, e.g. :embed_url
  def attributes
    execute
    @context.attributes
  end

  private

  # Runs the resolution pipeline once; later calls are no-ops.
  def execute
    return if @executed

    execute_strategies

    if !@context.finalized? && attempt_fetch?
      fetch
      execute_strategies
    end

    scheme_override
    remove_fragments
    clean_query_values

    @executed = true
  end

  # Runs every strategy whose host pattern matches the current working URL,
  # stopping as soon as one of them finalizes the context.
  def execute_strategies
    @strategies.each do |match_criteria, strategy|
      break if @context.finalized?

      strategy.execute(@context) if strategy_match?(match_criteria)
    end
  end

  # A nil criteria matches everything; otherwise the working URL's host is
  # matched against it. Returns a truthy or falsy value, not a strict Boolean.
  def strategy_match?(match_criteria)
    return true if match_criteria.nil?

    host = @context.working_url.host
    !host.nil? && host.match(match_criteria)
  end

  def attempt_fetch?
    return false unless @context.options[:fetch]

    # Must at least have a host, otherwise we can't find the site to crawl
    return false if @context.working_url.host.nil?

    # We only support HTTP or HTTPS
    %w(http https).include?(@context.working_url.scheme)
  end

  # Fetches the working URL (following redirects) and replaces it with the
  # canonical URL advertised by the response, falling back to the final
  # response URI. Nothing is rescued here, so client errors reach the caller.
  def fetch
    # The http gem is only loaded when a fetch is actually needed.
    require 'http' unless defined? HTTP

    starting_url = @context.working_url

    response = HTTP.follow
                   .get(starting_url)

    canonical_url = find_canonical_header(response.headers) || find_canonical_url(response.to_s) || response.uri
    @context.set_working_url(canonical_url, starting_url)
  end

  # Finds the target of the first rel="canonical" entry in the Link header(s).
  #
  # A single header value may carry several comma-separated links, and "rel"
  # need not be the last parameter, so every link is inspected on its own
  # instead of testing how the whole header value ends.
  #
  # @param headers [#[]] response headers; 'Link' may be nil, a String or an
  #   Array of Strings
  # @return [String, nil]
  def find_canonical_header(headers)
    Array(headers['Link']).each do |header_value|
      # Split only on commas that start a new "<target>", so commas inside a
      # target or a parameter value are left alone.
      header_value.to_s.split(/,\s*(?=<)/).each do |link_value|
        target = canonical_link_target(link_value)
        return target if target
      end
    end

    nil
  end

  # Returns the target of one "<target>; param=value; ..." link when its rel
  # parameter contains "canonical", nil otherwise.
  def canonical_link_target(link_value)
    match = /\A\s*<([^>]*)>(.*)\z/m.match(link_value)
    return if match.nil?

    canonical = match[2].split(';').any? do |param|
      name, value = param.split('=', 2)
      next false unless value && name.strip.casecmp('rel') == 0

      # rel may list several space-separated relation types.
      value.delete('"').split.any? { |rel| rel.casecmp('canonical') == 0 }
    end

    match[1] if canonical
  end

  # Looks for <link rel="canonical"> first and the og:url meta tag second.
  #
  # @param html [String]
  # @return [String, nil]
  def find_canonical_url(html)
    # Nokogiri is only loaded when a page actually has to be parsed.
    require 'nokogiri' unless defined? Nokogiri::HTML

    doc = Nokogiri::HTML(html)

    elem = doc.at('link[rel="canonical"]')
    canonical_url = elem['href'] unless elem.nil?

    elem = doc.at('meta[property="og:url"]')
    og_url = elem['content'] unless elem.nil?

    canonical_url || og_url
  end

  def scheme_override
    forced_scheme = @context.options[:scheme_override]
    @context.working_url.scheme = forced_scheme unless forced_scheme.nil?
  end

  def remove_fragments
    @context.working_url.fragment = nil
  end

  # Drops the parameters listed in QUERY_VALUES_TO_REMOVE; the query string is
  # removed entirely when nothing else remains.
  def clean_query_values
    query_values = @context.working_url.query_values
    return if query_values.nil?

    QUERY_VALUES_TO_REMOVE.each { |p| query_values.delete(p) }
    @context.working_url.query_values = query_values.empty? ? nil : query_values
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'true_url'
|
2
|
+
|
3
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
4
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
5
|
+
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
6
|
+
# this file to always be loaded, without a need to explicitly require it in any
|
7
|
+
# files.
|
8
|
+
#
|
9
|
+
# Given that it is always loaded, you are encouraged to keep this file as
|
10
|
+
# light-weight as possible. Requiring heavyweight dependencies from this file
|
11
|
+
# will add to the boot time of your test suite on EVERY test run, even for an
|
12
|
+
# individual file that may not need all of that loaded. Instead, consider making
|
13
|
+
# a separate helper file that requires the additional dependencies and performs
|
14
|
+
# the additional setup, and require it from the spec files that actually need
|
15
|
+
# it.
|
16
|
+
#
|
17
|
+
# The `.rspec` file also contains a few flags that are not defaults but that
|
18
|
+
# users commonly want.
|
19
|
+
#
|
20
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
21
|
+
RSpec.configure do |config|
|
22
|
+
# rspec-expectations config goes here. You can use an alternate
|
23
|
+
# assertion/expectation library such as wrong or the stdlib/minitest
|
24
|
+
# assertions if you prefer.
|
25
|
+
config.expect_with :rspec do |expectations|
|
26
|
+
# This option will default to `true` in RSpec 4. It makes the `description`
|
27
|
+
# and `failure_message` of custom matchers include text for helper methods
|
28
|
+
# defined using `chain`, e.g.:
|
29
|
+
# be_bigger_than(2).and_smaller_than(4).description
|
30
|
+
# # => "be bigger than 2 and smaller than 4"
|
31
|
+
# ...rather than:
|
32
|
+
# # => "be bigger than 2"
|
33
|
+
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
34
|
+
end
|
35
|
+
|
36
|
+
# rspec-mocks config goes here. You can use an alternate test double
|
37
|
+
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
38
|
+
config.mock_with :rspec do |mocks|
|
39
|
+
# Prevents you from mocking or stubbing a method that does not exist on
|
40
|
+
# a real object. This is generally recommended, and will default to
|
41
|
+
# `true` in RSpec 4.
|
42
|
+
mocks.verify_partial_doubles = true
|
43
|
+
end
|
44
|
+
|
45
|
+
# The settings below are suggested to provide a good initial experience
|
46
|
+
# with RSpec, but feel free to customize to your heart's content.
|
47
|
+
# # These two settings work together to allow you to limit a spec run
|
48
|
+
# # to individual examples or groups you care about by tagging them with
|
49
|
+
# # `:focus` metadata. When nothing is tagged with `:focus`, all examples
|
50
|
+
# # get run.
|
51
|
+
# config.filter_run :focus
|
52
|
+
# config.run_all_when_everything_filtered = true
|
53
|
+
#
|
54
|
+
# # Allows RSpec to persist some state between runs in order to support
|
55
|
+
# # the `--only-failures` and `--next-failure` CLI options. We recommend
|
56
|
+
# # you configure your source control system to ignore this file.
|
57
|
+
# config.example_status_persistence_file_path = "spec/examples.txt"
|
58
|
+
#
|
59
|
+
# # Limits the available syntax to the non-monkey patched syntax that is
|
60
|
+
# # recommended. For more details, see:
|
61
|
+
# # - http://rspec.info/blog/2012/06/rspecs-new-expectation-syntax/
|
62
|
+
# # - http://www.teaisaweso.me/blog/2013/05/27/rspecs-new-message-expectation-syntax/
|
63
|
+
# # - http://rspec.info/blog/2014/05/notable-changes-in-rspec-3/#zero-monkey-patching-mode
|
64
|
+
# config.disable_monkey_patching!
|
65
|
+
#
|
66
|
+
# # This setting enables warnings. It's recommended, but in some cases may
|
67
|
+
# # be too noisy due to issues in dependencies.
|
68
|
+
# config.warnings = true
|
69
|
+
#
|
70
|
+
# # Many RSpec users commonly either run the entire suite or an individual
|
71
|
+
# # file, and it's useful to allow more verbose output when running an
|
72
|
+
# # individual spec file.
|
73
|
+
# if config.files_to_run.one?
|
74
|
+
# # Use the documentation formatter for detailed output,
|
75
|
+
# # unless a formatter has already been configured
|
76
|
+
# # (e.g. via a command-line flag).
|
77
|
+
# config.default_formatter = 'doc'
|
78
|
+
# end
|
79
|
+
#
|
80
|
+
# # Print the 10 slowest examples and example groups at the
|
81
|
+
# # end of the spec run, to help surface which specs are running
|
82
|
+
# # particularly slow.
|
83
|
+
# config.profile_examples = 10
|
84
|
+
#
|
85
|
+
# # Run specs in random order to surface order dependencies. If you find an
|
86
|
+
# # order dependency and want to debug it, you can fix the order by providing
|
87
|
+
# # the seed, which is printed after each run.
|
88
|
+
# # --seed 1234
|
89
|
+
# config.order = :random
|
90
|
+
#
|
91
|
+
# # Seed global randomization in this process using the `--seed` CLI option.
|
92
|
+
# # Setting this allows you to use `--seed` to deterministically reproduce
|
93
|
+
# # test failures related to randomization by passing the same `--seed` value
|
94
|
+
# # as the one that triggered the failure.
|
95
|
+
# Kernel.srand config.seed
|
96
|
+
end
|
@@ -0,0 +1,204 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Spec shorthand: resolves +unclean_url+ with the given options and returns
# the canonical URL string.
def gc(unclean_url, options = {})
  resolver = TrueURL.new(unclean_url, options)
  resolver.canonical
end
|
6
|
+
|
7
|
+
describe TrueURL do
|
8
|
+
describe 'YouTube' do
|
9
|
+
it 'supports direct video links' do
|
10
|
+
t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
|
11
|
+
expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI')).to eq t
|
12
|
+
expect(gc('https://www.youtube.com/watch?v=RDocnbkHjhI&feature=youtu.be&list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
|
13
|
+
expect(gc('https://youtu.be/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
|
14
|
+
end
|
15
|
+
|
16
|
+
it 'supports embedded video links' do
|
17
|
+
t = 'https://www.youtube.com/watch?v=RDocnbkHjhI'
|
18
|
+
expect(gc('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0')).to eq t
|
19
|
+
expect(gc('https://www.youtube-nocookie.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&controls=0&showinfo=0')).to eq t
|
20
|
+
end
|
21
|
+
|
22
|
+
it 'supports direct playlist links' do
|
23
|
+
t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
|
24
|
+
expect(gc('https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
|
25
|
+
expect(gc('https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
|
26
|
+
end
|
27
|
+
|
28
|
+
it 'supports embedded playlist links' do
|
29
|
+
t = 'https://www.youtube.com/playlist?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
|
30
|
+
expect(gc('https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
|
31
|
+
expect(gc('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')).to eq t
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'supports direct channel links' do
|
35
|
+
t = 'https://www.youtube.com/user/WatchMojo'
|
36
|
+
expect(gc('https://www.youtube.com/channel/UCaWd5_7JhbQBe4dknZhsHJg')).to eq t
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'supports retrieving embed links as attributes' do
|
40
|
+
x = TrueURL.new('https://www.youtube-nocookie.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY')
|
41
|
+
expect(x.attributes[:embed_url]).to eq 'https://www.youtube.com/embed/videoseries?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY'
|
42
|
+
|
43
|
+
x = TrueURL.new('https://www.youtube.com/embed/RDocnbkHjhI?list=PLs4hTtftqnlAkiQNdWn6bbKUr-P1wuSm0&controls=0&showinfo=0')
|
44
|
+
expect(x.attributes[:embed_url_private]).to eq 'https://www.youtube-nocookie.com/embed/RDocnbkHjhI'
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe 'DailyMotion' do
|
49
|
+
it 'supports direct video links' do
|
50
|
+
t = 'https://www.dailymotion.com/video/x2k01a9'
|
51
|
+
expect(gc('http://dai.ly/x2k01a9')).to eq t
|
52
|
+
expect(gc('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')).to eq t
|
53
|
+
end
|
54
|
+
|
55
|
+
it 'supports embedded video links' do
|
56
|
+
t = 'https://www.dailymotion.com/video/x2k01a9'
|
57
|
+
expect(gc('http://www.dailymotion.com/embed/video/x2k01a9?autoPlay=1&start=40')).to eq t
|
58
|
+
end
|
59
|
+
|
60
|
+
it 'supports direct playlist links' do
|
61
|
+
t = 'https://www.dailymotion.com/playlist/x1ybux'
|
62
|
+
expect(gc('https://www.dailymotion.com/playlist/x1ybux/1#video=xlbw3e')).to eq t
|
63
|
+
expect(gc('https://www.dailymotion.com/playlist/x1ybux')).to eq t
|
64
|
+
expect(gc('http://www.dailymotion.com/playlist/x1ybux_ODNandfinally_amazing-world-records/1#video=xlbw3e')).to eq t
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'supports retrieving embed links as attributes' do
|
68
|
+
x = TrueURL.new('http://www.dailymotion.com/video/x2k01a9_battlefield-what-s-it-like-to-be-in-a-real-life-video-game_fun')
|
69
|
+
expect(x.attributes[:embed_url]).to eq 'https://www.dailymotion.com/embed/video/x2k01a9'
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'supports force HTTPS' do
|
73
|
+
t = 'https://www.dailymotion.com/ODNandfinally'
|
74
|
+
expect(gc('http://www.dailymotion.com/ODNandfinally')).to eq t
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
describe 'Vimeo' do
|
79
|
+
it 'supports direct video links' do
|
80
|
+
t = 'https://vimeo.com/122258599'
|
81
|
+
expect(gc('https://vimeo.com/channels/staffpicks/122258599')).to eq t
|
82
|
+
expect(gc('http://vimeo.com/122258599')).to eq t
|
83
|
+
end
|
84
|
+
|
85
|
+
it 'supports embedded video links' do
|
86
|
+
t = 'https://vimeo.com/122258599'
|
87
|
+
expect(gc('https://player.vimeo.com/video/122258599?loop=1&color=c9ff23&title=0')).to eq t
|
88
|
+
end
|
89
|
+
|
90
|
+
it "supports Vimeo's relative canonical links" do
|
91
|
+
t = 'https://vimeo.com/channels/staffpicks'
|
92
|
+
expect(gc('http://vimeo.com/channels/staffpicks?some=silly&params=here')).to eq t
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'supports retrieving embed links as attributes' do
|
96
|
+
x = TrueURL.new('https://vimeo.com/channels/staffpicks/122258599')
|
97
|
+
expect(x.attributes[:embed_url]).to eq 'https://player.vimeo.com/video/122258599'
|
98
|
+
end
|
99
|
+
|
100
|
+
it 'supports force HTTPS' do
|
101
|
+
t = 'https://vimeo.com/user3190002'
|
102
|
+
expect(gc('http://vimeo.com/user3190002')).to eq t
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
describe 'Nico Nico Douga' do
|
107
|
+
it 'should work with direct and embedded video links' do
|
108
|
+
t = 'http://www.nicovideo.jp/watch/sm25956031'
|
109
|
+
expect(gc('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')).to eq t
|
110
|
+
expect(gc('http://embed.nicovideo.jp/watch/sm25956031/script?w=490&h=307&redirect=1')).to eq t
|
111
|
+
expect(gc('http://embed.nicovideo.jp/watch/sm25956031?oldScript=1')).to eq t
|
112
|
+
end
|
113
|
+
|
114
|
+
it 'supports retrieving embed links as attributes' do
|
115
|
+
x = TrueURL.new('http://ext.nicovideo.jp/thumb_watch/sm25956031?w=490&h=307')
|
116
|
+
expect(x.attributes[:embed_url]).to eq 'http://embed.nicovideo.jp/watch/sm25956031'
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
describe 'Twitter' do
|
121
|
+
it 'supports direct links to tweets' do
|
122
|
+
t = 'https://twitter.com/gangsta_project/status/578483098284748801'
|
123
|
+
expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/photo/1')).to eq t
|
124
|
+
expect(gc('https://twitter.com/GANGSTA_Project/status/578483098284748801/')).to eq t
|
125
|
+
end
|
126
|
+
|
127
|
+
it 'should handle hashbangs' do
|
128
|
+
t = 'https://twitter.com/gangsta_project/status/578483098284748801'
|
129
|
+
expect(gc('https://twitter.com/#!/GANGSTA_Project/status/578483098284748801/')).to eq t
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
describe 'URL Shorteners' do
|
134
|
+
it 'should work with t.co' do
|
135
|
+
t = 'http://www.prdaily.com/Main/Articles/3_essential_skills_for_todays_PR_pro__18404.aspx'
|
136
|
+
expect(gc('http://t.co/fvaGuRa5Za')).to eq t
|
137
|
+
expect(gc('https://t.co/fvaGuRa5Za')).to eq t
|
138
|
+
end
|
139
|
+
|
140
|
+
it 'should work with fb.me' do
|
141
|
+
t = 'https://www.facebook.com/aksuperdance/posts/1388968827814771'
|
142
|
+
expect(gc('http://fb.me/8qm5kW89k')).to eq t
|
143
|
+
end
|
144
|
+
|
145
|
+
it 'should work with ift.tt' do
|
146
|
+
t = 'http://tedxtaipei.com/articles/the_best_kindergarten_you_have_ever_seen/'
|
147
|
+
expect(gc('http://ift.tt/2iCbPy8')).to eq t
|
148
|
+
end
|
149
|
+
|
150
|
+
it 'should work with compounded URL shorteners' do
|
151
|
+
t = 'https://www.youtube.com/watch?v=jLhjsPjR-xk'
|
152
|
+
expect(gc('https://t.co/g4NYtZE3lW')).to eq t # http://bit.ly/2iCKic3 --> http://youtu.be/jLhjsPjR-xk?list=PLVL8S3lUHf0RqD7TZ6hohWk8Sd3asaqnY
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
describe 'WordPress' do
|
157
|
+
it 'supports missing trailing slashes' do
|
158
|
+
t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
|
159
|
+
expect(gc('http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled')).to eq t
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
describe 'Blogger' do
|
164
|
+
it 'supports missing localized Blogger domains' do
|
165
|
+
t = 'http://thevikiblog.blogspot.com/2015/12/soompi-ios-android.html'
|
166
|
+
expect(gc('http://thevikiblog.blogspot.sg/2015/12/soompi-ios-android.html')).to eq t
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
describe 'Other Scenarios' do
|
171
|
+
it 'supports missing schemes' do
|
172
|
+
t = 'http://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
|
173
|
+
expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/')).to eq t
|
174
|
+
end
|
175
|
+
|
176
|
+
it 'supports scheme override' do
|
177
|
+
t = 'https://wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/'
|
178
|
+
expect(gc('//wowjapan.asia/2015/04/anime-gargantia-on-the-verdurous-planet-2nd-season-cancelled/', scheme_override: 'https')).to eq t
|
179
|
+
end
|
180
|
+
|
181
|
+
it 'supports CDJapan' do
|
182
|
+
t = 'http://www.cdjapan.co.jp/product/MDR-1012'
|
183
|
+
expect(gc('http://www.cdjapan.co.jp/aff/click.cgi/e86NDzbdSLQ/4323/A323439/detailview.html?KEY=MDR-1012')).to eq t
|
184
|
+
end
|
185
|
+
|
186
|
+
it 'supports MyAnimeList' do
|
187
|
+
t = 'https://myanimelist.net/forum/?topicid=1371295'
|
188
|
+
expect(gc('https://myanimelist.net/forum?topicid=1371295&goto=newpost')).to eq t
|
189
|
+
end
|
190
|
+
|
191
|
+
it 'supports URLs with escapable characters' do
|
192
|
+
t = 'http://goboiano.com/news/2568-attack-on-titan%2527s-first-live-action-trailer-finally-launches'
|
193
|
+
expect(gc("http://media.goboiano.com/news/2568-attack-on-titan's-first-live-action-trailer-finally-launches")).to eq t
|
194
|
+
|
195
|
+
t = 'http://randomc.net/image/Kekkai%20Sensen/Kekkai%20Sensen%20-%2001%20-%20Large%2001.jpg'
|
196
|
+
expect(gc('http://randomc.net/image/Kekkai Sensen/Kekkai Sensen - 01 - Large 01.jpg')).to eq t
|
197
|
+
end
|
198
|
+
|
199
|
+
it 'supports canonical HTTP headers' do
|
200
|
+
t = 'http://www.seoreviewtools.com/canonical-url-location-checker/'
|
201
|
+
expect(gc('http://www.seoreviewtools.com/tests/canonical-header.php')).to eq t
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
data/true_url.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require File.expand_path('../lib/true_url/version', __FILE__)
|
2
|
+
|
3
|
+
Gem::Specification.new do |spec|
  spec.name = 'true_url'
  spec.version = TrueURL::VERSION
  spec.authors = ['Jonathan Wong']
  spec.email = ['jonathan@armchairtheorist.com']
  spec.summary = 'A multi-strategy approach to find the absolutely cleanest and most likely canonical URL of any given URL.'
  spec.homepage = 'http://github.com/armchairtheorist/true_url'
  spec.license = 'MIT'

  spec.files = `git ls-files`.split("\n")
  # A single-item brace group ("{spec}") is not expanded by the shell, so the
  # previous pathspec matched nothing and test_files was always empty.
  spec.test_files = `git ls-files -- spec/*`.split("\n")
  spec.require_paths = ['lib']

  # "~> 0" means ">= 0, < 1", which rules out the RSpec 3 that the
  # spec_helper configuration is written for.
  spec.add_development_dependency 'rspec', '~> 3.0'
  # NOTE(review): "~> 0" likewise pins rake below 1.0 - confirm this is intended.
  spec.add_development_dependency 'rake', '~> 0'
  # NOTE(review): http and nokogiri are required lazily by lib/true_url.rb when
  # fetching is enabled (the default) - confirm they should stay development-only.
  spec.add_development_dependency 'http', '~> 2.1', '>= 2.1.0'
  spec.add_development_dependency 'nokogiri', '~> 1.6', '>= 1.6.8'

  spec.add_runtime_dependency 'addressable', '~> 2.4', '>= 2.4.0'
end
|
metadata
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: true_url
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jonathan Wong
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-12-29 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: http
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '2.1'
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: 2.1.0
|
51
|
+
type: :development
|
52
|
+
prerelease: false
|
53
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - "~>"
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '2.1'
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 2.1.0
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: nokogiri
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.6'
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: 1.6.8
|
71
|
+
type: :development
|
72
|
+
prerelease: false
|
73
|
+
version_requirements: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - "~>"
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '1.6'
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 1.6.8
|
81
|
+
- !ruby/object:Gem::Dependency
|
82
|
+
name: addressable
|
83
|
+
requirement: !ruby/object:Gem::Requirement
|
84
|
+
requirements:
|
85
|
+
- - "~>"
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: '2.4'
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: 2.4.0
|
91
|
+
type: :runtime
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - "~>"
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '2.4'
|
98
|
+
- - ">="
|
99
|
+
- !ruby/object:Gem::Version
|
100
|
+
version: 2.4.0
|
101
|
+
description:
|
102
|
+
email:
|
103
|
+
- jonathan@armchairtheorist.com
|
104
|
+
executables: []
|
105
|
+
extensions: []
|
106
|
+
extra_rdoc_files: []
|
107
|
+
files:
|
108
|
+
- ".gitignore"
|
109
|
+
- ".rspec"
|
110
|
+
- Gemfile
|
111
|
+
- LICENSE.txt
|
112
|
+
- README.md
|
113
|
+
- Rakefile
|
114
|
+
- lib/true_url.rb
|
115
|
+
- lib/true_url/context.rb
|
116
|
+
- lib/true_url/strategy.rb
|
117
|
+
- lib/true_url/strategy/dailymotion.rb
|
118
|
+
- lib/true_url/strategy/nicovideo.rb
|
119
|
+
- lib/true_url/strategy/twitter.rb
|
120
|
+
- lib/true_url/strategy/vimeo.rb
|
121
|
+
- lib/true_url/strategy/youtube.rb
|
122
|
+
- lib/true_url/version.rb
|
123
|
+
- spec/spec_helper.rb
|
124
|
+
- spec/true_url_spec.rb
|
125
|
+
- true_url.gemspec
|
126
|
+
homepage: http://github.com/armchairtheorist/true_url
|
127
|
+
licenses:
|
128
|
+
- MIT
|
129
|
+
metadata: {}
|
130
|
+
post_install_message:
|
131
|
+
rdoc_options: []
|
132
|
+
require_paths:
|
133
|
+
- lib
|
134
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
requirements: []
|
145
|
+
rubyforge_project:
|
146
|
+
rubygems_version: 2.5.1
|
147
|
+
signing_key:
|
148
|
+
specification_version: 4
|
149
|
+
summary: A multi-strategy approach to find the absolutely cleanest and most likely
|
150
|
+
canonical URL of any given URL.
|
151
|
+
test_files: []
|